Bug Summary

File: llvm/include/llvm/ADT/SmallBitVector.h
Warning: line 120, column 3
Potential memory leak
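
For context on the warning above: llvm::SmallBitVector keeps small bit counts inline in a single pointer-sized word and switches to a heap-allocated llvm::BitVector once the count no longer fits, and "Potential memory leak" diagnostics against this header typically track that heap allocation along a path through the analyzed translation unit (here X86ISelLowering.cpp). The sketch below is a simplified illustration of that small/large storage pattern under those assumptions — the class name, member names, and the exact shape of the flagged line are invented for illustration and are not the actual contents of SmallBitVector.h line 120.

#include "llvm/ADT/BitVector.h"
#include <cstdint>

// Simplified sketch of the small/large storage idiom used by
// llvm::SmallBitVector. Class and member names here are assumed for
// illustration; this is not the actual header text.
class SmallBitVectorSketch {
  // Small mode: the bits live in the upper bits of X and the low bit
  // is set as a tag. Large mode: X holds a pointer to a heap-allocated
  // llvm::BitVector (pointers are aligned, so the low bit is clear).
  uintptr_t X = 1;

  bool isSmall() const { return X & 1; }

  void switchToLarge(llvm::BitVector *BV) {
    // The analyzer tracks the allocation passed in here; if it decides
    // some path loses the object before the destructor below runs, it
    // reports a potential memory leak at or near the allocation site.
    X = reinterpret_cast<uintptr_t>(BV);
  }

public:
  explicit SmallBitVectorSketch(unsigned NumBits, bool Init = false) {
    if (NumBits > sizeof(uintptr_t) * 8 - 1)
      switchToLarge(new llvm::BitVector(NumBits, Init));
    // else: keep the bits inline in X (value handling elided in this sketch).
  }

  // Copy/move support is omitted from this sketch.
  SmallBitVectorSketch(const SmallBitVectorSketch &) = delete;
  SmallBitVectorSketch &operator=(const SmallBitVectorSketch &) = delete;

  ~SmallBitVectorSketch() {
    if (!isSmall())
      delete reinterpret_cast<llvm::BitVector *>(X);
  }
};

In a report like this one, the allocation is tracked along a path through the analyzed file, which is why a header location is reported for a warning found while analyzing X86ISelLowering.cpp.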

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/X86 -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/X86/X86ISelLowering.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
108 const X86Subtarget &STI)
109 : TargetLowering(TM), Subtarget(STI) {
110 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
111 X86ScalarSSEf64 = Subtarget.hasSSE2();
112 X86ScalarSSEf32 = Subtarget.hasSSE1();
113 X86ScalarSSEf16 = Subtarget.hasFP16();
114 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
115
116 // Set up the TargetLowering object.
117
118 // X86 is weird. It always uses i8 for shift amounts and setcc results.
119 setBooleanContents(ZeroOrOneBooleanContent);
120 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
121 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
122
123 // For 64-bit, since we have so many registers, use the ILP scheduler.
124 // For 32-bit, use the register pressure specific scheduling.
125 // For Atom, always use ILP scheduling.
126 if (Subtarget.isAtom())
127 setSchedulingPreference(Sched::ILP);
128 else if (Subtarget.is64Bit())
129 setSchedulingPreference(Sched::ILP);
130 else
131 setSchedulingPreference(Sched::RegPressure);
132 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
133 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
134
135 // Bypass expensive divides and use cheaper ones.
136 if (TM.getOptLevel() >= CodeGenOpt::Default) {
137 if (Subtarget.hasSlowDivide32())
138 addBypassSlowDiv(32, 8);
139 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
140 addBypassSlowDiv(64, 32);
141 }
142
143 // Setup Windows compiler runtime calls.
144 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
145 static const struct {
146 const RTLIB::Libcall Op;
147 const char * const Name;
148 const CallingConv::ID CC;
149 } LibraryCalls[] = {
150 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
151 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
152 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
153 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
154 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
155 };
156
157 for (const auto &LC : LibraryCalls) {
158 setLibcallName(LC.Op, LC.Name);
159 setLibcallCallingConv(LC.Op, LC.CC);
160 }
161 }
162
163 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
164 // MSVCRT doesn't have powi; fall back to pow
165 setLibcallName(RTLIB::POWI_F32, nullptr);
166 setLibcallName(RTLIB::POWI_F64, nullptr);
167 }
168
169 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
170 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
171 // FIXME: Should we be limiting the atomic size on other configs? Default is
172 // 1024.
173 if (!Subtarget.hasCmpxchg8b())
174 setMaxAtomicSizeInBitsSupported(32);
175
176 // Set up the register classes.
177 addRegisterClass(MVT::i8, &X86::GR8RegClass);
178 addRegisterClass(MVT::i16, &X86::GR16RegClass);
179 addRegisterClass(MVT::i32, &X86::GR32RegClass);
180 if (Subtarget.is64Bit())
181 addRegisterClass(MVT::i64, &X86::GR64RegClass);
182
183 for (MVT VT : MVT::integer_valuetypes())
184 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
185
186 // We don't accept any truncstore of integer registers.
187 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
188 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
189 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
190 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
191 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
192 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
193
194 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
195
196 // SETOEQ and SETUNE require checking two conditions.
197 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
198 setCondCodeAction(ISD::SETOEQ, VT, Expand);
199 setCondCodeAction(ISD::SETUNE, VT, Expand);
200 }
201
202 // Integer absolute.
203 if (Subtarget.hasCMov()) {
204 setOperationAction(ISD::ABS , MVT::i16 , Custom);
205 setOperationAction(ISD::ABS , MVT::i32 , Custom);
206 if (Subtarget.is64Bit())
207 setOperationAction(ISD::ABS , MVT::i64 , Custom);
208 }
209
210 // Signed saturation subtraction.
211 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
212 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
213 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
214 if (Subtarget.is64Bit())
215 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
216
217 // Funnel shifts.
218 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
219 // For slow shld targets we only lower for code size.
220 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
221
222 setOperationAction(ShiftOp , MVT::i8 , Custom);
223 setOperationAction(ShiftOp , MVT::i16 , Custom);
224 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
225 if (Subtarget.is64Bit())
226 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
227 }
228
229 if (!Subtarget.useSoftFloat()) {
230 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
231 // operation.
232 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
233 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
234 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
235 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
236 // We have an algorithm for SSE2, and we turn this into a 64-bit
237 // FILD or VCVTUSI2SS/SD for other targets.
238 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
239 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
240 // We have an algorithm for SSE2->double, and we turn this into a
241 // 64-bit FILD followed by conditional FADD for other targets.
242 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
243 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
244
245 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
246 // this operation.
247 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
248 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
249 // SSE has no i16 to fp conversion, only i32. We promote in the handler
250 // to allow f80 to use i16 and f64 to use i16 with sse1 only
251 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
252 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
253 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
254 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
255 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
256 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
257 // are Legal, f80 is custom lowered.
258 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
259 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
260
261 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
262 // this operation.
263 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
264 // FIXME: This doesn't generate invalid exception when it should. PR44019.
265 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
266 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
267 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
268 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
269 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
270 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
271 // are Legal, f80 is custom lowered.
272 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
273 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
274
275 // Handle FP_TO_UINT by promoting the destination to a larger signed
276 // conversion.
277 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
278 // FIXME: This doesn't generate invalid exception when it should. PR44019.
279 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
280 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
281 // FIXME: This doesn't generate invalid exception when it should. PR44019.
282 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
283 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
284 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
285 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
286 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
287
288 setOperationAction(ISD::LRINT, MVT::f32, Custom);
289 setOperationAction(ISD::LRINT, MVT::f64, Custom);
290 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
291 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
292
293 if (!Subtarget.is64Bit()) {
294 setOperationAction(ISD::LRINT, MVT::i64, Custom);
295 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
296 }
297 }
298
299 if (Subtarget.hasSSE2()) {
300 // Custom lowering for saturating float to int conversions.
301 // We handle promotion to larger result types manually.
302 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
303 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
304 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
305 }
306 if (Subtarget.is64Bit()) {
307 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
308 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
309 }
310 }
311
312 // Handle address space casts between mixed sized pointers.
313 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
314 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
315
316 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
317 if (!X86ScalarSSEf64) {
318 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
319 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
320 if (Subtarget.is64Bit()) {
321 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
322 // Without SSE, i64->f64 goes through memory.
323 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
324 }
325 } else if (!Subtarget.is64Bit())
326 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
327
328 // Scalar integer divide and remainder are lowered to use operations that
329 // produce two results, to match the available instructions. This exposes
330 // the two-result form to trivial CSE, which is able to combine x/y and x%y
331 // into a single instruction.
332 //
333 // Scalar integer multiply-high is also lowered to use two-result
334 // operations, to match the available instructions. However, plain multiply
335 // (low) operations are left as Legal, as there are single-result
336 // instructions for this in x86. Using the two-result multiply instructions
337 // when both high and low results are needed must be arranged by dagcombine.
338 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
339 setOperationAction(ISD::MULHS, VT, Expand);
340 setOperationAction(ISD::MULHU, VT, Expand);
341 setOperationAction(ISD::SDIV, VT, Expand);
342 setOperationAction(ISD::UDIV, VT, Expand);
343 setOperationAction(ISD::SREM, VT, Expand);
344 setOperationAction(ISD::UREM, VT, Expand);
345 }
346
347 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
348 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
349 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
350 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
351 setOperationAction(ISD::BR_CC, VT, Expand);
352 setOperationAction(ISD::SELECT_CC, VT, Expand);
353 }
354 if (Subtarget.is64Bit())
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
356 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
357 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
358 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
359
360 setOperationAction(ISD::FREM , MVT::f32 , Expand);
361 setOperationAction(ISD::FREM , MVT::f64 , Expand);
362 setOperationAction(ISD::FREM , MVT::f80 , Expand);
363 setOperationAction(ISD::FREM , MVT::f128 , Expand);
364
365 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
366 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
367 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
368 }
369
370 // Promote the i8 variants and force them on up to i32 which has a shorter
371 // encoding.
372 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
373 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
374
375 if (Subtarget.hasBMI()) {
376 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
377 // is enabled.
378 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
379 } else {
380 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
381 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
382 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
383 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
384 if (Subtarget.is64Bit()) {
385 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
386 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
387 }
388 }
389
390 if (Subtarget.hasLZCNT()) {
391 // When promoting the i8 variants, force them to i32 for a shorter
392 // encoding.
393 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
394 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
395 } else {
396 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
397 if (VT == MVT::i64 && !Subtarget.is64Bit())
398 continue;
399 setOperationAction(ISD::CTLZ , VT, Custom);
400 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
401 }
402 }
403
404 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
405 ISD::STRICT_FP_TO_FP16}) {
406 // Special handling for half-precision floating point conversions.
407 // If we don't have F16C support, then lower half float conversions
408 // into library calls.
409 setOperationAction(
410 Op, MVT::f32,
411 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
412 // There's never any support for operations beyond MVT::f32.
413 setOperationAction(Op, MVT::f64, Expand);
414 setOperationAction(Op, MVT::f80, Expand);
415 setOperationAction(Op, MVT::f128, Expand);
416 }
417
418 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
419 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
420 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
421 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
423 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
424 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
425 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
426
427 setOperationAction(ISD::PARITY, MVT::i8, Custom);
428 if (Subtarget.hasPOPCNT()) {
429 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
430 } else {
431 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
432 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
433 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
434 if (Subtarget.is64Bit())
435 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
436 else
437 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
438
439 setOperationAction(ISD::PARITY, MVT::i16, Custom);
440 setOperationAction(ISD::PARITY, MVT::i32, Custom);
441 if (Subtarget.is64Bit())
442 setOperationAction(ISD::PARITY, MVT::i64, Custom);
443 }
444
445 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
446
447 if (!Subtarget.hasMOVBE())
448 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
449
450 // X86 wants to expand cmov itself.
451 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
452 setOperationAction(ISD::SELECT, VT, Custom);
453 setOperationAction(ISD::SETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
455 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
456 }
457 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
458 if (VT == MVT::i64 && !Subtarget.is64Bit())
459 continue;
460 setOperationAction(ISD::SELECT, VT, Custom);
461 setOperationAction(ISD::SETCC, VT, Custom);
462 }
463
464 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
465 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
466 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
467
468 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
469 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
470 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
471 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
472 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
473 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
474 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
475 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
476
477 // Darwin ABI issue.
478 for (auto VT : { MVT::i32, MVT::i64 }) {
479 if (VT == MVT::i64 && !Subtarget.is64Bit())
480 continue;
481 setOperationAction(ISD::ConstantPool , VT, Custom);
482 setOperationAction(ISD::JumpTable , VT, Custom);
483 setOperationAction(ISD::GlobalAddress , VT, Custom);
484 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
485 setOperationAction(ISD::ExternalSymbol , VT, Custom);
486 setOperationAction(ISD::BlockAddress , VT, Custom);
487 }
488
489 // 64-bit shl, sra, srl (iff 32-bit x86)
490 for (auto VT : { MVT::i32, MVT::i64 }) {
491 if (VT == MVT::i64 && !Subtarget.is64Bit())
492 continue;
493 setOperationAction(ISD::SHL_PARTS, VT, Custom);
494 setOperationAction(ISD::SRA_PARTS, VT, Custom);
495 setOperationAction(ISD::SRL_PARTS, VT, Custom);
496 }
497
498 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
499 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
500
501 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
502
503 // Expand certain atomics
504 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
505 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
510 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
511 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
512 }
513
514 if (!Subtarget.is64Bit())
515 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
516
517 if (Subtarget.hasCmpxchg16b()) {
518 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
519 }
520
521 // FIXME - use subtarget debug flags
522 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
523 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
524 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
525 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
526 }
527
528 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
529 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
530
531 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
532 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
533
534 setOperationAction(ISD::TRAP, MVT::Other, Legal);
535 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
536 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
537
538 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
539 setOperationAction(ISD::VASTART , MVT::Other, Custom);
540 setOperationAction(ISD::VAEND , MVT::Other, Expand);
541 bool Is64Bit = Subtarget.is64Bit();
542 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
543 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
544
545 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
546 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
547
548 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
549
550 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
551 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
552 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
553
554 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
555 // f32 and f64 use SSE.
556 // Set up the FP register classes.
557 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
558 : &X86::FR32RegClass);
559 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
560 : &X86::FR64RegClass);
561
562 // Disable f32->f64 extload as we can only generate this in one instruction
563 // under optsize. So it's easier to pattern match (fpext (load)) for that
564 // case instead of needing to emit 2 instructions for extload in the
565 // non-optsize case.
566 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
567
568 for (auto VT : { MVT::f32, MVT::f64 }) {
569 // Use ANDPD to simulate FABS.
570 setOperationAction(ISD::FABS, VT, Custom);
571
572 // Use XORP to simulate FNEG.
573 setOperationAction(ISD::FNEG, VT, Custom);
574
575 // Use ANDPD and ORPD to simulate FCOPYSIGN.
576 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
577
578 // These might be better off as horizontal vector ops.
579 setOperationAction(ISD::FADD, VT, Custom);
580 setOperationAction(ISD::FSUB, VT, Custom);
581
582 // We don't support sin/cos/fmod
583 setOperationAction(ISD::FSIN , VT, Expand);
584 setOperationAction(ISD::FCOS , VT, Expand);
585 setOperationAction(ISD::FSINCOS, VT, Expand);
586 }
587
588 // Lower this to MOVMSK plus an AND.
589 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
590 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
591
592 } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
593 (UseX87 || Is64Bit)) {
594 // Use SSE for f32, x87 for f64.
595 // Set up the FP register classes.
596 addRegisterClass(MVT::f32, &X86::FR32RegClass);
597 if (UseX87)
598 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
599
600 // Use ANDPS to simulate FABS.
601 setOperationAction(ISD::FABS , MVT::f32, Custom);
602
603 // Use XORP to simulate FNEG.
604 setOperationAction(ISD::FNEG , MVT::f32, Custom);
605
606 if (UseX87)
607 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
608
609 // Use ANDPS and ORPS to simulate FCOPYSIGN.
610 if (UseX87)
611 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
612 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
613
614 // We don't support sin/cos/fmod
615 setOperationAction(ISD::FSIN , MVT::f32, Expand);
616 setOperationAction(ISD::FCOS , MVT::f32, Expand);
617 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
618
619 if (UseX87) {
620 // Always expand sin/cos functions even though x87 has an instruction.
621 setOperationAction(ISD::FSIN, MVT::f64, Expand);
622 setOperationAction(ISD::FCOS, MVT::f64, Expand);
623 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
624 }
625 } else if (UseX87) {
626 // f32 and f64 in x87.
627 // Set up the FP register classes.
628 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
629 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
630
631 for (auto VT : { MVT::f32, MVT::f64 }) {
632 setOperationAction(ISD::UNDEF, VT, Expand);
633 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
634
635 // Always expand sin/cos functions even though x87 has an instruction.
636 setOperationAction(ISD::FSIN , VT, Expand);
637 setOperationAction(ISD::FCOS , VT, Expand);
638 setOperationAction(ISD::FSINCOS, VT, Expand);
639 }
640 }
641
642 // Expand FP32 immediates into loads from the stack, save special cases.
643 if (isTypeLegal(MVT::f32)) {
644 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
645 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
646 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
647 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
648 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
649 } else // SSE immediates.
650 addLegalFPImmediate(APFloat(+0.0f)); // xorps
651 }
652 // Expand FP64 immediates into loads from the stack, save special cases.
653 if (isTypeLegal(MVT::f64)) {
654 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
655 addLegalFPImmediate(APFloat(+0.0)); // FLD0
656 addLegalFPImmediate(APFloat(+1.0)); // FLD1
657 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
658 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
659 } else // SSE immediates.
660 addLegalFPImmediate(APFloat(+0.0)); // xorpd
661 }
662 // Handle constrained floating-point operations of scalar.
663 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
664 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
665 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
666 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
667 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
668 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
669 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
670 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
671 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
672 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
673 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
674 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
675 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
676
677 // We don't support FMA.
678 setOperationAction(ISD::FMA, MVT::f64, Expand);
679 setOperationAction(ISD::FMA, MVT::f32, Expand);
680
681 // f80 always uses X87.
682 if (UseX87) {
683 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
684 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
685 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
686 {
687 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
688 addLegalFPImmediate(TmpFlt); // FLD0
689 TmpFlt.changeSign();
690 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
691
692 bool ignored;
693 APFloat TmpFlt2(+1.0);
694 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
695 &ignored);
696 addLegalFPImmediate(TmpFlt2); // FLD1
697 TmpFlt2.changeSign();
698 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
699 }
700
701 // Always expand sin/cos functions even though x87 has an instruction.
702 setOperationAction(ISD::FSIN , MVT::f80, Expand);
703 setOperationAction(ISD::FCOS , MVT::f80, Expand);
704 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
705
706 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
707 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
708 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
709 setOperationAction(ISD::FRINT, MVT::f80, Expand);
710 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
711 setOperationAction(ISD::FMA, MVT::f80, Expand);
712 setOperationAction(ISD::LROUND, MVT::f80, Expand);
713 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
714 setOperationAction(ISD::LRINT, MVT::f80, Custom);
715 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
716
717 // Handle constrained floating-point operations of scalar.
718 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
719 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
723 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
724 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
725 // as Custom.
726 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
727 }
728
729 // f128 uses xmm registers, but most operations require libcalls.
730 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
731 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
732 : &X86::VR128RegClass);
733
734 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
735
736 setOperationAction(ISD::FADD, MVT::f128, LibCall);
737 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
738 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
739 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
740 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
741 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
742 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
743 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
744 setOperationAction(ISD::FMA, MVT::f128, LibCall);
745 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
746
747 setOperationAction(ISD::FABS, MVT::f128, Custom);
748 setOperationAction(ISD::FNEG, MVT::f128, Custom);
749 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
750
751 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
752 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
753 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
754 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
755 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
756 // No STRICT_FSINCOS
757 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
758 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
759
760 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
761 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
762 // We need to custom handle any FP_ROUND with an f128 input, but
763 // LegalizeDAG uses the result type to know when to run a custom handler.
764 // So we have to list all legal floating point result types here.
765 if (isTypeLegal(MVT::f32)) {
766 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
767 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
768 }
769 if (isTypeLegal(MVT::f64)) {
770 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
771 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
772 }
773 if (isTypeLegal(MVT::f80)) {
774 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
775 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
776 }
777
778 setOperationAction(ISD::SETCC, MVT::f128, Custom);
779
780 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
782 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
783 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
785 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
786 }
787
788 // Always use a library call for pow.
789 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
790 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
792 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
793
794 setOperationAction(ISD::FLOG, MVT::f80, Expand);
795 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
796 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
797 setOperationAction(ISD::FEXP, MVT::f80, Expand);
798 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
799 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
800 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
801
802 // Some FP actions are always expanded for vector types.
803 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
804 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
805 setOperationAction(ISD::FSIN, VT, Expand);
806 setOperationAction(ISD::FSINCOS, VT, Expand);
807 setOperationAction(ISD::FCOS, VT, Expand);
808 setOperationAction(ISD::FREM, VT, Expand);
809 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
810 setOperationAction(ISD::FPOW, VT, Expand);
811 setOperationAction(ISD::FLOG, VT, Expand);
812 setOperationAction(ISD::FLOG2, VT, Expand);
813 setOperationAction(ISD::FLOG10, VT, Expand);
814 setOperationAction(ISD::FEXP, VT, Expand);
815 setOperationAction(ISD::FEXP2, VT, Expand);
816 }
817
818 // First set operation action for all vector types to either promote
819 // (for widening) or expand (for scalarization). Then we will selectively
820 // turn on ones that can be effectively codegen'd.
821 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
822 setOperationAction(ISD::SDIV, VT, Expand);
823 setOperationAction(ISD::UDIV, VT, Expand);
824 setOperationAction(ISD::SREM, VT, Expand);
825 setOperationAction(ISD::UREM, VT, Expand);
826 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
827 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
828 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
829 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
830 setOperationAction(ISD::FMA, VT, Expand);
831 setOperationAction(ISD::FFLOOR, VT, Expand);
832 setOperationAction(ISD::FCEIL, VT, Expand);
833 setOperationAction(ISD::FTRUNC, VT, Expand);
834 setOperationAction(ISD::FRINT, VT, Expand);
835 setOperationAction(ISD::FNEARBYINT, VT, Expand);
836 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
837 setOperationAction(ISD::MULHS, VT, Expand);
838 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
839 setOperationAction(ISD::MULHU, VT, Expand);
840 setOperationAction(ISD::SDIVREM, VT, Expand);
841 setOperationAction(ISD::UDIVREM, VT, Expand);
842 setOperationAction(ISD::CTPOP, VT, Expand);
843 setOperationAction(ISD::CTTZ, VT, Expand);
844 setOperationAction(ISD::CTLZ, VT, Expand);
845 setOperationAction(ISD::ROTL, VT, Expand);
846 setOperationAction(ISD::ROTR, VT, Expand);
847 setOperationAction(ISD::BSWAP, VT, Expand);
848 setOperationAction(ISD::SETCC, VT, Expand);
849 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
850 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
851 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
852 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
853 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
854 setOperationAction(ISD::TRUNCATE, VT, Expand);
855 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
856 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
857 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
858 setOperationAction(ISD::SELECT_CC, VT, Expand);
859 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
860 setTruncStoreAction(InnerVT, VT, Expand);
861
862 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
863 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
864
865 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
866 // types, we have to deal with them whether we ask for Expansion or not.
867 // Setting Expand causes its own optimisation problems though, so leave
868 // them legal.
869 if (VT.getVectorElementType() == MVT::i1)
870 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
871
872 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
873 // split/scalarized right now.
874 if (VT.getVectorElementType() == MVT::f16)
875 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
876 }
877 }
878
879 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
880 // with -msoft-float, disable use of MMX as well.
881 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
882 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
883 // No operations on x86mmx supported, everything uses intrinsics.
884 }
885
886 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
887 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
888 : &X86::VR128RegClass);
889
890 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
891 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
892 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
893 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
894 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
895 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
896 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
897 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
898
899 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
900 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
901
902 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
903 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
904 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
906 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
907 }
908
909 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
910 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
911 : &X86::VR128RegClass);
912
913 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
914 // registers cannot be used even for integer operations.
915 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
916 : &X86::VR128RegClass);
917 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
918 : &X86::VR128RegClass);
919 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
920 : &X86::VR128RegClass);
921 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
922 : &X86::VR128RegClass);
923
924 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
925 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
926 setOperationAction(ISD::SDIV, VT, Custom);
927 setOperationAction(ISD::SREM, VT, Custom);
928 setOperationAction(ISD::UDIV, VT, Custom);
929 setOperationAction(ISD::UREM, VT, Custom);
930 }
931
932 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
933 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
934 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
935
936 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
937 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
938 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
939 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
940 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
941 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
942 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
943 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
944 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
945 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
946
947 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
948 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
949
950 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
951 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
952 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
953
954 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
955 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
956 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
957 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
958 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
959 }
960
961 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
962 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
963 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
964 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
965 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
966 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
967 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
968 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
969 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
970 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
971
972 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
973 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
974 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
975
976 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
977 setOperationAction(ISD::SETCC, VT, Custom);
978 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
979 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
980 setOperationAction(ISD::CTPOP, VT, Custom);
981 setOperationAction(ISD::ABS, VT, Custom);
982
983 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
984 // setcc all the way to isel and prefer SETGT in some isel patterns.
985 setCondCodeAction(ISD::SETLT, VT, Custom);
986 setCondCodeAction(ISD::SETLE, VT, Custom);
987 }
988
989 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
990 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
991 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
992 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
993 setOperationAction(ISD::VSELECT, VT, Custom);
994 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
995 }
996
997 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
998 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
999 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1000 setOperationAction(ISD::VSELECT, VT, Custom);
1001
1002 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1003 continue;
1004
1005 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1006 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1007 }
1008
1009 // Custom lower v2i64 and v2f64 selects.
1010 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1011 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1013 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1014 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1015
1016 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1017 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1018 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1019 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1020 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1021 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1022
1023 // Custom legalize these to avoid over promotion or custom promotion.
1024 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1025 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1026 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1027 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1028 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1029 }
1030
1031 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1032 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1033 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1034 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1035
1036 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1037 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1038
1039 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1040 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1041
1042 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1043 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1044 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1045 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1046 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1047
1048 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1049 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1050 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1051 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1052
1053 // We want to legalize this to an f64 load rather than an i64 load on
1054 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1055 // store.
1056 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1057 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1058 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1059 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1060 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1061 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1062
1063 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1064 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1065 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1066 if (!Subtarget.hasAVX512())
1067 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1068
1069 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1070 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1071 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1072
1073 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1074
1075 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1076 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1077 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1078 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1079 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1080 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1081
1082 // In the customized shift lowering, the legal v4i32/v2i64 cases
1083 // in AVX2 will be recognized.
1084 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1085 setOperationAction(ISD::SRL, VT, Custom);
1086 setOperationAction(ISD::SHL, VT, Custom);
1087 setOperationAction(ISD::SRA, VT, Custom);
1088 }
1089
1090 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1091 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1092
1093 // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1094 // shifts) is better.
1095 if (!Subtarget.useAVX512Regs() &&
1096 !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1097 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1098
1099 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1100 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1103 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1104 }
1105
1106 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1107 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1108 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1109 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1110 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1111 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1114 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1115
1116 // These might be better off as horizontal vector ops.
1117 setOperationAction(ISD::ADD, MVT::i16, Custom);
1118 setOperationAction(ISD::ADD, MVT::i32, Custom);
1119 setOperationAction(ISD::SUB, MVT::i16, Custom);
1120 setOperationAction(ISD::SUB, MVT::i32, Custom);
1121 }
1122
1123 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1124 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1125 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1126 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1127 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1128 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1129 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1130 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1131 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1132 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1133 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1134 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1135 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1136 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1137
1138 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1139 }
1140
1141 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1142 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1143 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1144 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1145 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1146 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1147 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1148 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1149
1150 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1151 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1152 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1153
1154 // FIXME: Do we need to handle scalar-to-vector here?
1155 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1156
1157 // We directly match byte blends in the backend as they match the VSELECT
1158 // condition form.
1159 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1160
1161 // SSE41 brings specific instructions for doing vector sign extend even in
1162 // cases where we don't have SRA.
1163 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1164 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1165 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1166 }
1167
1168 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1169 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1170 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1171 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1173 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1174 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1175 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1176 }
1177
1178 // i8 vectors are custom because the source register and source
1179 // memory operand types are not the same width.
1180 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1181
1182 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1183 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1184 // do the pre and post work in the vector domain.
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1186 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1187 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1188 // so that DAG combine doesn't try to turn it into uint_to_fp.
1189 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1190 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1191 }
1192 }
1193
1194 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1195 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1196 }
1197
1198 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1199 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1200 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1201 setOperationAction(ISD::ROTL, VT, Custom);
1202
1203 // XOP can efficiently perform BITREVERSE with VPPERM.
1204 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1205 setOperationAction(ISD::BITREVERSE, VT, Custom);
1206
1207 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1208 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1209 setOperationAction(ISD::BITREVERSE, VT, Custom);
1210 }
1211
1212 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1213 bool HasInt256 = Subtarget.hasInt256();
1214
1215 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1216 : &X86::VR256RegClass);
1217 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1218 : &X86::VR256RegClass);
1219 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1220 : &X86::VR256RegClass);
1221 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1222 : &X86::VR256RegClass);
1223 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1224 : &X86::VR256RegClass);
1225 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1226 : &X86::VR256RegClass);
1227
1228 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1229 setOperationAction(ISD::FFLOOR, VT, Legal);
1230 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1231 setOperationAction(ISD::FCEIL, VT, Legal);
1232 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1233 setOperationAction(ISD::FTRUNC, VT, Legal);
1234 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1235 setOperationAction(ISD::FRINT, VT, Legal);
1236 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1237 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1238 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1239 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1240 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1241
1242 setOperationAction(ISD::FROUND, VT, Custom);
1243
1244 setOperationAction(ISD::FNEG, VT, Custom);
1245 setOperationAction(ISD::FABS, VT, Custom);
1246 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1247 }
1248
1249 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1250 // even though v8i16 is a legal type.
1251 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1252 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1253 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1254 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1255 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1256 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1257 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1258
1259 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1260 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1261
1262 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1263 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1264 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1265 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1266 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1267 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1268 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1269 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1270 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1271 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1272 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1273 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1274
1275 if (!Subtarget.hasAVX512())
1276 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1277
1278 // In the customized shift lowering, the legal v8i32/v4i64 cases
1279 // in AVX2 will be recognized.
1280 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1281 setOperationAction(ISD::SRL, VT, Custom);
1282 setOperationAction(ISD::SHL, VT, Custom);
1283 setOperationAction(ISD::SRA, VT, Custom);
1284 }
1285
1286 // These types need custom splitting if their input is a 128-bit vector.
1287 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1288 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1289 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1290 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1291
1292 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1293 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1294
1295 // With BWI, expanding (and promoting the shifts) is the better option.
1296 if (!Subtarget.useBWIRegs())
1297 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1298
1299 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1300 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1302 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1303 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1304 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1305
1306 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1307 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1308 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1309 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1310 }
1311
1312 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1313 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1314 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1315 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1316
1317 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1318 setOperationAction(ISD::SETCC, VT, Custom);
1319 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1320 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1321 setOperationAction(ISD::CTPOP, VT, Custom);
1322 setOperationAction(ISD::CTLZ, VT, Custom);
1323
1324 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1325 // setcc all the way to isel and prefer SETGT in some isel patterns.
1326 setCondCodeAction(ISD::SETLT, VT, Custom);
1327 setCondCodeAction(ISD::SETLE, VT, Custom);
1328 }
1329
1330 if (Subtarget.hasAnyFMA()) {
1331 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1332 MVT::v2f64, MVT::v4f64 }) {
1333 setOperationAction(ISD::FMA, VT, Legal);
1334 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1335 }
1336 }
1337
1338 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1339 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1340 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1341 }
1342
1343 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1344 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1345 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1346 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1347
1348 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1349 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1350 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1351 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1352 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1353 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1354
1355 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1356 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1357
1358 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1359 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1360 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1361 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1362 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1363
1364 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1365 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1370 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1371 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1372 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1373 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1374 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1375 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1376
1377 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1378 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1379 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1380 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1381 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1382 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1383 }
1384
1385 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1386 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1387 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1388 }
1389
1390 if (HasInt256) {
1391 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1392 // when we have a 256-bit-wide blend with an immediate.
1393 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1394 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1395
1396 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1397 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1398 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1399 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1404 }
1405 }
1406
1407 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1408 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1409 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1410 setOperationAction(ISD::MSTORE, VT, Legal);
1411 }
1412
1413 // Extract subvector is special because the value type
1414 // (result) is 128-bit but the source is 256-bit wide.
1415 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1416 MVT::v4f32, MVT::v2f64 }) {
1417 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1418 }
1419
1420 // Custom lower several nodes for 256-bit types.
1421 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1422 MVT::v8f32, MVT::v4f64 }) {
1423 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1424 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1425 setOperationAction(ISD::VSELECT, VT, Custom);
1426 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1427 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1428 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1429 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1430 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1431 setOperationAction(ISD::STORE, VT, Custom);
1432 }
1433
1434 if (HasInt256) {
1435 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1436
1437 // Custom legalize 2x32 to get a little better code.
1438 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1439 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1440
1441 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1442 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1443 setOperationAction(ISD::MGATHER, VT, Custom);
1444 }
1445 }
1446
1447 // This block controls legalization of the mask vector sizes that are
1448 // available with AVX512. 512-bit vectors are in a separate block controlled
1449 // by useAVX512Regs.
1450 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1451 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1452 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1453 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1454 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1455 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1456
1457 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1458 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1459 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1460
1461 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1462 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1463 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1464 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1465 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1466 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1467 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1468 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1469 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1470 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1471 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1472 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1473
1474 // There is no byte-sized k-register load or store without AVX512DQ.
1475 if (!Subtarget.hasDQI()) {
1476 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1477 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1478 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1479 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1480
1481 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1482 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1483 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1484 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1485 }
1486
1487 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1488 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1489 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1490 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1491 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1492 }
1493
1494 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1495 setOperationAction(ISD::VSELECT, VT, Expand);
1496
1497 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1498 setOperationAction(ISD::SETCC, VT, Custom);
1499 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1500 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1501 setOperationAction(ISD::SELECT, VT, Custom);
1502 setOperationAction(ISD::TRUNCATE, VT, Custom);
1503
1504 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1505 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1506 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1507 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1508 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1509 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1510 }
1511
1512 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1513 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1514 }
1515
1516 // This block controls legalization for 512-bit operations with 32/64-bit
1517 // elements. 512-bit operations can be disabled based on the prefer-vector-width
1518 // and required-vector-width function attributes.
1519 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1520 bool HasBWI = Subtarget.hasBWI();
1521
1522 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1523 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1525 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1526 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1527 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1528
1529 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1530 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1531 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1532 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1533 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1534 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1535 if (HasBWI)
1536 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1537 }
1538
1539 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1540 setOperationAction(ISD::FNEG, VT, Custom);
1541 setOperationAction(ISD::FABS, VT, Custom);
1542 setOperationAction(ISD::FMA, VT, Legal);
1543 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1544 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1545 }
1546
1547 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1548 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1549 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1550 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1551 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1552 }
1553 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1554 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1555 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1556 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1557 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1558 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1559 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1560 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1561
1562 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1563 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1564 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1565 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1566 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1567 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1568 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1569 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1570 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1571 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1572 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1573 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1574
1575 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1576 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1577 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1578 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1579 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1580 if (HasBWI)
1581 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1582
1583 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1584 // to 512-bit rather than use the AVX2 instructions so that we can use
1585 // k-masks.
1586 if (!Subtarget.hasVLX()) {
1587 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1588 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1589 setOperationAction(ISD::MLOAD, VT, Custom);
1590 setOperationAction(ISD::MSTORE, VT, Custom);
1591 }
1592 }
1593
1594 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1595 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1596 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1597 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1598 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1599 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1600 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1601 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1602 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1603 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1604 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1605 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1606 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1607
1608 if (HasBWI) {
1609 // Extends from v64i1 masks to 512-bit vectors.
1610 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1611 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1612 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1613 }
1614
1615 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1616 setOperationAction(ISD::FFLOOR, VT, Legal);
1617 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1618 setOperationAction(ISD::FCEIL, VT, Legal);
1619 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1620 setOperationAction(ISD::FTRUNC, VT, Legal);
1621 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1622 setOperationAction(ISD::FRINT, VT, Legal);
1623 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1624 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1625 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1626 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1627 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1628
1629 setOperationAction(ISD::FROUND, VT, Custom);
1630 }
1631
1632 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1633 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1634 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1635 }
1636
1637 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1638 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1639 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1640 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1641
1642 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1643 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1644 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1645 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1646
1647 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1648 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1649 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1650 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1651 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1652 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1653
1654 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1655 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1656
1657 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1658
1659 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1660 setOperationAction(ISD::SRL, VT, Custom);
1661 setOperationAction(ISD::SHL, VT, Custom);
1662 setOperationAction(ISD::SRA, VT, Custom);
1663 setOperationAction(ISD::SETCC, VT, Custom);
1664
1665 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1666 // setcc all the way to isel and prefer SETGT in some isel patterns.
1667 setCondCodeAction(ISD::SETLT, VT, Custom);
1668 setCondCodeAction(ISD::SETLE, VT, Custom);
1669 }
1670 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1671 setOperationAction(ISD::SMAX, VT, Legal);
1672 setOperationAction(ISD::UMAX, VT, Legal);
1673 setOperationAction(ISD::SMIN, VT, Legal);
1674 setOperationAction(ISD::UMIN, VT, Legal);
1675 setOperationAction(ISD::ABS, VT, Legal);
1676 setOperationAction(ISD::CTPOP, VT, Custom);
1677 setOperationAction(ISD::ROTL, VT, Custom);
1678 setOperationAction(ISD::ROTR, VT, Custom);
1679 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1680 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1681 }
1682
1683 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1684 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1685 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1686 setOperationAction(ISD::CTLZ, VT, Custom);
1687 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1688 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1692 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1693 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1694 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1695 }
1696
1697 if (Subtarget.hasDQI()) {
1698 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1699 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1700 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1701 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1702 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1703 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1704 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1705 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1706
1707 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1708 }
1709
1710 if (Subtarget.hasCDI()) {
1711 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1712 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1713 setOperationAction(ISD::CTLZ, VT, Legal);
1714 }
1715 } // Subtarget.hasCDI()
1716
1717 if (Subtarget.hasVPOPCNTDQ()) {
1718 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1719 setOperationAction(ISD::CTPOP, VT, Legal);
1720 }
1721
1722 // Extract subvector is special because the value type
1723 // (result) is 256-bit but the source is 512-bit wide.
1724 // 128-bit was made Legal under AVX1.
1725 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1726 MVT::v8f32, MVT::v4f64 })
1727 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1728
1729 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1730 MVT::v16f32, MVT::v8f64 }) {
1731 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1732 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1733 setOperationAction(ISD::SELECT, VT, Custom);
1734 setOperationAction(ISD::VSELECT, VT, Custom);
1735 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1736 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1737 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1738 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1739 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1740 }
1741
1742 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1743 setOperationAction(ISD::MLOAD, VT, Legal);
1744 setOperationAction(ISD::MSTORE, VT, Legal);
1745 setOperationAction(ISD::MGATHER, VT, Custom);
1746 setOperationAction(ISD::MSCATTER, VT, Custom);
1747 }
1748 if (HasBWI) {
1749 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1750 setOperationAction(ISD::MLOAD, VT, Legal);
1751 setOperationAction(ISD::MSTORE, VT, Legal);
1752 }
1753 } else {
1754 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1755 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1756 }
1757
1758 if (Subtarget.hasVBMI2()) {
1759 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1760 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1761 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1762 setOperationAction(ISD::FSHL, VT, Custom);
1763 setOperationAction(ISD::FSHR, VT, Custom);
1764 }
1765
1766 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1767 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1768 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1769 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1770 }
1771 } // useAVX512Regs
1772
1773 // This block controls legalization for operations that don't have
1774 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1775 // narrower widths.
1776 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1777 // These operations are handled on non-VLX by artificially widening in
1778 // isel patterns.
1779
1780 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1781 Subtarget.hasVLX() ? Legal : Custom);
1782 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1783 Subtarget.hasVLX() ? Legal : Custom);
1784 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1785 Subtarget.hasVLX() ? Legal : Custom);
1786 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1787 Subtarget.hasVLX() ? Legal : Custom);
1788 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1789 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1790 Subtarget.hasVLX() ? Legal : Custom);
1791 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1792 Subtarget.hasVLX() ? Legal : Custom);
1793 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1794 Subtarget.hasVLX() ? Legal : Custom);
1795 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1796 Subtarget.hasVLX() ? Legal : Custom);
1797
1798 if (Subtarget.hasDQI()) {
1799 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1800 // v2f32 UINT_TO_FP is already custom under SSE2.
1801 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1802 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1803 "Unexpected operation action!");
1804 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1805 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1806 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1807 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1808 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1809 }
1810
1811 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1812 setOperationAction(ISD::SMAX, VT, Legal);
1813 setOperationAction(ISD::UMAX, VT, Legal);
1814 setOperationAction(ISD::SMIN, VT, Legal);
1815 setOperationAction(ISD::UMIN, VT, Legal);
1816 setOperationAction(ISD::ABS, VT, Legal);
1817 }
1818
1819 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1820 setOperationAction(ISD::ROTL, VT, Custom);
1821 setOperationAction(ISD::ROTR, VT, Custom);
1822 }
1823
1824 // Custom legalize 2x32 to get a little better code.
1825 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1826 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1827
1828 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1829 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1830 setOperationAction(ISD::MSCATTER, VT, Custom);
1831
1832 if (Subtarget.hasDQI()) {
1833 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1834 setOperationAction(ISD::SINT_TO_FP, VT,
1835 Subtarget.hasVLX() ? Legal : Custom);
1836 setOperationAction(ISD::UINT_TO_FP, VT,
1837 Subtarget.hasVLX() ? Legal : Custom);
1838 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1839 Subtarget.hasVLX() ? Legal : Custom);
1840 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1841 Subtarget.hasVLX() ? Legal : Custom);
1842 setOperationAction(ISD::FP_TO_SINT, VT,
1843 Subtarget.hasVLX() ? Legal : Custom);
1844 setOperationAction(ISD::FP_TO_UINT, VT,
1845 Subtarget.hasVLX() ? Legal : Custom);
1846 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1847 Subtarget.hasVLX() ? Legal : Custom);
1848 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1849 Subtarget.hasVLX() ? Legal : Custom);
1850 setOperationAction(ISD::MUL, VT, Legal);
1851 }
1852 }
1853
1854 if (Subtarget.hasCDI()) {
1855 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1856 setOperationAction(ISD::CTLZ, VT, Legal);
1857 }
1858 } // Subtarget.hasCDI()
1859
1860 if (Subtarget.hasVPOPCNTDQ()) {
1861 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1862 setOperationAction(ISD::CTPOP, VT, Legal);
1863 }
1864 }
1865
1866 // This block controls legalization of v32i1/v64i1, which are available with
1867 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1868 // useBWIRegs.
1869 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1870 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1871 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1872
1873 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1874 setOperationAction(ISD::VSELECT, VT, Expand);
1875 setOperationAction(ISD::TRUNCATE, VT, Custom);
1876 setOperationAction(ISD::SETCC, VT, Custom);
1877 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1878 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1879 setOperationAction(ISD::SELECT, VT, Custom);
1880 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1881 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1882 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1883 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1884 }
1885
1886 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1887 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1888
1889 // Extends from v32i1 masks to 256-bit vectors.
1890 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1891 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1892 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1893
1894 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1895 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1896 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1897 }
1898
1899 // These operations are handled on non-VLX by artificially widening in
1900 // isel patterns.
1901 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1902
1903 if (Subtarget.hasBITALG()) {
1904 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1905 setOperationAction(ISD::CTPOP, VT, Legal);
1906 }
1907 }
1908
1909 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
1910 auto setGroup = [&] (MVT VT) {
1911 setOperationAction(ISD::FADD, VT, Legal);
1912 setOperationAction(ISD::STRICT_FADD, VT, Legal);
1913 setOperationAction(ISD::FSUB, VT, Legal);
1914 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
1915 setOperationAction(ISD::FMUL, VT, Legal);
1916 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
1917 setOperationAction(ISD::FDIV, VT, Legal);
1918 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
1919 setOperationAction(ISD::FSQRT, VT, Legal);
1920 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
1921
1922 setOperationAction(ISD::FFLOOR, VT, Legal);
1923 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1924 setOperationAction(ISD::FCEIL, VT, Legal);
1925 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1926 setOperationAction(ISD::FTRUNC, VT, Legal);
1927 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1928 setOperationAction(ISD::FRINT, VT, Legal);
1929 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1930 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1931 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1932
1933 setOperationAction(ISD::LOAD, VT, Legal);
1934 setOperationAction(ISD::STORE, VT, Legal);
1935
1936 setOperationAction(ISD::FMA, VT, Legal);
1937 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1938 setOperationAction(ISD::VSELECT, VT, Legal);
1939 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1940 setOperationAction(ISD::SELECT, VT, Custom);
1941
1942 setOperationAction(ISD::FNEG, VT, Custom);
1943 setOperationAction(ISD::FABS, VT, Custom);
1944 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1945 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1946 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1947 };
1948
1949 // AVX512_FP16 scalar operations
1950 setGroup(MVT::f16);
1951 addRegisterClass(MVT::f16, &X86::FR16XRegClass);
1952 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
1953 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
1954 setOperationAction(ISD::SETCC, MVT::f16, Custom);
1955 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
1956 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1957 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
1958 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
1959 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
1960 if (isTypeLegal(MVT::f80)) {
1961 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
1962 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
1963 }
1964
1965 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
1966 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
1967
1968 if (Subtarget.useAVX512Regs()) {
1969 setGroup(MVT::v32f16);
1970 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1971 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
1972 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
1973 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
1974 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
1975 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
1976 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
1977 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1978 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
1979
1980 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
1981 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
1982 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
1983 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
1984 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
1985 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
1986 MVT::v32i16);
1987 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
1988 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
1989 MVT::v32i16);
1990 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
1991 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
1992 MVT::v32i16);
1993 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
1994 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
1995 MVT::v32i16);
1996
1997 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
1998 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
1999 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2000
2001 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2002 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2003
2004 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2005 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2006 }
2007
2008 if (Subtarget.hasVLX()) {
2009 addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
2010 addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
2011 setGroup(MVT::v8f16);
2012 setGroup(MVT::v16f16);
2013
2014 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2015 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2016 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2017 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2018 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2019 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2020 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2021 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2022 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2023 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2024
2025 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2026 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2027 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2028 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2029 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2030 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2031
2032 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2033 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2034 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2035
2036 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2037 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2038 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2039
2040 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2041 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2042 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2043 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2044
2045 // Need to custom widen these to prevent scalarization.
2046 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2047 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2048 }
2049
2050 // Support fp16 0 immediate
2051 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
2052 }
2053
2054 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2055 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2056 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2057 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2058 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2059 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2060
2061 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2062 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2063 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2064 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2065 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2066
2067 if (Subtarget.hasBWI()) {
2068 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2069 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2070 }
2071
2072 if (Subtarget.hasFP16()) {
2073 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2074 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2075 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2076 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2077 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2078 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2079 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2080 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2081 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2082 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2083 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2084 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2085 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2086 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2087 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2088 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2089 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2090 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2091 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2092 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2093 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2094 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2095 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2096 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2097 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2098 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2099 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2100 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2101 }
2102
2103 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2104 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2105 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2106 }
2107
2108 if (Subtarget.hasAMXTILE()) {
2109 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2110 }
2111
2112 // We want to custom lower some of our intrinsics.
2113 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2114 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2115 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2116 if (!Subtarget.is64Bit()) {
2117 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2118 }
2119
2120 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2121 // handle type legalization for these operations here.
2122 //
2123 // FIXME: We really should do custom legalization for addition and
2124 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2125 // than generic legalization for 64-bit multiplication-with-overflow, though.
2126 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2127 if (VT == MVT::i64 && !Subtarget.is64Bit())
2128 continue;
2129 // Add/Sub/Mul with overflow operations are custom lowered.
2130 setOperationAction(ISD::SADDO, VT, Custom);
2131 setOperationAction(ISD::UADDO, VT, Custom);
2132 setOperationAction(ISD::SSUBO, VT, Custom);
2133 setOperationAction(ISD::USUBO, VT, Custom);
2134 setOperationAction(ISD::SMULO, VT, Custom);
2135 setOperationAction(ISD::UMULO, VT, Custom);
2136
2137 // Support carry-in as a value rather than glue.
2138 setOperationAction(ISD::ADDCARRY, VT, Custom);
2139 setOperationAction(ISD::SUBCARRY, VT, Custom);
2140 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2141 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2142 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2143 }
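// Illustrative note (hedged, for orientation): SADDO/UADDO and friends back the
// llvm.*.with.overflow intrinsics, e.g.
//   %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// so the Custom markings above decide how the {value, overflow-bit} pair is
// materialized, and ADDCARRY/SUBCARRY model the carry as an ordinary value.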
2144
2145 if (!Subtarget.is64Bit()) {
2146 // These libcalls are not available in 32-bit.
2147 setLibcallName(RTLIB::SHL_I128, nullptr);
2148 setLibcallName(RTLIB::SRL_I128, nullptr);
2149 setLibcallName(RTLIB::SRA_I128, nullptr);
2150 setLibcallName(RTLIB::MUL_I128, nullptr);
2151 setLibcallName(RTLIB::MULO_I128, nullptr);
2152 }
2153
2154 // Combine sin / cos into _sincos_stret if it is available.
2155 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2156 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2157 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2158 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2159 }
2160
2161 if (Subtarget.isTargetWin64()) {
2162 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2163 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2164 setOperationAction(ISD::SREM, MVT::i128, Custom);
2165 setOperationAction(ISD::UREM, MVT::i128, Custom);
2166 }
2167
2168 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2169 // is. We should promote the value to 64 bits to solve this.
2170 // This is what the CRT headers do - `fmodf` is an inline header
2171 // function casting to f64 and calling `fmod`.
2172 if (Subtarget.is32Bit() &&
2173 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2174 for (ISD::NodeType Op :
2175 {ISD::FCEIL, ISD::STRICT_FCEIL,
2176 ISD::FCOS, ISD::STRICT_FCOS,
2177 ISD::FEXP, ISD::STRICT_FEXP,
2178 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2179 ISD::FREM, ISD::STRICT_FREM,
2180 ISD::FLOG, ISD::STRICT_FLOG,
2181 ISD::FLOG10, ISD::STRICT_FLOG10,
2182 ISD::FPOW, ISD::STRICT_FPOW,
2183 ISD::FSIN, ISD::STRICT_FSIN})
2184 if (isOperationExpand(Op, MVT::f32))
2185 setOperationAction(Op, MVT::f32, Promote);
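// Illustrative effect of the Promote above (a sketch): an f32 libcall such as
// fmodf(x, y) ends up as (float)fmod((double)x, (double)y), mirroring what the
// MSVC CRT's inline fmodf wrapper does anyway.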
2186
2187 // We have target-specific DAG combine patterns for the following nodes:
2188 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2189 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2190 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2191 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2192 setTargetDAGCombine(ISD::CONCAT_VECTORS);
2193 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2194 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2195 setTargetDAGCombine(ISD::BITCAST);
2196 setTargetDAGCombine(ISD::VSELECT);
2197 setTargetDAGCombine(ISD::SELECT);
2198 setTargetDAGCombine(ISD::SHL);
2199 setTargetDAGCombine(ISD::SRA);
2200 setTargetDAGCombine(ISD::SRL);
2201 setTargetDAGCombine(ISD::OR);
2202 setTargetDAGCombine(ISD::AND);
2203 setTargetDAGCombine(ISD::ADD);
2204 setTargetDAGCombine(ISD::FADD);
2205 setTargetDAGCombine(ISD::FSUB);
2206 setTargetDAGCombine(ISD::FNEG);
2207 setTargetDAGCombine(ISD::FMA);
2208 setTargetDAGCombine(ISD::STRICT_FMA);
2209 setTargetDAGCombine(ISD::FMINNUM);
2210 setTargetDAGCombine(ISD::FMAXNUM);
2211 setTargetDAGCombine(ISD::SUB);
2212 setTargetDAGCombine(ISD::LOAD);
2213 setTargetDAGCombine(ISD::MLOAD);
2214 setTargetDAGCombine(ISD::STORE);
2215 setTargetDAGCombine(ISD::MSTORE);
2216 setTargetDAGCombine(ISD::TRUNCATE);
2217 setTargetDAGCombine(ISD::ZERO_EXTEND);
2218 setTargetDAGCombine(ISD::ANY_EXTEND);
2219 setTargetDAGCombine(ISD::SIGN_EXTEND);
2220 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2221 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2222 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2223 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2224 setTargetDAGCombine(ISD::SINT_TO_FP);
2225 setTargetDAGCombine(ISD::UINT_TO_FP);
2226 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2227 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2228 setTargetDAGCombine(ISD::SETCC);
2229 setTargetDAGCombine(ISD::MUL);
2230 setTargetDAGCombine(ISD::XOR);
2231 setTargetDAGCombine(ISD::MSCATTER);
2232 setTargetDAGCombine(ISD::MGATHER);
2233 setTargetDAGCombine(ISD::FP16_TO_FP);
2234 setTargetDAGCombine(ISD::FP_EXTEND);
2235 setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2236 setTargetDAGCombine(ISD::FP_ROUND);
2237
2238 computeRegisterProperties(Subtarget.getRegisterInfo());
2239
2240 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2241 MaxStoresPerMemsetOptSize = 8;
2242 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2243 MaxStoresPerMemcpyOptSize = 4;
2244 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2245 MaxStoresPerMemmoveOptSize = 4;
2246
2247 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2248 // that needs to be benchmarked and balanced with the potential use of vector
2249 // load/store types (PR33329, PR33914).
2250 MaxLoadsPerMemcmp = 2;
2251 MaxLoadsPerMemcmpOptSize = 2;
2252
2253 // Default loop alignment, which can be overridden by -align-loops.
2254 setPrefLoopAlignment(Align(16));
2255
2256 // An out-of-order CPU can speculatively execute past a predictable branch,
2257 // but a conditional move could be stalled by an expensive earlier operation.
2258 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2259 EnableExtLdPromotion = true;
2260 setPrefFunctionAlignment(Align(16));
2261
2262 verifyIntrinsicTables();
2263
2264 // Default to having -disable-strictnode-mutation on
2265 IsStrictFPEnabled = true;
2266}
2267
2268// This has so far only been implemented for 64-bit MachO.
2269bool X86TargetLowering::useLoadStackGuardNode() const {
2270 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2271}
2272
2273bool X86TargetLowering::useStackGuardXorFP() const {
2274 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2275 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2276}
2277
2278SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2279 const SDLoc &DL) const {
2280 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2281 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2282 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2283 return SDValue(Node, 0);
2284}
2285
2286TargetLoweringBase::LegalizeTypeAction
2287X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2288 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2289 !Subtarget.hasBWI())
2290 return TypeSplitVector;
2291
2292 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2293 VT.getVectorElementType() != MVT::i1)
2294 return TypeWidenVector;
2295
2296 return TargetLoweringBase::getPreferredVectorAction(VT);
2297}
2298
2299static std::pair<MVT, unsigned>
2300handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2301 const X86Subtarget &Subtarget) {
2302 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2303 // convention is one that uses k registers.
2304 if (NumElts == 2)
2305 return {MVT::v2i64, 1};
2306 if (NumElts == 4)
2307 return {MVT::v4i32, 1};
2308 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2309 CC != CallingConv::Intel_OCL_BI)
2310 return {MVT::v8i16, 1};
2311 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2312 CC != CallingConv::Intel_OCL_BI)
2313 return {MVT::v16i8, 1};
2314 // v32i1 passes in ymm unless we have BWI and the calling convention is
2315 // regcall.
2316 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2317 return {MVT::v32i8, 1};
2318 // Split v64i1 vectors if we don't have v64i8 available.
2319 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2320 if (Subtarget.useAVX512Regs())
2321 return {MVT::v64i8, 1};
2322 return {MVT::v32i8, 2};
2323 }
2324
2325 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2326 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2327 NumElts > 64)
2328 return {MVT::i8, NumElts};
2329
2330 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2331}
2332
2333MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2334 CallingConv::ID CC,
2335 EVT VT) const {
2336 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2337 Subtarget.hasAVX512()) {
2338 unsigned NumElts = VT.getVectorNumElements();
2339
2340 MVT RegisterVT;
2341 unsigned NumRegisters;
2342 std::tie(RegisterVT, NumRegisters) =
2343 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2344 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2345 return RegisterVT;
2346 }
2347
2348 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16,
2349 // so its default register type is f16. We override the type to v8f16 here.
2350 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2351 return MVT::v8f16;
2352
2353 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2354}
2355
2356unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2357 CallingConv::ID CC,
2358 EVT VT) const {
2359 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2360 Subtarget.hasAVX512()) {
2361 unsigned NumElts = VT.getVectorNumElements();
2362
2363 MVT RegisterVT;
2364 unsigned NumRegisters;
2365 std::tie(RegisterVT, NumRegisters) =
2366 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2367 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2368 return NumRegisters;
2369 }
2370
2371 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16,
2372 // so its default register number is 3. We override the number to 1 here.
2373 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2374 return 1;
2375
2376 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2377}
2378
2379unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2380 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2381 unsigned &NumIntermediates, MVT &RegisterVT) const {
2382 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2383 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2384 Subtarget.hasAVX512() &&
2385 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2386 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2387 VT.getVectorNumElements() > 64)) {
2388 RegisterVT = MVT::i8;
2389 IntermediateVT = MVT::i1;
2390 NumIntermediates = VT.getVectorNumElements();
2391 return NumIntermediates;
2392 }
2393
2394 // Split v64i1 vectors if we don't have v64i8 available.
2395 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2396 CC != CallingConv::X86_RegCall) {
2397 RegisterVT = MVT::v32i8;
2398 IntermediateVT = MVT::v32i1;
2399 NumIntermediates = 2;
2400 return 2;
2401 }
2402
2403 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2404 NumIntermediates, RegisterVT);
2405}
2406
2407EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2408 LLVMContext& Context,
2409 EVT VT) const {
2410 if (!VT.isVector())
2411 return MVT::i8;
2412
2413 if (Subtarget.hasAVX512()) {
2414 // Figure out what this type will be legalized to.
2415 EVT LegalVT = VT;
2416 while (getTypeAction(Context, LegalVT) != TypeLegal)
2417 LegalVT = getTypeToTransformTo(Context, LegalVT);
2418
2419 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2420 if (LegalVT.getSimpleVT().is512BitVector())
2421 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2422
2423 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2424 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2425 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2426 // vXi16/vXi8.
2427 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2428 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2429 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2430 }
2431 }
2432
2433 return VT.changeVectorElementTypeToInteger();
2434}
2435
2436/// Helper for getByValTypeAlignment to determine
2437/// the desired ByVal argument alignment.
2438static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2439 if (MaxAlign == 16)
2440 return;
2441 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2442 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2443 MaxAlign = Align(16);
2444 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2445 Align EltAlign;
2446 getMaxByValAlign(ATy->getElementType(), EltAlign);
2447 if (EltAlign > MaxAlign)
2448 MaxAlign = EltAlign;
2449 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2450 for (auto *EltTy : STy->elements()) {
2451 Align EltAlign;
2452 getMaxByValAlign(EltTy, EltAlign);
2453 if (EltAlign > MaxAlign)
2454 MaxAlign = EltAlign;
2455 if (MaxAlign == 16)
2456 break;
2457 }
2458 }
2459}
2460
2461/// Return the desired alignment for ByVal aggregate
2462/// function arguments in the caller parameter area. For X86, aggregates
2463/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2464/// are at 4-byte boundaries.
2465unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2466 const DataLayout &DL) const {
2467 if (Subtarget.is64Bit()) {
2468 // Max of 8 and alignment of type.
2469 Align TyAlign = DL.getABITypeAlign(Ty);
2470 if (TyAlign > 8)
2471 return TyAlign.value();
2472 return 8;
2473 }
2474
2475 Align Alignment(4);
2476 if (Subtarget.hasSSE1())
2477 getMaxByValAlign(Ty, Alignment);
2478 return Alignment.value();
2479}
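// Example: on x86-64 a byval struct whose ABI alignment is 16 (e.g. one that
// contains a __m128 member) keeps that alignment, while smaller aggregates get
// the 8-byte minimum; on 32-bit targets with SSE the walk above only bumps the
// result to 16 when a 128-bit vector is found somewhere in the aggregate.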
2480
2481/// It returns EVT::Other if the type should be determined using generic
2482/// target-independent logic.
2483/// For vector ops we check that the overall size isn't larger than our
2484/// preferred vector width.
2485EVT X86TargetLowering::getOptimalMemOpType(
2486 const MemOp &Op, const AttributeList &FuncAttributes) const {
2487 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2488 if (Op.size() >= 16 &&
2489 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2490 // FIXME: Check if unaligned 64-byte accesses are slow.
2491 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2492 (Subtarget.getPreferVectorWidth() >= 512)) {
2493 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2494 }
2495 // FIXME: Check if unaligned 32-byte accesses are slow.
2496 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2497 (Subtarget.getPreferVectorWidth() >= 256)) {
2498 // Although this isn't a well-supported type for AVX1, we'll let
2499 // legalization and shuffle lowering produce the optimal codegen. If we
2500 // choose an optimal type with a vector element larger than a byte,
2501 // getMemsetStores() may create an intermediate splat (using an integer
2502 // multiply) before we splat as a vector.
2503 return MVT::v32i8;
2504 }
2505 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2506 return MVT::v16i8;
2507 // TODO: Can SSE1 handle a byte vector?
2508 // If we have SSE1 registers we should be able to use them.
2509 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2510 (Subtarget.getPreferVectorWidth() >= 128))
2511 return MVT::v4f32;
2512 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2513 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2514 // Do not use f64 to lower memcpy if source is string constant. It's
2515 // better to use i32 to avoid the loads.
2516 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2517 // The gymnastics of splatting a byte value into an XMM register and then
2518 // only using 8-byte stores (because this is a CPU with slow unaligned
2519 // 16-byte accesses) makes that a loser.
2520 return MVT::f64;
2521 }
2522 }
2523 // This is a compromise. If we reach here, unaligned accesses may be slow on
2524 // this target. However, creating smaller, aligned accesses could be even
2525 // slower and would certainly be a lot more code.
2526 if (Subtarget.is64Bit() && Op.size() >= 8)
2527 return MVT::i64;
2528 return MVT::i32;
2529}
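// A few choices implied by the logic above, assuming no NoImplicitFloat
// attribute and fast unaligned accesses:
//   128-byte memset, AVX512BW, 512-bit preference -> v64i8 stores
//   128-byte memset, AVX only                     -> v32i8 stores
//   12-byte copy on a 64-bit target               -> i64 chunks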
2530
2531bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2532 if (VT == MVT::f32)
2533 return X86ScalarSSEf32;
2534 if (VT == MVT::f64)
2535 return X86ScalarSSEf64;
2536 return true;
2537}
2538
2539bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2540 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2541 bool *Fast) const {
2542 if (Fast) {
2543 switch (VT.getSizeInBits()) {
2544 default:
2545 // 8-byte and under are always assumed to be fast.
2546 *Fast = true;
2547 break;
2548 case 128:
2549 *Fast = !Subtarget.isUnalignedMem16Slow();
2550 break;
2551 case 256:
2552 *Fast = !Subtarget.isUnalignedMem32Slow();
2553 break;
2554 // TODO: What about AVX-512 (512-bit) accesses?
2555 }
2556 }
2557 // NonTemporal vector memory ops must be aligned.
2558 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2559 // NT loads can only be vector aligned, so if it's less aligned than the
2560 // minimum vector size (which we can split the vector down to), we might as
2561 // well use a regular unaligned vector load.
2562 // We don't have any NT loads pre-SSE41.
2563 if (!!(Flags & MachineMemOperand::MOLoad))
2564 return (Alignment < 16 || !Subtarget.hasSSE41());
2565 return false;
2566 }
2567 // Misaligned accesses of any size are always allowed.
2568 return true;
2569}
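// For example, a misaligned ordinary 256-bit load reports *Fast based on
// isUnalignedMem32Slow(); a non-temporal vector store, on the other hand, is
// never reported as supporting misalignment here, since NT stores must be
// aligned.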
2570
2571/// Return the entry encoding for a jump table in the
2572/// current function. The returned value is a member of the
2573/// MachineJumpTableInfo::JTEntryKind enum.
2574unsigned X86TargetLowering::getJumpTableEncoding() const {
2575 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2576 // symbol.
2577 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2578 return MachineJumpTableInfo::EK_Custom32;
2579
2580 // Otherwise, use the normal jump table encoding heuristics.
2581 return TargetLowering::getJumpTableEncoding();
2582}
2583
2584bool X86TargetLowering::useSoftFloat() const {
2585 return Subtarget.useSoftFloat();
2586}
2587
2588void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2589 ArgListTy &Args) const {
2590
2591 // Only relabel X86-32 for C / Stdcall CCs.
2592 if (Subtarget.is64Bit())
2593 return;
2594 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2595 return;
2596 unsigned ParamRegs = 0;
2597 if (auto *M = MF->getFunction().getParent())
2598 ParamRegs = M->getNumberRegisterParameters();
2599
2600 // Mark the first N int arguments as being passed in registers.
2601 for (auto &Arg : Args) {
2602 Type *T = Arg.Ty;
2603 if (T->isIntOrPtrTy())
2604 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2605 unsigned numRegs = 1;
2606 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2607 numRegs = 2;
2608 if (ParamRegs < numRegs)
2609 return;
2610 ParamRegs -= numRegs;
2611 Arg.IsInReg = true;
2612 }
2613 }
2614}
2615
2616const MCExpr *
2617X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2618 const MachineBasicBlock *MBB,
2619 unsigned uid,MCContext &Ctx) const{
2620 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2621 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2622 // entries.
2623 return MCSymbolRefExpr::create(MBB->getSymbol(),
2624 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2625}
2626
2627/// Returns relocation base for the given PIC jumptable.
2628SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2629 SelectionDAG &DAG) const {
2630 if (!Subtarget.is64Bit())
2631 // This doesn't have SDLoc associated with it, but is not really the
2632 // same as a Register.
2633 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2634 getPointerTy(DAG.getDataLayout()));
2635 return Table;
2636}
2637
2638/// This returns the relocation base for the given PIC jumptable,
2639/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2640const MCExpr *X86TargetLowering::
2641getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2642 MCContext &Ctx) const {
2643 // X86-64 uses RIP relative addressing based on the jump table label.
2644 if (Subtarget.isPICStyleRIPRel())
2645 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2646
2647 // Otherwise, the reference is relative to the PIC base.
2648 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2649}
2650
2651std::pair<const TargetRegisterClass *, uint8_t>
2652X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2653 MVT VT) const {
2654 const TargetRegisterClass *RRC = nullptr;
2655 uint8_t Cost = 1;
2656 switch (VT.SimpleTy) {
2657 default:
2658 return TargetLowering::findRepresentativeClass(TRI, VT);
2659 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2660 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2661 break;
2662 case MVT::x86mmx:
2663 RRC = &X86::VR64RegClass;
2664 break;
2665 case MVT::f32: case MVT::f64:
2666 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2667 case MVT::v4f32: case MVT::v2f64:
2668 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2669 case MVT::v8f32: case MVT::v4f64:
2670 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2671 case MVT::v16f32: case MVT::v8f64:
2672 RRC = &X86::VR128XRegClass;
2673 break;
2674 }
2675 return std::make_pair(RRC, Cost);
2676}
2677
2678unsigned X86TargetLowering::getAddressSpace() const {
2679 if (Subtarget.is64Bit())
2680 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2681 return 256;
2682}
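// These numbers are the segment address spaces used for TLS-relative accesses:
// 256 corresponds to %gs and 257 to %fs (see X86AS), so the 64-bit non-kernel
// case selects %fs and the kernel code model selects %gs.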
2683
2684static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2685 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2686 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2687}
2688
2689static Constant* SegmentOffset(IRBuilderBase &IRB,
2690 int Offset, unsigned AddressSpace) {
2691 return ConstantExpr::getIntToPtr(
2692 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2693 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2694}
2695
2696Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2697 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2698 // tcbhead_t; use it instead of the usual global variable (see
2699 // sysdeps/{i386,x86_64}/nptl/tls.h)
2700 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2701 if (Subtarget.isTargetFuchsia()) {
2702 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2703 return SegmentOffset(IRB, 0x10, getAddressSpace());
2704 } else {
2705 unsigned AddressSpace = getAddressSpace();
2706 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2707 // Some users may customize the base register and offset.
2708 int Offset = M->getStackProtectorGuardOffset();
2709 // If -stack-protector-guard-offset was not set, default to
2710 // %fs:0x28, unless we're using a Kernel code model, in which case
2711 // it's %gs:0x28. %gs:0x14 on i386.
2712 if (Offset == INT_MAX)
2713 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2714
2715 StringRef GuardReg = M->getStackProtectorGuardReg();
2716 if (GuardReg == "fs")
2717 AddressSpace = X86AS::FS;
2718 else if (GuardReg == "gs")
2719 AddressSpace = X86AS::GS;
2720 return SegmentOffset(IRB, Offset, AddressSpace);
2721 }
2722 }
2723 return TargetLowering::getIRStackGuard(IRB);
2724}
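// A minimal sketch of what this returns for a default 64-bit Linux target:
// a constant of the form
//   inttoptr (i32 40 to i8* addrspace(257)*)
// from which the stack protector loads the guard value, i.e. %fs:0x28.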
2725
2726void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2727 // MSVC CRT provides functionalities for stack protection.
2728 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2729 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2730 // MSVC CRT has a global variable holding security cookie.
2731 M.getOrInsertGlobal("__security_cookie",
2732 Type::getInt8PtrTy(M.getContext()));
2733
2734 // MSVC CRT has a function to validate security cookie.
2735 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2736 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2737 Type::getInt8PtrTy(M.getContext()));
2738 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2739 F->setCallingConv(CallingConv::X86_FastCall);
2740 F->addParamAttr(0, Attribute::AttrKind::InReg);
2741 }
2742 return;
2743 }
2744
2745 StringRef GuardMode = M.getStackProtectorGuard();
2746
2747 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2748 if ((GuardMode == "tls" || GuardMode.empty()) &&
2749 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2750 return;
2751 TargetLowering::insertSSPDeclarations(M);
2752}
2753
2754Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2755 // MSVC CRT has a global variable holding security cookie.
2756 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2757 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2758 return M.getGlobalVariable("__security_cookie");
2759 }
2760 return TargetLowering::getSDagStackGuard(M);
2761}
2762
2763Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2764 // MSVC CRT has a function to validate security cookie.
2765 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2766 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2767 return M.getFunction("__security_check_cookie");
2768 }
2769 return TargetLowering::getSSPStackGuardCheck(M);
2770}
2771
2772Value *
2773X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2774 if (Subtarget.getTargetTriple().isOSContiki())
2775 return getDefaultSafeStackPointerLocation(IRB, false);
2776
2777 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2778 // definition of TLS_SLOT_SAFESTACK in
2779 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2780 if (Subtarget.isTargetAndroid()) {
2781 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2782 // %gs:0x48. %gs:0x24 on i386.
2783 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2784 return SegmentOffset(IRB, Offset, getAddressSpace());
2785 }
2786
2787 // Fuchsia is similar.
2788 if (Subtarget.isTargetFuchsia()) {
2789 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2790 return SegmentOffset(IRB, 0x18, getAddressSpace());
2791 }
2792
2793 return TargetLowering::getSafeStackPointerLocation(IRB);
2794}
2795
2796//===----------------------------------------------------------------------===//
2797// Return Value Calling Convention Implementation
2798//===----------------------------------------------------------------------===//
2799
2800bool X86TargetLowering::CanLowerReturn(
2801 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2802 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2803 SmallVector<CCValAssign, 16> RVLocs;
2804 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2805 return CCInfo.CheckReturn(Outs, RetCC_X86);
2806}
2807
2808const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2809 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2810 return ScratchRegs;
2811}
2812
2813/// Lowers mask values (v*i1) to the local register values.
2814/// \returns the DAG node after lowering to the register type.
2815static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2816 const SDLoc &Dl, SelectionDAG &DAG) {
2817 EVT ValVT = ValArg.getValueType();
2818
2819 if (ValVT == MVT::v1i1)
2820 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2821 DAG.getIntPtrConstant(0, Dl));
2822
2823 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2824 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2825 // Two stage lowering might be required
2826 // bitcast: v8i1 -> i8 / v16i1 -> i16
2827 // anyextend: i8 -> i32 / i16 -> i32
2828 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2829 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2830 if (ValLoc == MVT::i32)
2831 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2832 return ValToCopy;
2833 }
2834
2835 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2836 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2837 // One stage lowering is required
2838 // bitcast: v32i1 -> i32 / v64i1 -> i64
2839 return DAG.getBitcast(ValLoc, ValArg);
2840 }
2841
2842 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2843}
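// Example of the two-stage path above: returning a v8i1 mask in an i32
// location is lowered roughly as
//   bitcast v8i1 -> i8, then any_extend i8 -> i32
// while a v32i1 mask in an i32 location only needs the bitcast.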
2844
2845/// Breaks v64i1 value into two registers and adds the new node to the DAG
2846static void Passv64i1ArgInRegs(
2847 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2848 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2849 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2850 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2851 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2852 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2853 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2854 "The value should reside in two registers");
2855
2856 // Before splitting the value we cast it to i64
2857 Arg = DAG.getBitcast(MVT::i64, Arg);
2858
2859 // Splitting the value into two i32 types
2860 SDValue Lo, Hi;
2861 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2862 DAG.getConstant(0, Dl, MVT::i32));
2863 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2864 DAG.getConstant(1, Dl, MVT::i32));
2865
2866 // Attach the two i32 halves to their corresponding registers
2867 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2868 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2869}
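// This is the outgoing counterpart of getv64i1Argument below: the bitcast i64
// is split with EXTRACT_ELEMENT into a low and a high i32, and each half is
// attached to its own location register.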
2870
2871SDValue
2872X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2873 bool isVarArg,
2874 const SmallVectorImpl<ISD::OutputArg> &Outs,
2875 const SmallVectorImpl<SDValue> &OutVals,
2876 const SDLoc &dl, SelectionDAG &DAG) const {
2877 MachineFunction &MF = DAG.getMachineFunction();
2878 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2879
2880 // In some cases we need to disable registers from the default CSR list.
2881 // For example, when they are used for argument passing.
2882 bool ShouldDisableCalleeSavedRegister =
2883 CallConv == CallingConv::X86_RegCall ||
2884 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2885
2886 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2887 report_fatal_error("X86 interrupts may not return any value");
2888
2889 SmallVector<CCValAssign, 16> RVLocs;
2890 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2891 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2892
2893 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2894 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2895 ++I, ++OutsIndex) {
2896 CCValAssign &VA = RVLocs[I];
2897 assert(VA.isRegLoc() && "Can only return in registers!");
2898
2899 // Add the register to the CalleeSaveDisableRegs list.
2900 if (ShouldDisableCalleeSavedRegister)
2901 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2902
2903 SDValue ValToCopy = OutVals[OutsIndex];
2904 EVT ValVT = ValToCopy.getValueType();
2905
2906 // Promote values to the appropriate types.
2907 if (VA.getLocInfo() == CCValAssign::SExt)
2908 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2909 else if (VA.getLocInfo() == CCValAssign::ZExt)
2910 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2911 else if (VA.getLocInfo() == CCValAssign::AExt) {
2912 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2913 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2914 else
2915 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2916 }
2917 else if (VA.getLocInfo() == CCValAssign::BCvt)
2918 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2919
2920 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2921 "Unexpected FP-extend for return value.");
2922
2923 // Report an error if we have attempted to return a value via an XMM
2924 // register and SSE was disabled.
2925 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2926 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2927 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2928 } else if (!Subtarget.hasSSE2() &&
2929 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2930 ValVT == MVT::f64) {
2931 // When returning a double via an XMM register, report an error if SSE2 is
2932 // not enabled.
2933 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2934 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2935 }
2936
2937 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2938 // the RET instruction and handled by the FP Stackifier.
2939 if (VA.getLocReg() == X86::FP0 ||
2940 VA.getLocReg() == X86::FP1) {
2941 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2942 // change the value to the FP stack register class.
2943 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2944 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2945 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2946 // Don't emit a copytoreg.
2947 continue;
2948 }
2949
2950 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2951 // which is returned in RAX / RDX.
2952 if (Subtarget.is64Bit()) {
2953 if (ValVT == MVT::x86mmx) {
2954 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2955 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2956 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2957 ValToCopy);
2958 // If we don't have SSE2 available, convert to v4f32 so the generated
2959 // register is legal.
2960 if (!Subtarget.hasSSE2())
2961 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2962 }
2963 }
2964 }
2965
2966 if (VA.needsCustom()) {
2967 assert(VA.getValVT() == MVT::v64i1 &&
2968 "Currently the only custom case is when we split v64i1 to 2 regs");
2969
2970 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2971 Subtarget);
2972
2973 // Add the second register to the CalleeSaveDisableRegs list.
2974 if (ShouldDisableCalleeSavedRegister)
2975 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2976 } else {
2977 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2978 }
2979 }
2980
2981 SDValue Flag;
2982 SmallVector<SDValue, 6> RetOps;
2983 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2984 // Operand #1 = Bytes To Pop
2985 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2986 MVT::i32));
2987
2988 // Copy the result values into the output registers.
2989 for (auto &RetVal : RetVals) {
2990 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2991 RetOps.push_back(RetVal.second);
2992 continue; // Don't emit a copytoreg.
2993 }
2994
2995 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2996 Flag = Chain.getValue(1);
2997 RetOps.push_back(
2998 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2999 }
3000
3001 // The Swift calling convention does not require us to copy the sret argument
3002 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3003
3004 // All x86 ABIs require that for returning structs by value we copy
3005 // the sret argument into %rax/%eax (depending on ABI) for the return.
3006 // We saved the argument into a virtual register in the entry block,
3007 // so now we copy the value out and into %rax/%eax.
3008 //
3009 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3010 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3011 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3012 // either case FuncInfo->setSRetReturnReg() will have been called.
3013 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3014 // When we have both sret and another return value, we should use the
3015 // original Chain stored in RetOps[0], instead of the current Chain updated
3016 // in the above loop. If we only have sret, RetOps[0] equals Chain.
3017
3018 // For the case of sret and another return value, we have
3019 // Chain_0 at the function entry
3020 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3021 // If we use Chain_1 in getCopyFromReg, we will have
3022 // Val = getCopyFromReg(Chain_1)
3023 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3024
3025 // getCopyToReg(Chain_0) will be glued together with
3026 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3027 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3028 // Data dependency from Unit B to Unit A due to usage of Val in
3029 // getCopyToReg(Chain_1, Val)
3030 // Chain dependency from Unit A to Unit B
3031
3032 // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
3033 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3034 getPointerTy(MF.getDataLayout()));
3035
3036 Register RetValReg
3037 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3038 X86::RAX : X86::EAX;
3039 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3040 Flag = Chain.getValue(1);
3041
3042 // RAX/EAX now acts like a return value.
3043 RetOps.push_back(
3044 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3045
3046 // Add the returned register to the CalleeSaveDisableRegs list.
3047 if (ShouldDisableCalleeSavedRegister)
3048 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3049 }
3050
3051 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3052 const MCPhysReg *I =
3053 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3054 if (I) {
3055 for (; *I; ++I) {
3056 if (X86::GR64RegClass.contains(*I))
3057 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3058 else
3059 llvm_unreachable("Unexpected register class in CSRsViaCopy!")__builtin_unreachable();
3060 }
3061 }
3062
3063 RetOps[0] = Chain; // Update chain.
3064
3065 // Add the flag if we have it.
3066 if (Flag.getNode())
3067 RetOps.push_back(Flag);
3068
3069 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3070 if (CallConv == CallingConv::X86_INTR)
3071 opcode = X86ISD::IRET;
3072 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3073}
3074
3075bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3076 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3077 return false;
3078
3079 SDValue TCChain = Chain;
3080 SDNode *Copy = *N->use_begin();
3081 if (Copy->getOpcode() == ISD::CopyToReg) {
3082 // If the copy has a glue operand, we conservatively assume it isn't safe to
3083 // perform a tail call.
3084 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3085 return false;
3086 TCChain = Copy->getOperand(0);
3087 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3088 return false;
3089
3090 bool HasRet = false;
3091 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
3092 UI != UE; ++UI) {
3093 if (UI->getOpcode() != X86ISD::RET_FLAG)
3094 return false;
3095 // If we are returning more than one value, we can definitely
3096 // not make a tail call; see PR19530.
3097 if (UI->getNumOperands() > 4)
3098 return false;
3099 if (UI->getNumOperands() == 4 &&
3100 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
3101 return false;
3102 HasRet = true;
3103 }
3104
3105 if (!HasRet)
3106 return false;
3107
3108 Chain = TCChain;
3109 return true;
3110}
3111
3112EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3113 ISD::NodeType ExtendKind) const {
3114 MVT ReturnMVT = MVT::i32;
3115
3116 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3117 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3118 // The ABI does not require i1, i8 or i16 to be extended.
3119 //
3120 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3121 // always extending i8/i16 return values, so keep doing that for now.
3122 // (PR26665).
3123 ReturnMVT = MVT::i8;
3124 }
3125
3126 EVT MinVT = getRegisterType(Context, ReturnMVT);
3127 return VT.bitsLT(MinVT) ? MinVT : VT;
3128}
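// Example: an i8 return value on a non-Darwin target keeps MinVT at i8, so no
// extension is forced, whereas on Darwin the same return is widened to i32 to
// preserve Clang's historical behaviour (PR26665 above).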
3129
3130/// Reads two 32 bit registers and creates a 64 bit mask value.
3131/// \param VA The current 32 bit value that needs to be assigned.
3132/// \param NextVA The next 32 bit value that needs to be assigned.
3133/// \param Root The parent DAG node.
3134/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
3135/// for glue purposes. If the DAG is already using a
3136/// physical register instead of a virtual one, we should
3137/// glue our new SDValue to the InFlag SDValue.
3138/// \return a new 64 bit SDValue.
3139static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3140 SDValue &Root, SelectionDAG &DAG,
3141 const SDLoc &Dl, const X86Subtarget &Subtarget,
3142 SDValue *InFlag = nullptr) {
3143 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3144 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3145 assert(VA.getValVT() == MVT::v64i1 &&
3146 "Expecting first location of 64 bit width type");
3147 assert(NextVA.getValVT() == VA.getValVT() &&
3148 "The locations should have the same type");
3149 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3150 "The values should reside in two registers");
3151
3152 SDValue Lo, Hi;
3153 SDValue ArgValueLo, ArgValueHi;
3154
3155 MachineFunction &MF = DAG.getMachineFunction();
3156 const TargetRegisterClass *RC = &X86::GR32RegClass;
3157
3158 // Read a 32 bit value from the registers.
3159 if (nullptr == InFlag) {
3160 // When no physical register is present,
3161 // create an intermediate virtual register.
3162 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3163 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3164 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3165 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3166 } else {
3167 // When a physical register is available read the value from it and glue
3168 // the reads together.
3169 ArgValueLo =
3170 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3171 *InFlag = ArgValueLo.getValue(2);
3172 ArgValueHi =
3173 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3174 *InFlag = ArgValueHi.getValue(2);
3175 }
3176
3177 // Convert the i32 type into v32i1 type.
3178 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3179
3180 // Convert the i32 type into v32i1 type.
3181 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3182
3183 // Concatenate the two values together.
3184 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3185}
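// Sketch for a 32-bit regcall-style caller: a v64i1 argument arrives in two
// 32-bit GPRs; each i32 is bitcast to v32i1 and the halves are concatenated:
//   v64i1 = concat_vectors (bitcast lo_i32), (bitcast hi_i32)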
3186
3187/// Lowers a register of various sizes (8/16/32/64)
3188/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
3189/// \returns a DAG node containing the operand after lowering to the mask type.
3190static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3191 const EVT &ValLoc, const SDLoc &Dl,
3192 SelectionDAG &DAG) {
3193 SDValue ValReturned = ValArg;
3194
3195 if (ValVT == MVT::v1i1)
3196 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3197
3198 if (ValVT == MVT::v64i1) {
3199 // On 32 bit targets this case is handled by getv64i1Argument.
3200 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3201 // On 64 bit targets there is no need to truncate the value, only bitcast it.
3202 } else {
3203 MVT maskLen;
3204 switch (ValVT.getSimpleVT().SimpleTy) {
3205 case MVT::v8i1:
3206 maskLen = MVT::i8;
3207 break;
3208 case MVT::v16i1:
3209 maskLen = MVT::i16;
3210 break;
3211 case MVT::v32i1:
3212 maskLen = MVT::i32;
3213 break;
3214 default:
3215 llvm_unreachable("Expecting a vector of i1 types")__builtin_unreachable();
3216 }
3217
3218 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3219 }
3220 return DAG.getBitcast(ValVT, ValReturned);
3221}
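// Example: a v16i1 value living in an i32 location is truncated to i16 and
// bitcast back to v16i1; a v64i1 value in an i64 location is only bitcast.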
3222
3223/// Lower the result values of a call into the
3224/// appropriate copies out of appropriate physical registers.
3225///
3226SDValue X86TargetLowering::LowerCallResult(
3227 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3228 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3229 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3230 uint32_t *RegMask) const {
3231
3232 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3233 // Assign locations to each value returned by this call.
3234 SmallVector<CCValAssign, 16> RVLocs;
3235 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3236 *DAG.getContext());
3237 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3238
3239 // Copy all of the result registers out of their specified physreg.
3240 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3241 ++I, ++InsIndex) {
3242 CCValAssign &VA = RVLocs[I];
3243 EVT CopyVT = VA.getLocVT();
3244
3245 // In some calling conventions we need to remove the used registers
3246 // from the register mask.
3247 if (RegMask) {
3248 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3249 SubRegs.isValid(); ++SubRegs)
3250 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3251 }
3252
3253 // Report an error if there was an attempt to return FP values via XMM
3254 // registers.
3255 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3256 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3257 if (VA.getLocReg() == X86::XMM1)
3258 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3259 else
3260 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3261 } else if (!Subtarget.hasSSE2() &&
3262 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3263 CopyVT == MVT::f64) {
3264 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3265 if (VA.getLocReg() == X86::XMM1)
3266 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3267 else
3268 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3269 }
3270
3271 // If we prefer to use the value in xmm registers, copy it out as f80 and
3272 // use a truncate to move it from fp stack reg to xmm reg.
3273 bool RoundAfterCopy = false;
3274 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3275 isScalarFPTypeInSSEReg(VA.getValVT())) {
3276 if (!Subtarget.hasX87())
3277 report_fatal_error("X87 register return with X87 disabled");
3278 CopyVT = MVT::f80;
3279 RoundAfterCopy = (CopyVT != VA.getLocVT());
3280 }
3281
3282 SDValue Val;
3283 if (VA.needsCustom()) {
3284 assert(VA.getValVT() == MVT::v64i1 &&
3285 "Currently the only custom case is when we split v64i1 to 2 regs");
3286 Val =
3287 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3288 } else {
3289 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3290 .getValue(1);
3291 Val = Chain.getValue(0);
3292 InFlag = Chain.getValue(2);
3293 }
3294
3295 if (RoundAfterCopy)
3296 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3297 // This truncation won't change the value.
3298 DAG.getIntPtrConstant(1, dl));
3299
3300 if (VA.isExtInLoc()) {
3301 if (VA.getValVT().isVector() &&
3302 VA.getValVT().getScalarType() == MVT::i1 &&
3303 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3304 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3305 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3306 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3307 } else
3308 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3309 }
3310
3311 if (VA.getLocInfo() == CCValAssign::BCvt)
3312 Val = DAG.getBitcast(VA.getValVT(), Val);
3313
3314 InVals.push_back(Val);
3315 }
3316
3317 return Chain;
3318}
3319
3320//===----------------------------------------------------------------------===//
3321// C & StdCall & Fast Calling Convention implementation
3322//===----------------------------------------------------------------------===//
3323// The StdCall calling convention is the standard for many Windows API
3324// routines. It differs from the C calling convention just a little:
3325// the callee cleans up the stack, not the caller, and symbols are
3326// decorated in some fancy way :) It doesn't support any vector arguments.
3327// For info on fast calling convention see Fast Calling Convention (tail call)
3328// implementation LowerX86_32FastCCCallTo.
3329
3330/// CallIsStructReturn - Determines whether a call uses struct return
3331/// semantics.
3332enum StructReturnType {
3333 NotStructReturn,
3334 RegStructReturn,
3335 StackStructReturn
3336};
3337static StructReturnType
3338callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3339 if (Outs.empty())
3340 return NotStructReturn;
3341
3342 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3343 if (!Flags.isSRet())
3344 return NotStructReturn;
3345 if (Flags.isInReg() || IsMCU)
3346 return RegStructReturn;
3347 return StackStructReturn;
3348}
3349
3350/// Determines whether a function uses struct return semantics.
3351static StructReturnType
3352argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3353 if (Ins.empty())
3354 return NotStructReturn;
3355
3356 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3357 if (!Flags.isSRet())
3358 return NotStructReturn;
3359 if (Flags.isInReg() || IsMCU)
3360 return RegStructReturn;
3361 return StackStructReturn;
3362}
3363
3364/// Make a copy of an aggregate at the address specified by "Src" to the
3365/// address "Dst", with the size and alignment information specified by the
3366/// byval parameter attribute. The copy will be passed as a byval function parameter.
3367static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3368 SDValue Chain, ISD::ArgFlagsTy Flags,
3369 SelectionDAG &DAG, const SDLoc &dl) {
3370 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3371
3372 return DAG.getMemcpy(
3373 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3374 /*isVolatile*/ false, /*AlwaysInline=*/true,
3375 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3376}
3377
3378/// Return true if the calling convention is one that we can guarantee TCO for.
3379static bool canGuaranteeTCO(CallingConv::ID CC) {
3380 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3381 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3382 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3383 CC == CallingConv::SwiftTail);
3384}
3385
3386/// Return true if we might ever do TCO for calls with this calling convention.
3387static bool mayTailCallThisCC(CallingConv::ID CC) {
3388 switch (CC) {
3389 // C calling conventions:
3390 case CallingConv::C:
3391 case CallingConv::Win64:
3392 case CallingConv::X86_64_SysV:
3393 // Callee pop conventions:
3394 case CallingConv::X86_ThisCall:
3395 case CallingConv::X86_StdCall:
3396 case CallingConv::X86_VectorCall:
3397 case CallingConv::X86_FastCall:
3398 // Swift:
3399 case CallingConv::Swift:
3400 return true;
3401 default:
3402 return canGuaranteeTCO(CC);
3403 }
3404}
3405
3406/// Return true if the function is being made into a tailcall target by
3407/// changing its ABI.
3408static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3409 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3410 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3411}
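// Example: with -tailcallopt (GuaranteedTailCallOpt) a fastcc call gets the
// guaranteed-TCO ABI; tailcc and swifttailcc calls get it unconditionally.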
3412
3413bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3414 if (!CI->isTailCall())
3415 return false;
3416
3417 CallingConv::ID CalleeCC = CI->getCallingConv();
3418 if (!mayTailCallThisCC(CalleeCC))
3419 return false;
3420
3421 return true;
3422}
3423
3424SDValue
3425X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3426 const SmallVectorImpl<ISD::InputArg> &Ins,
3427 const SDLoc &dl, SelectionDAG &DAG,
3428 const CCValAssign &VA,
3429 MachineFrameInfo &MFI, unsigned i) const {
3430 // Create the nodes corresponding to a load from this parameter slot.
3431 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3432 bool AlwaysUseMutable = shouldGuaranteeTCO(
3433 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3434 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3435 EVT ValVT;
3436 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3437
3438 // If the value is passed by pointer, we have its address instead of the value
3439 // itself. No need to extend if the mask value and its location share the same
3440 // absolute size.
3441 bool ExtendedInMem =
3442 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3443 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3444
3445 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3446 ValVT = VA.getLocVT();
3447 else
3448 ValVT = VA.getValVT();
3449
3450 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3451 // changed with more analysis.
3452 // In case of tail call optimization, mark all arguments mutable, since they
3453 // could be overwritten by the lowering of arguments in case of a tail call.
3454 if (Flags.isByVal()) {
3455 unsigned Bytes = Flags.getByValSize();
3456 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3457
3458 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3459 // can be improved with deeper analysis.
3460 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3461 /*isAliased=*/true);
3462 return DAG.getFrameIndex(FI, PtrVT);
3463 }
3464
3465 EVT ArgVT = Ins[i].ArgVT;
3466
3467 // If this is a vector that has been split into multiple parts, and the
3468 // scalar size of the parts doesn't match the vector element size, then we can't
3469 // elide the copy. The parts will have padding between them instead of being
3470 // packed like a vector.
3471 bool ScalarizedAndExtendedVector =
3472 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3473 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3474
3475 // This is an argument in memory. We might be able to perform copy elision.
3476 // If the argument is passed directly in memory without any extension, then we
3477 // can perform copy elision. Large vector types, for example, may be passed
3478 // indirectly by pointer.
3479 if (Flags.isCopyElisionCandidate() &&
3480 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3481 !ScalarizedAndExtendedVector) {
3482 SDValue PartAddr;
3483 if (Ins[i].PartOffset == 0) {
3484 // If this is a one-part value or the first part of a multi-part value,
3485 // create a stack object for the entire argument value type and return a
3486 // load from our portion of it. This assumes that if the first part of an
3487 // argument is in memory, the rest will also be in memory.
3488 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3489 /*IsImmutable=*/false);
3490 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3491 return DAG.getLoad(
3492 ValVT, dl, Chain, PartAddr,
3493 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3494 } else {
3495 // This is not the first piece of an argument in memory. See if there is
3496 // already a fixed stack object including this offset. If so, assume it
3497 // was created by the PartOffset == 0 branch above and create a load from
3498 // the appropriate offset into it.
3499 int64_t PartBegin = VA.getLocMemOffset();
3500 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3501 int FI = MFI.getObjectIndexBegin();
3502 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3503 int64_t ObjBegin = MFI.getObjectOffset(FI);
3504 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3505 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3506 break;
3507 }
3508 if (MFI.isFixedObjectIndex(FI)) {
3509 SDValue Addr =
3510 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3511 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3512 return DAG.getLoad(
3513 ValVT, dl, Chain, Addr,
3514 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3515 Ins[i].PartOffset));
3516 }
3517 }
3518 }
3519
3520 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3521 VA.getLocMemOffset(), isImmutable);
3522
3523 // Set SExt or ZExt flag.
3524 if (VA.getLocInfo() == CCValAssign::ZExt) {
3525 MFI.setObjectZExt(FI, true);
3526 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3527 MFI.setObjectSExt(FI, true);
3528 }
3529
3530 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3531 SDValue Val = DAG.getLoad(
3532 ValVT, dl, Chain, FIN,
3533 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3534 return ExtendedInMem
3535 ? (VA.getValVT().isVector()
3536 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3537 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3538 : Val;
3539}
3540
3541// FIXME: Get this from tablegen.
3542static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3543 const X86Subtarget &Subtarget) {
3544 assert(Subtarget.is64Bit());
3545
3546 if (Subtarget.isCallingConvWin64(CallConv)) {
3547 static const MCPhysReg GPR64ArgRegsWin64[] = {
3548 X86::RCX, X86::RDX, X86::R8, X86::R9
3549 };
3550 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3551 }
3552
3553 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3554 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3555 };
3556 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3557}
3558
3559// FIXME: Get this from tablegen.
3560static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3561 CallingConv::ID CallConv,
3562 const X86Subtarget &Subtarget) {
3563 assert(Subtarget.is64Bit());
3564 if (Subtarget.isCallingConvWin64(CallConv)) {
3565 // The XMM registers which might contain var arg parameters are shadowed
3566 // in their paired GPRs. So we only need to save the GPRs to their home
3567 // slots.
3568 // TODO: __vectorcall will change this.
3569 return None;
3570 }
3571
3572 bool isSoftFloat = Subtarget.useSoftFloat();
3573 if (isSoftFloat || !Subtarget.hasSSE1())
3574 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3575 // registers.
3576 return None;
3577
3578 static const MCPhysReg XMMArgRegs64Bit[] = {
3579 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3580 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3581 };
3582 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3583}
3584
3585#ifndef NDEBUG
3586static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3587 return llvm::is_sorted(
3588 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3589 return A.getValNo() < B.getValNo();
3590 });
3591}
3592#endif
3593
3594namespace {
3595/// This is a helper class for lowering variable argument parameters.
3596class VarArgsLoweringHelper {
3597public:
3598 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3599 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3600 CallingConv::ID CallConv, CCState &CCInfo)
3601 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3602 TheMachineFunction(DAG.getMachineFunction()),
3603 TheFunction(TheMachineFunction.getFunction()),
3604 FrameInfo(TheMachineFunction.getFrameInfo()),
3605 FrameLowering(*Subtarget.getFrameLowering()),
3606 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3607 CCInfo(CCInfo) {}
3608
3609 // Lower variable argument parameters.
3610 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3611
3612private:
3613 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3614
3615 void forwardMustTailParameters(SDValue &Chain);
3616
3617 bool is64Bit() const { return Subtarget.is64Bit(); }
3618 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3619
3620 X86MachineFunctionInfo *FuncInfo;
3621 const SDLoc &DL;
3622 SelectionDAG &DAG;
3623 const X86Subtarget &Subtarget;
3624 MachineFunction &TheMachineFunction;
3625 const Function &TheFunction;
3626 MachineFrameInfo &FrameInfo;
3627 const TargetFrameLowering &FrameLowering;
3628 const TargetLowering &TargLowering;
3629 CallingConv::ID CallConv;
3630 CCState &CCInfo;
3631};
3632} // namespace
3633
3634void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3635 SDValue &Chain, unsigned StackSize) {
3636 // If the function takes a variable number of arguments, make a frame index
3637 // for the start of the first vararg value... for expansion of llvm.va_start.
3638 // We can skip this if there are no va_start calls.
3639 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3640 CallConv != CallingConv::X86_ThisCall)) {
3641 FuncInfo->setVarArgsFrameIndex(
3642 FrameInfo.CreateFixedObject(1, StackSize, true));
3643 }
3644
3645 // 64-bit calling conventions support varargs and register parameters, so we
3646 // have to do extra work to spill them in the prologue.
3647 if (is64Bit()) {
3648 // Find the first unallocated argument registers.
3649 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3650 ArrayRef<MCPhysReg> ArgXMMs =
3651 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3652 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3653 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3654
3655 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3656 "SSE register cannot be used when SSE is disabled!");
3657
3658 if (isWin64()) {
3659 // Get to the caller-allocated home save location. Add 8 to account
3660 // for the return address.
3661 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3662 FuncInfo->setRegSaveFrameIndex(
3663 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3664 // Fixup to set vararg frame on shadow area (4 x i64).
3665 if (NumIntRegs < 4)
3666 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3667 } else {
3668 // For X86-64, if there are vararg parameters that are passed via
3669 // registers, then we must store them to their spots on the stack so
3670 // they may be loaded by dereferencing the result of va_next.
3671 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3672 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3673 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3674 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3675 }
3676
3677 SmallVector<SDValue, 6>
3678 LiveGPRs; // list of SDValue for GPR registers keeping live input value
3679 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
3680 // keeping live input value
3681 SDValue ALVal; // if applicable keeps SDValue for %al register
3682
3683 // Gather all the live in physical registers.
3684 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3685 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3686 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3687 }
3688 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3689 if (!AvailableXmms.empty()) {
3690 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3691 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3692 for (MCPhysReg Reg : AvailableXmms) {
3693 // The fast register allocator spills virtual registers at basic
3694 // block boundaries. That leads to uses of xmm registers
3695 // outside of the check for %al. Pass physical registers to
3696 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3697 TheMachineFunction.getRegInfo().addLiveIn(Reg);
3698 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3699 }
3700 }
3701
3702 // Store the integer parameter registers.
3703 SmallVector<SDValue, 8> MemOps;
3704 SDValue RSFIN =
3705 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3706 TargLowering.getPointerTy(DAG.getDataLayout()));
3707 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3708 for (SDValue Val : LiveGPRs) {
3709 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3710 TargLowering.getPointerTy(DAG.getDataLayout()),
3711 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3712 SDValue Store =
3713 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3714 MachinePointerInfo::getFixedStack(
3715 DAG.getMachineFunction(),
3716 FuncInfo->getRegSaveFrameIndex(), Offset));
3717 MemOps.push_back(Store);
3718 Offset += 8;
3719 }
3720
3721 // Now store the XMM (fp + vector) parameter registers.
3722 if (!LiveXMMRegs.empty()) {
3723 SmallVector<SDValue, 12> SaveXMMOps;
3724 SaveXMMOps.push_back(Chain);
3725 SaveXMMOps.push_back(ALVal);
3726 SaveXMMOps.push_back(
3727 DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3728 SaveXMMOps.push_back(
3729 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3730 llvm::append_range(SaveXMMOps, LiveXMMRegs);
3731 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3732 MVT::Other, SaveXMMOps));
3733 }
3734
3735 if (!MemOps.empty())
3736 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3737 }
3738}
3739
3740void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3741 // Find the largest legal vector type.
3742 MVT VecVT = MVT::Other;
3743 // FIXME: Only some x86_32 calling conventions support AVX512.
3744 if (Subtarget.useAVX512Regs() &&
3745 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3746 CallConv == CallingConv::Intel_OCL_BI)))
3747 VecVT = MVT::v16f32;
3748 else if (Subtarget.hasAVX())
3749 VecVT = MVT::v8f32;
3750 else if (Subtarget.hasSSE2())
3751 VecVT = MVT::v4f32;
3752
3753 // We forward some GPRs and some vector types.
3754 SmallVector<MVT, 2> RegParmTypes;
3755 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3756 RegParmTypes.push_back(IntVT);
3757 if (VecVT != MVT::Other)
3758 RegParmTypes.push_back(VecVT);
3759
3760 // Compute the set of forwarded registers. The rest are scratch.
3761 SmallVectorImpl<ForwardedRegister> &Forwards =
3762 FuncInfo->getForwardedMustTailRegParms();
3763 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3764
3765 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3766 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3767 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3768 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3769 }
3770
3771 // Copy all forwards from physical to virtual registers.
3772 for (ForwardedRegister &FR : Forwards) {
3773 // FIXME: Can we use a less constrained schedule?
3774 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3775 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3776 TargLowering.getRegClassFor(FR.VT));
3777 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3778 }
3779}
3780
3781void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3782 unsigned StackSize) {
3783 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3784 // If necessary, it will be set to the correct value later.
3785 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3786 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3787
3788 if (FrameInfo.hasVAStart())
3789 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3790
3791 if (FrameInfo.hasMustTailInVarArgFunc())
3792 forwardMustTailParameters(Chain);
3793}
3794
3795SDValue X86TargetLowering::LowerFormalArguments(
3796 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3797 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3798 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3799 MachineFunction &MF = DAG.getMachineFunction();
3800 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3801
3802 const Function &F = MF.getFunction();
3803 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3804 F.getName() == "main")
3805 FuncInfo->setForceFramePointer(true);
3806
3807 MachineFrameInfo &MFI = MF.getFrameInfo();
3808 bool Is64Bit = Subtarget.is64Bit();
3809 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3810
3811 assert(
3812 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3813 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3814
3815 // Assign locations to all of the incoming arguments.
3816 SmallVector<CCValAssign, 16> ArgLocs;
3817 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3818
3819 // Allocate shadow area for Win64.
3820 if (IsWin64)
3821 CCInfo.AllocateStack(32, Align(8));
3822
3823 CCInfo.AnalyzeArguments(Ins, CC_X86);
3824
3825 // In the vectorcall calling convention a second pass is required for the
3826 // HVA types.
3827 if (CallingConv::X86_VectorCall == CallConv) {
3828 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3829 }
3830
3831 // The next loop assumes that the locations are in the same order as the
3832 // input arguments.
3833 assert(isSortedByValueNo(ArgLocs) &&
3834 "Argument Location list must be sorted before lowering");
3835
3836 SDValue ArgValue;
3837 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3838 ++I, ++InsIndex) {
3839 assert(InsIndex < Ins.size() && "Invalid Ins index");
3840 CCValAssign &VA = ArgLocs[I];
3841
3842 if (VA.isRegLoc()) {
3843 EVT RegVT = VA.getLocVT();
3844 if (VA.needsCustom()) {
3845 assert(
3846 VA.getValVT() == MVT::v64i1 &&
3847 "Currently the only custom case is when we split v64i1 to 2 regs");
3848
3849 // v64i1 values, in regcall calling convention, that are
3850 // compiled to 32 bit arch, are split up into two registers.
3851 ArgValue =
3852 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3853 } else {
3854 const TargetRegisterClass *RC;
3855 if (RegVT == MVT::i8)
3856 RC = &X86::GR8RegClass;
3857 else if (RegVT == MVT::i16)
3858 RC = &X86::GR16RegClass;
3859 else if (RegVT == MVT::i32)
3860 RC = &X86::GR32RegClass;
3861 else if (Is64Bit && RegVT == MVT::i64)
3862 RC = &X86::GR64RegClass;
3863 else if (RegVT == MVT::f16)
3864 RC = &X86::FR16XRegClass;
3865 else if (RegVT == MVT::f32)
3866 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3867 else if (RegVT == MVT::f64)
3868 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3869 else if (RegVT == MVT::f80)
3870 RC = &X86::RFP80RegClass;
3871 else if (RegVT == MVT::f128)
3872 RC = &X86::VR128RegClass;
3873 else if (RegVT.is512BitVector())
3874 RC = &X86::VR512RegClass;
3875 else if (RegVT.is256BitVector())
3876 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3877 else if (RegVT.is128BitVector())
3878 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3879 else if (RegVT == MVT::x86mmx)
3880 RC = &X86::VR64RegClass;
3881 else if (RegVT == MVT::v1i1)
3882 RC = &X86::VK1RegClass;
3883 else if (RegVT == MVT::v8i1)
3884 RC = &X86::VK8RegClass;
3885 else if (RegVT == MVT::v16i1)
3886 RC = &X86::VK16RegClass;
3887 else if (RegVT == MVT::v32i1)
3888 RC = &X86::VK32RegClass;
3889 else if (RegVT == MVT::v64i1)
3890 RC = &X86::VK64RegClass;
3891 else
3892 llvm_unreachable("Unknown argument type!");
3893
3894 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3895 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3896 }
3897
3898 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3899 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3900 // right size.
3901 if (VA.getLocInfo() == CCValAssign::SExt)
3902 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3903 DAG.getValueType(VA.getValVT()));
3904 else if (VA.getLocInfo() == CCValAssign::ZExt)
3905 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3906 DAG.getValueType(VA.getValVT()));
3907 else if (VA.getLocInfo() == CCValAssign::BCvt)
3908 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3909
3910 if (VA.isExtInLoc()) {
3911 // Handle MMX values passed in XMM regs.
3912 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3913 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3914 else if (VA.getValVT().isVector() &&
3915 VA.getValVT().getScalarType() == MVT::i1 &&
3916 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3917 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3918 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3919 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3920 } else
3921 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3922 }
3923 } else {
3924 assert(VA.isMemLoc());
3925 ArgValue =
3926 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3927 }
3928
3929 // If the value is passed via a pointer, do a load.
3930 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3931 ArgValue =
3932 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3933
3934 InVals.push_back(ArgValue);
3935 }
3936
3937 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3938 if (Ins[I].Flags.isSwiftAsync()) {
3939 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3940 if (Subtarget.is64Bit())
3941 X86FI->setHasSwiftAsyncContext(true);
3942 else {
3943 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3944 X86FI->setSwiftAsyncContextFrameIdx(FI);
3945 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3946 DAG.getFrameIndex(FI, MVT::i32),
3947 MachinePointerInfo::getFixedStack(MF, FI));
3948 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3949 }
3950 }
3951
3952 // Swift calling convention does not require we copy the sret argument
3953 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3954 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3955 continue;
3956
3957 // All x86 ABIs require that for returning structs by value we copy the
3958 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3959 // the argument into a virtual register so that we can access it from the
3960 // return points.
3961 if (Ins[I].Flags.isSRet()) {
3962 assert(!FuncInfo->getSRetReturnReg() &&
3963 "SRet return has already been set");
3964 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3965 Register Reg =
3966 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3967 FuncInfo->setSRetReturnReg(Reg);
3968 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3969 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3970 break;
3971 }
3972 }
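As a source-level illustration of the sret rule in the comment above (a sketch, not part of this file): a function that returns a large struct by value receives a hidden pointer to the return slot, and on x86 ABIs other than Swift it must hand that same pointer back in %rax/%eax, which is what the virtual register saved via setSRetReturnReg enables.

struct Big { long x[8]; };

// Lowered roughly as: void makeBig(Big *sret_ptr), with sret_ptr also
// returned in %rax (64-bit) or %eax (32-bit) at the return points.
Big makeBig() {
  Big B{};
  B.x[0] = 1;
  return B;
}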
3973
3974 unsigned StackSize = CCInfo.getNextStackOffset();
3975 // Align stack specially for tail calls.
3976 if (shouldGuaranteeTCO(CallConv,
3977 MF.getTarget().Options.GuaranteedTailCallOpt))
3978 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3979
3980 if (IsVarArg)
3981 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3982 .lowerVarArgsParameters(Chain, StackSize);
3983
3984 // Some CCs need callee pop.
3985 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3986 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3987 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3988 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3989 // X86 interrupts must pop the error code (and the alignment padding) if
3990 // present.
3991 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3992 } else {
3993 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3994 // If this is an sret function, the return should pop the hidden pointer.
3995 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3996 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3997 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3998 FuncInfo->setBytesToPopOnReturn(4);
3999 }
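The if/else chain above reduces to a small decision table. The following standalone sketch restates it with the X86-specific predicates abstracted into booleans; the function and parameter names are illustrative, not LLVM API.

// Returns the number of argument bytes the callee pops on return.
unsigned bytesToPopOnReturnSketch(bool CalleePopsEverything, unsigned StackSize,
                                  bool IsX86InterruptWithErrorCode, bool Is64Bit,
                                  bool Is32BitStackStructReturnNonMSVC) {
  if (CalleePopsEverything)
    return StackSize;                  // e.g. stdcall-style conventions
  if (IsX86InterruptWithErrorCode)
    return Is64Bit ? 16 : 4;           // error code plus alignment padding
  if (Is32BitStackStructReturnNonMSVC)
    return 4;                          // callee pops the hidden sret pointer
  return 0;                            // caller cleans up
}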
4000
4001 if (!Is64Bit) {
4002 // RegSaveFrameIndex is X86-64 only.
4003 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4004 }
4005
4006 FuncInfo->setArgumentStackSize(StackSize);
4007
4008 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4009 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4010 if (Personality == EHPersonality::CoreCLR) {
4011 assert(Is64Bit);
4012 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4013 // that we'd prefer this slot be allocated towards the bottom of the frame
4014 // (i.e. near the stack pointer after allocating the frame). Every
4015 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4016 // offset from the bottom of this and each funclet's frame must be the
4017 // same, so the size of funclets' (mostly empty) frames is dictated by
4018 // how far this slot is from the bottom (since they allocate just enough
4019 // space to accommodate holding this slot at the correct offset).
4020 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4021 EHInfo->PSPSymFrameIdx = PSPSymFI;
4022 }
4023 }
4024
4025 if (CallConv == CallingConv::X86_RegCall ||
4026 F.hasFnAttribute("no_caller_saved_registers")) {
4027 MachineRegisterInfo &MRI = MF.getRegInfo();
4028 for (std::pair<Register, Register> Pair : MRI.liveins())
4029 MRI.disableCalleeSavedRegister(Pair.first);
4030 }
4031
4032 return Chain;
4033}
4034
4035SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4036 SDValue Arg, const SDLoc &dl,
4037 SelectionDAG &DAG,
4038 const CCValAssign &VA,
4039 ISD::ArgFlagsTy Flags,
4040 bool isByVal) const {
4041 unsigned LocMemOffset = VA.getLocMemOffset();
4042 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4043 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4044 StackPtr, PtrOff);
4045 if (isByVal)
4046 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4047
4048 return DAG.getStore(
4049 Chain, dl, Arg, PtrOff,
4050 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
4051}
4052
4053/// Emit a load of return address if tail call
4054/// optimization is performed and it is required.
4055SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4056 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4057 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4058 // Adjust the Return address stack slot.
4059 EVT VT = getPointerTy(DAG.getDataLayout());
4060 OutRetAddr = getReturnAddressFrameIndex(DAG);
4061
4062 // Load the "old" Return address.
4063 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4064 return SDValue(OutRetAddr.getNode(), 1);
4065}
4066
4067/// Emit a store of the return address if tail call
4068/// optimization is performed and it is required (FPDiff!=0).
4069static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4070 SDValue Chain, SDValue RetAddrFrIdx,
4071 EVT PtrVT, unsigned SlotSize,
4072 int FPDiff, const SDLoc &dl) {
4073 // Store the return address to the appropriate stack slot.
4074 if (!FPDiff) return Chain;
4075 // Calculate the new stack slot for the return address.
4076 int NewReturnAddrFI =
4077 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4078 false);
4079 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4080 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4081 MachinePointerInfo::getFixedStack(
4082 DAG.getMachineFunction(), NewReturnAddrFI));
4083 return Chain;
4084}
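A worked example of the fixed-object offset computed above, assuming a 64-bit target where SlotSize is 8: with FPDiff = -16 (the tail callee needs 16 more bytes of arguments than the caller was entered with), the return address is re-stored at offset FPDiff - SlotSize = -24. The helper below is only a sketch of that arithmetic, not LLVM API.

#include <cstdint>

int64_t newRetAddrOffset(int FPDiff, unsigned SlotSize) {
  return static_cast<int64_t>(FPDiff) - static_cast<int64_t>(SlotSize);
}

int main() { return newRetAddrOffset(-16, 8) == -24 ? 0 : 1; }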
4085
4086 /// Returns a vector_shuffle mask for a movs{s|d} or movd
4087 /// operation of the specified width.
4088static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4089 SDValue V2) {
4090 unsigned NumElems = VT.getVectorNumElements();
4091 SmallVector<int, 8> Mask;
4092 Mask.push_back(NumElems);
4093 for (unsigned i = 1; i != NumElems; ++i)
4094 Mask.push_back(i);
4095 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4096}
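The mask built by getMOVL can be reproduced with plain containers (a minimal sketch, not LLVM API): element 0 is taken from V2, the remaining lanes keep V1, so for a 4-element type the mask is {4, 1, 2, 3}.

#include <vector>

std::vector<int> movlMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(static_cast<int>(NumElems)); // element 0 comes from V2
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(static_cast<int>(i));      // remaining elements stay in V1
  return Mask;
}
// movlMask(4) == {4, 1, 2, 3}; movlMask(2) == {2, 1}.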
4097
4098SDValue
4099X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4100 SmallVectorImpl<SDValue> &InVals) const {
4101 SelectionDAG &DAG = CLI.DAG;
4102 SDLoc &dl = CLI.DL;
4103 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4104 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4105 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4106 SDValue Chain = CLI.Chain;
4107 SDValue Callee = CLI.Callee;
4108 CallingConv::ID CallConv = CLI.CallConv;
4109 bool &isTailCall = CLI.IsTailCall;
4110 bool isVarArg = CLI.IsVarArg;
4111 const auto *CB = CLI.CB;
4112
4113 MachineFunction &MF = DAG.getMachineFunction();
4114 bool Is64Bit = Subtarget.is64Bit();
4115 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4116 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
4117 bool IsSibcall = false;
4118 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4119 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4120 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4121 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4122 CB->hasFnAttr("no_caller_saved_registers"));
4123 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4124 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4125 const Module *M = MF.getMMI().getModule();
4126 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4127
4128 MachineFunction::CallSiteInfo CSInfo;
4129 if (CallConv == CallingConv::X86_INTR)
4130 report_fatal_error("X86 interrupts may not be called directly");
4131
4132 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4133 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4134 // If we are using a GOT, disable tail calls to external symbols with
4135 // default visibility. Tail calling such a symbol requires using a GOT
4136 // relocation, which forces early binding of the symbol. This breaks code
4137 // that requires lazy function symbol resolution. Using musttail or
4138 // GuaranteedTailCallOpt will override this.
4139 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4140 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4141 G->getGlobal()->hasDefaultVisibility()))
4142 isTailCall = false;
4143 }
4144
4145
4146 if (isTailCall && !IsMustTail) {
4147 // Check if it's really possible to do a tail call.
4148 isTailCall = IsEligibleForTailCallOptimization(
4149 Callee, CallConv, SR == StackStructReturn, isVarArg, CLI.RetTy, Outs,
4150 OutVals, Ins, DAG);
4151
4152 // Sibcalls are automatically detected tailcalls which do not require
4153 // ABI changes.
4154 if (!IsGuaranteeTCO && isTailCall)
4155 IsSibcall = true;
4156
4157 if (isTailCall)
4158 ++NumTailCalls;
4159 }
4160
4161 if (IsMustTail && !isTailCall)
4162 report_fatal_error("failed to perform tail call elimination on a call "
4163 "site marked musttail");
4164
4165 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4166 "Var args not supported with calling convention fastcc, ghc or hipe");
4167
4168 // Analyze operands of the call, assigning locations to each operand.
4169 SmallVector<CCValAssign, 16> ArgLocs;
4170 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4171
4172 // Allocate shadow area for Win64.
4173 if (IsWin64)
4174 CCInfo.AllocateStack(32, Align(8));
4175
4176 CCInfo.AnalyzeArguments(Outs, CC_X86);
4177
4178 // In vectorcall calling convention a second pass is required for the HVA
4179 // types.
4180 if (CallingConv::X86_VectorCall == CallConv) {
4181 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4182 }
4183
4184 // Get a count of how many bytes are to be pushed on the stack.
4185 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4186 if (IsSibcall)
4187 // This is a sibcall. The memory operands are already available in the
4188 // caller's incoming argument area, i.e. in the caller's caller's stack.
4189 NumBytes = 0;
4190 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4191 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4192
4193 int FPDiff = 0;
4194 if (isTailCall &&
4195 shouldGuaranteeTCO(CallConv,
4196 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4197 // Lower arguments at fp - stackoffset + fpdiff.
4198 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4199
4200 FPDiff = NumBytesCallerPushed - NumBytes;
4201
4202 // Set the delta of movement of the returnaddr stackslot.
4203 // But only set if delta is greater than previous delta.
4204 if (FPDiff < X86Info->getTCReturnAddrDelta())
4205 X86Info->setTCReturnAddrDelta(FPDiff);
4206 }
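Concrete numbers for the delta above (a sketch; the state struct is illustrative, not LLVM's X86MachineFunctionInfo): if the caller was entered with 16 bytes of its own stack arguments and the tail callee needs 32, then FPDiff = 16 - 32 = -16, and the most negative delta seen so far is what gets recorded.

#include <algorithm>

struct TailCallStateSketch { int TCReturnAddrDelta = 0; };

int computeFPDiff(unsigned NumBytesCallerPushed, unsigned NumBytes,
                  TailCallStateSketch &S) {
  int FPDiff = static_cast<int>(NumBytesCallerPushed) -
               static_cast<int>(NumBytes);
  // Keep the most negative (largest-movement) value, mirroring the
  // "only set if delta is greater than previous delta" check above.
  S.TCReturnAddrDelta = std::min(S.TCReturnAddrDelta, FPDiff);
  return FPDiff;
}
// computeFPDiff(16, 32, S) returns -16 and leaves S.TCReturnAddrDelta at -16.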
4207
4208 unsigned NumBytesToPush = NumBytes;
4209 unsigned NumBytesToPop = NumBytes;
4210
4211 // If we have an inalloca argument, all stack space has already been allocated
4212 // for us and is right at the top of the stack. We don't support multiple
4213 // arguments passed in memory when using inalloca.
4214 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4215 NumBytesToPush = 0;
4216 if (!ArgLocs.back().isMemLoc())
4217 report_fatal_error("cannot use inalloca attribute on a register "
4218 "parameter");
4219 if (ArgLocs.back().getLocMemOffset() != 0)
4220 report_fatal_error("any parameter with the inalloca attribute must be "
4221 "the only memory argument");
4222 } else if (CLI.IsPreallocated) {
4223 assert(ArgLocs.back().isMemLoc() &&
4224 "cannot use preallocated attribute on a register "
4225 "parameter");
4226 SmallVector<size_t, 4> PreallocatedOffsets;
4227 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4228 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4229 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4230 }
4231 }
4232 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4233 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4234 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4235 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4236 NumBytesToPush = 0;
4237 }
4238
4239 if (!IsSibcall && !IsMustTail)
4240 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4241 NumBytes - NumBytesToPush, dl);
4242
4243 SDValue RetAddrFrIdx;
4244 // Load return address for tail calls.
4245 if (isTailCall && FPDiff)
4246 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4247 Is64Bit, FPDiff, dl);
4248
4249 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4250 SmallVector<SDValue, 8> MemOpChains;
4251 SDValue StackPtr;
4252
4253 // The next loop assumes that the locations are in the same order of the
4254 // input arguments.
4255 assert(isSortedByValueNo(ArgLocs) &&
4256 "Argument Location list must be sorted before lowering");
4257
4258 // Walk the register/memloc assignments, inserting copies/loads. In the case
4259 // of tail call optimization, arguments are handled later.
4260 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4261 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4262 ++I, ++OutIndex) {
4263 assert(OutIndex < Outs.size() && "Invalid Out index");
4264 // Skip inalloca/preallocated arguments, they have already been written.
4265 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4266 if (Flags.isInAlloca() || Flags.isPreallocated())
4267 continue;
4268
4269 CCValAssign &VA = ArgLocs[I];
4270 EVT RegVT = VA.getLocVT();
4271 SDValue Arg = OutVals[OutIndex];
4272 bool isByVal = Flags.isByVal();
4273
4274 // Promote the value if needed.
4275 switch (VA.getLocInfo()) {
4276 default: llvm_unreachable("Unknown loc info!");
4277 case CCValAssign::Full: break;
4278 case CCValAssign::SExt:
4279 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4280 break;
4281 case CCValAssign::ZExt:
4282 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4283 break;
4284 case CCValAssign::AExt:
4285 if (Arg.getValueType().isVector() &&
4286 Arg.getValueType().getVectorElementType() == MVT::i1)
4287 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4288 else if (RegVT.is128BitVector()) {
4289 // Special case: passing MMX values in XMM registers.
4290 Arg = DAG.getBitcast(MVT::i64, Arg);
4291 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4292 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4293 } else
4294 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4295 break;
4296 case CCValAssign::BCvt:
4297 Arg = DAG.getBitcast(RegVT, Arg);
4298 break;
4299 case CCValAssign::Indirect: {
4300 if (isByVal) {
4301 // Memcpy the argument to a temporary stack slot to prevent
4302 // the caller from seeing any modifications the callee may make
4303 // as guaranteed by the `byval` attribute.
4304 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4305 Flags.getByValSize(),
4306 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4307 SDValue StackSlot =
4308 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4309 Chain =
4310 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4311 // From now on treat this as a regular pointer
4312 Arg = StackSlot;
4313 isByVal = false;
4314 } else {
4315 // Store the argument.
4316 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4317 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4318 Chain = DAG.getStore(
4319 Chain, dl, Arg, SpillSlot,
4320 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4321 Arg = SpillSlot;
4322 }
4323 break;
4324 }
4325 }
4326
4327 if (VA.needsCustom()) {
4328 assert(VA.getValVT() == MVT::v64i1 &&
4329 "Currently the only custom case is when we split v64i1 to 2 regs");
4330 // Split v64i1 value into two registers
4331 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4332 } else if (VA.isRegLoc()) {
4333 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4334 const TargetOptions &Options = DAG.getTarget().Options;
4335 if (Options.EmitCallSiteInfo)
4336 CSInfo.emplace_back(VA.getLocReg(), I);
4337 if (isVarArg && IsWin64) {
4338 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4339 // shadow reg if callee is a varargs function.
4340 Register ShadowReg;
4341 switch (VA.getLocReg()) {
4342 case X86::XMM0: ShadowReg = X86::RCX; break;
4343 case X86::XMM1: ShadowReg = X86::RDX; break;
4344 case X86::XMM2: ShadowReg = X86::R8; break;
4345 case X86::XMM3: ShadowReg = X86::R9; break;
4346 }
4347 if (ShadowReg)
4348 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4349 }
4350 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4351 assert(VA.isMemLoc());
4352 if (!StackPtr.getNode())
4353 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4354 getPointerTy(DAG.getDataLayout()));
4355 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4356 dl, DAG, VA, Flags, isByVal));
4357 }
4358 }
4359
4360 if (!MemOpChains.empty())
4361 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4362
4363 if (Subtarget.isPICStyleGOT()) {
4364 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4365 // GOT pointer (except regcall).
4366 if (!isTailCall) {
4367 // An indirect call with the RegCall calling convention may use up all the
4368 // general registers, so it is not suitable to bind EBX as the register for
4369 // the GOT address; just let the register allocator handle it.
4370 if (CallConv != CallingConv::X86_RegCall)
4371 RegsToPass.push_back(std::make_pair(
4372 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4373 getPointerTy(DAG.getDataLayout()))));
4374 } else {
4375 // If we are tail calling and generating PIC/GOT style code load the
4376 // address of the callee into ECX. The value in ecx is used as target of
4377 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4378 // for tail calls on PIC/GOT architectures. Normally we would just put the
4379 // address of GOT into ebx and then call target@PLT. But for tail calls
4380 // ebx would be restored (since ebx is callee saved) before jumping to the
4381 // target@PLT.
4382
4383 // Note: The actual moving to ECX is done further down.
4384 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4385 if (G && !G->getGlobal()->hasLocalLinkage() &&
4386 G->getGlobal()->hasDefaultVisibility())
4387 Callee = LowerGlobalAddress(Callee, DAG);
4388 else if (isa<ExternalSymbolSDNode>(Callee))
4389 Callee = LowerExternalSymbol(Callee, DAG);
4390 }
4391 }
4392
4393 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4394 // From AMD64 ABI document:
4395 // For calls that may call functions that use varargs or stdargs
4396 // (prototype-less calls or calls to functions containing ellipsis (...) in
4397 // the declaration) %al is used as hidden argument to specify the number
4398 // of SSE registers used. The contents of %al do not need to match exactly
4399 // the number of registers, but must be an upper bound on the number of SSE
4400 // registers used and is in the range 0 - 8 inclusive.
4401
4402 // Count the number of XMM registers allocated.
4403 static const MCPhysReg XMMArgRegs[] = {
4404 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4405 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4406 };
4407 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4408 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4409 && "SSE registers cannot be used when SSE is disabled");
4410 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4411 DAG.getConstant(NumXMMRegs, dl,
4412 MVT::i8)));
4413 }
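A C-level view of the %al rule described in the comment above (illustrative; the exact value materialized depends on the call site): calling a variadic function with two double arguments uses XMM0 and XMM1, so the lowering sets AL to an upper bound of at least 2 and at most 8 before the call.

#include <cstdio>

int main() {
  // Two floating-point varargs, so a SysV x86-64 compiler typically emits
  // something like "mov $2, %al" (or %eax) immediately before this call.
  std::printf("%f %f\n", 1.0, 2.0);
  return 0;
}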
4414
4415 if (isVarArg && IsMustTail) {
4416 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4417 for (const auto &F : Forwards) {
4418 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4419 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4420 }
4421 }
4422
4423 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4424 // don't need this because the eligibility check rejects calls that require
4425 // shuffling arguments passed in memory.
4426 if (!IsSibcall && isTailCall) {
4427 // Force all the incoming stack arguments to be loaded from the stack
4428 // before any new outgoing arguments are stored to the stack, because the
4429 // outgoing stack slots may alias the incoming argument stack slots, and
4430 // the alias isn't otherwise explicit. This is slightly more conservative
4431 // than necessary, because it means that each store effectively depends
4432 // on every argument instead of just those arguments it would clobber.
4433 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4434
4435 SmallVector<SDValue, 8> MemOpChains2;
4436 SDValue FIN;
4437 int FI = 0;
4438 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4439 ++I, ++OutsIndex) {
4440 CCValAssign &VA = ArgLocs[I];
4441
4442 if (VA.isRegLoc()) {
4443 if (VA.needsCustom()) {
4444 assert((CallConv == CallingConv::X86_RegCall) &&
4445 "Expecting custom case only in regcall calling convention");
4446 // This means that we are in special case where one argument was
4447 // passed through two register locations - Skip the next location
4448 ++I;
4449 }
4450
4451 continue;
4452 }
4453
4454 assert(VA.isMemLoc());
4455 SDValue Arg = OutVals[OutsIndex];
4456 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4457 // Skip inalloca/preallocated arguments. They don't require any work.
4458 if (Flags.isInAlloca() || Flags.isPreallocated())
4459 continue;
4460 // Create frame index.
4461 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4462 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4463 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4464 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4465
4466 if (Flags.isByVal()) {
4467 // Copy relative to framepointer.
4468 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4469 if (!StackPtr.getNode())
4470 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4471 getPointerTy(DAG.getDataLayout()));
4472 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4473 StackPtr, Source);
4474
4475 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4476 ArgChain,
4477 Flags, DAG, dl));
4478 } else {
4479 // Store relative to framepointer.
4480 MemOpChains2.push_back(DAG.getStore(
4481 ArgChain, dl, Arg, FIN,
4482 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4483 }
4484 }
4485
4486 if (!MemOpChains2.empty())
4487 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4488
4489 // Store the return address to the appropriate stack slot.
4490 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4491 getPointerTy(DAG.getDataLayout()),
4492 RegInfo->getSlotSize(), FPDiff, dl);
4493 }
4494
4495 // Build a sequence of copy-to-reg nodes chained together with token chain
4496 // and flag operands which copy the outgoing args into registers.
4497 SDValue InFlag;
4498 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4499 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4500 RegsToPass[i].second, InFlag);
4501 InFlag = Chain.getValue(1);
4502 }
4503
4504 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4505 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4506 // In the 64-bit large code model, we have to make all calls
4507 // through a register, since the call instruction's 32-bit
4508 // pc-relative offset may not be large enough to hold the whole
4509 // address.
4510 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4511 Callee->getOpcode() == ISD::ExternalSymbol) {
4512 // Lower direct calls to global addresses and external symbols. Setting
4513 // ForCall to true here has the effect of removing WrapperRIP when possible
4514 // to allow direct calls to be selected without first materializing the
4515 // address into a register.
4516 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4517 } else if (Subtarget.isTarget64BitILP32() &&
4518 Callee->getValueType(0) == MVT::i32) {
4519 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4520 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4521 }
4522
4523 // Returns a chain & a flag for retval copy to use.
4524 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4525 SmallVector<SDValue, 8> Ops;
4526
4527 if (!IsSibcall && isTailCall && !IsMustTail) {
4528 Chain = DAG.getCALLSEQ_END(Chain,
4529 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4530 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4531 InFlag = Chain.getValue(1);
4532 }
4533
4534 Ops.push_back(Chain);
4535 Ops.push_back(Callee);
4536
4537 if (isTailCall)
4538 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4539
4540 // Add argument registers to the end of the list so that they are known live
4541 // into the call.
4542 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4543 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4544 RegsToPass[i].second.getValueType()));
4545
4546 // Add a register mask operand representing the call-preserved registers.
4547 const uint32_t *Mask = [&]() {
4548 auto AdaptedCC = CallConv;
4549 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4550 // use X86_INTR calling convention because it has the same CSR mask
4551 // (same preserved registers).
4552 if (HasNCSR)
4553 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4554 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4555 // to use the CSR_NoRegs_RegMask.
4556 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4557 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4558 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4559 }();
4560 assert(Mask && "Missing call preserved mask for calling convention");
4561
4562 // If this is an invoke in a 32-bit function using a funclet-based
4563 // personality, assume the function clobbers all registers. If an exception
4564 // is thrown, the runtime will not restore CSRs.
4565 // FIXME: Model this more precisely so that we can register allocate across
4566 // the normal edge and spill and fill across the exceptional edge.
4567 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4568 const Function &CallerFn = MF.getFunction();
4569 EHPersonality Pers =
4570 CallerFn.hasPersonalityFn()
4571 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4572 : EHPersonality::Unknown;
4573 if (isFuncletEHPersonality(Pers))
4574 Mask = RegInfo->getNoPreservedMask();
4575 }
4576
4577 // Define a new register mask from the existing mask.
4578 uint32_t *RegMask = nullptr;
4579
4580 // In some calling conventions we need to remove the used physical registers
4581 // from the reg mask.
4582 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4583 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4584
4585 // Allocate a new Reg Mask and copy Mask.
4586 RegMask = MF.allocateRegMask();
4587 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4588 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4589
4590 // Make sure all sub registers of the argument registers are reset
4591 // in the RegMask.
4592 for (auto const &RegPair : RegsToPass)
4593 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4594 SubRegs.isValid(); ++SubRegs)
4595 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4596
4597 // Create the RegMask Operand according to our updated mask.
4598 Ops.push_back(DAG.getRegisterMask(RegMask));
4599 } else {
4600 // Create the RegMask Operand according to the static mask.
4601 Ops.push_back(DAG.getRegisterMask(Mask));
4602 }
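The sub-register loop above manipulates a packed bitmask with one bit per physical register, where a set bit means the register is preserved across the call. A standalone sketch of that bit manipulation (not LLVM API):

#include <cstdint>

// Clear the "preserved" bit for register number Reg in a mask made of 32-bit
// words, exactly as RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)) does above.
inline void markClobbered(uint32_t *RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32));
}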
4603
4604 if (InFlag.getNode())
4605 Ops.push_back(InFlag);
4606
4607 if (isTailCall) {
4608 // We used to do:
4609 //// If this is the first return lowered for this function, add the regs
4610 //// to the liveout set for the function.
4611 // This isn't right, although it's probably harmless on x86; liveouts
4612 // should be computed from returns not tail calls. Consider a void
4613 // function making a tail call to a function returning int.
4614 MF.getFrameInfo().setHasTailCall();
4615 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4616 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4617 return Ret;
4618 }
4619
4620 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4621 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4622 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4623 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4624 // expanded to the call, directly followed by a special marker sequence and
4625 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4626 assert(!isTailCall &&
4627 "tail calls cannot be marked with clang.arc.attachedcall");
4628 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4629
4630 // Add target constant to select ObjC runtime call just before the call
4631 // target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
4632 // RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue when
4633 // expanding the pseudo.
4634 unsigned RuntimeCallType =
4635 objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4636 Ops.insert(Ops.begin() + 1,
4637 DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4638 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4639 } else {
4640 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4641 }
4642
4643 InFlag = Chain.getValue(1);
4644 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4645 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4646
4647 // Save heapallocsite metadata.
4648 if (CLI.CB)
4649 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4650 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4651
4652 // Create the CALLSEQ_END node.
4653 unsigned NumBytesForCalleeToPop;
4654 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4655 DAG.getTarget().Options.GuaranteedTailCallOpt))
4656 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4657 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4658 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4659 SR == StackStructReturn)
4660 // If this is a call to a struct-return function, the callee
4661 // pops the hidden struct pointer, so we have to push it back.
4662 // This is common for Darwin/X86, Linux & Mingw32 targets.
4663 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4664 NumBytesForCalleeToPop = 4;
4665 else
4666 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4667
4668 // Returns a flag for retval copy to use.
4669 if (!IsSibcall) {
4670 Chain = DAG.getCALLSEQ_END(Chain,
4671 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4672 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4673 true),
4674 InFlag, dl);
4675 InFlag = Chain.getValue(1);
4676 }
4677
4678 // Handle result values, copying them out of physregs into vregs that we
4679 // return.
4680 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4681 InVals, RegMask);
4682}
4683
4684//===----------------------------------------------------------------------===//
4685// Fast Calling Convention (tail call) implementation
4686//===----------------------------------------------------------------------===//
4687
4688 // Like stdcall, this is a callee-cleans-arguments convention, except that ECX
4689 // is reserved for storing the tail-called function address. Only 2 registers are
4690// free for argument passing (inreg). Tail call optimization is performed
4691// provided:
4692// * tailcallopt is enabled
4693// * caller/callee are fastcc
4694// On X86_64 architecture with GOT-style position independent code only local
4695// (within module) calls are supported at the moment.
4696 // To keep the stack aligned according to the platform ABI, the function
4697 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4698 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
4699 // If a tail-called callee has more arguments than the caller, the
4700 // caller needs to make sure that there is room to move the RETADDR to. This is
4701// achieved by reserving an area the size of the argument delta right after the
4702// original RETADDR, but before the saved framepointer or the spilled registers
4703// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4704// stack layout:
4705// arg1
4706// arg2
4707// RETADDR
4708// [ new RETADDR
4709// move area ]
4710// (possible EBP)
4711// ESI
4712// EDI
4713// local1 ..
4714
4715 /// Round the stack size up so that, e.g., it becomes 16n + 12 when a 16-byte
4716 /// alignment is required.
4717unsigned
4718X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4719 SelectionDAG &DAG) const {
4720 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4721 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4722 assert(StackSize % SlotSize == 0 &&
4723 "StackSize must be a multiple of SlotSize");
4724 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4725}
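A worked example of the rounding above, assuming SlotSize = 8 and a 16-byte stack alignment (the helpers are sketches, not LLVM's alignTo): 40 bytes of arguments stay at 40, while 48 bytes become 56, so the argument area plus the return-address slot is always a multiple of the stack alignment.

#include <cstdint>

uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align; // round up to a multiple of Align
}

uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                             uint64_t StackAlignment) {
  return alignToSketch(StackSize + SlotSize, StackAlignment) - SlotSize;
}
// alignedArgStackSize(40, 8, 16) == 40; alignedArgStackSize(48, 8, 16) == 56.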
4726
4727 /// Return true if the given stack call argument is already available at the
4728 /// same relative position in the caller's incoming argument stack.
4729static
4730bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4731 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4732 const X86InstrInfo *TII, const CCValAssign &VA) {
4733 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4734
4735 for (;;) {
4736 // Look through nodes that don't alter the bits of the incoming value.
4737 unsigned Op = Arg.getOpcode();
4738 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4739 Arg = Arg.getOperand(0);
4740 continue;
4741 }
4742 if (Op == ISD::TRUNCATE) {
4743 const SDValue &TruncInput = Arg.getOperand(0);
4744 if (TruncInput.getOpcode() == ISD::AssertZext &&
4745 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4746 Arg.getValueType()) {
4747 Arg = TruncInput.getOperand(0);
4748 continue;
4749 }
4750 }
4751 break;
4752 }
4753
4754 int FI = INT_MAX;
4755 if (Arg.getOpcode() == ISD::CopyFromReg) {
4756 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4757 if (!VR.isVirtual())
4758 return false;
4759 MachineInstr *Def = MRI->getVRegDef(VR);
4760 if (!Def)
4761 return false;
4762 if (!Flags.isByVal()) {
4763 if (!TII->isLoadFromStackSlot(*Def, FI))
4764 return false;
4765 } else {
4766 unsigned Opcode = Def->getOpcode();
4767 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4768 Opcode == X86::LEA64_32r) &&
4769 Def->getOperand(1).isFI()) {
4770 FI = Def->getOperand(1).getIndex();
4771 Bytes = Flags.getByValSize();
4772 } else
4773 return false;
4774 }
4775 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4776 if (Flags.isByVal())
4777 // ByVal argument is passed in as a pointer but it's now being
4778 // dereferenced. e.g.
4779 // define @foo(%struct.X* %A) {
4780 // tail call @bar(%struct.X* byval %A)
4781 // }
4782 return false;
4783 SDValue Ptr = Ld->getBasePtr();
4784 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4785 if (!FINode)
4786 return false;
4787 FI = FINode->getIndex();
4788 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4789 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4790 FI = FINode->getIndex();
4791 Bytes = Flags.getByValSize();
4792 } else
4793 return false;
4794
4795 assert(FI != INT_MAX);
4796 if (!MFI.isFixedObjectIndex(FI))
4797 return false;
4798
4799 if (Offset != MFI.getObjectOffset(FI))
4800 return false;
4801
4802 // If this is not byval, check that the argument stack object is immutable.
4803 // inalloca and argument copy elision can create mutable argument stack
4804 // objects. Byval objects can be mutated, but a byval call intends to pass the
4805 // mutated memory.
4806 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4807 return false;
4808
4809 if (VA.getLocVT().getFixedSizeInBits() >
4810 Arg.getValueSizeInBits().getFixedSize()) {
4811 // If the argument location is wider than the argument type, check that any
4812 // extension flags match.
4813 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4814 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4815 return false;
4816 }
4817 }
4818
4819 return Bytes == MFI.getObjectSize(FI);
4820}
4821
4822/// Check whether the call is eligible for tail call optimization. Targets
4823/// that want to do tail call optimization should implement this function.
4824bool X86TargetLowering::IsEligibleForTailCallOptimization(
4825 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleeStackStructRet,
4826 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
4827 const SmallVectorImpl<SDValue> &OutVals,
4828 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4829 if (!mayTailCallThisCC(CalleeCC))
4830 return false;
4831
4832 // If -tailcallopt is specified, make fastcc functions tail-callable.
4833 MachineFunction &MF = DAG.getMachineFunction();
4834 const Function &CallerF = MF.getFunction();
4835
4836 // If the function return type is x86_fp80 and the callee return type is not,
4837 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4838 // perform a tailcall optimization here.
4839 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4840 return false;
4841
4842 CallingConv::ID CallerCC = CallerF.getCallingConv();
4843 bool CCMatch = CallerCC == CalleeCC;
4844 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4845 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4846 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4847 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4848
4849 // Win64 functions have extra shadow space for argument homing. Don't do the
4850 // sibcall if the caller and callee have mismatched expectations for this
4851 // space.
4852 if (IsCalleeWin64 != IsCallerWin64)
4853 return false;
4854
4855 if (IsGuaranteeTCO) {
4856 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4857 return true;
4858 return false;
4859 }
4860
4861 // Look for obvious safe cases to perform tail call optimization that do not
4862 // require ABI changes. This is what gcc calls sibcall.
4863
4864 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4865 // emit a special epilogue.
4866 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4867 if (RegInfo->hasStackRealignment(MF))
4868 return false;
4869
4870 // Also avoid sibcall optimization if we're an sret return fn and the callee
4871 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
4872 // insufficient.
4873 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
4874 // For a compatible tail call the callee must return our sret pointer. So it
4875 // needs to be (a) an sret function itself and (b) we pass our sret as its
4876 // sret. Condition #b is harder to determine.
4877 return false;
4878 } else if (Subtarget.is32Bit() && IsCalleeStackStructRet)
4879 // In the i686 ABI, the sret pointer is callee-pop, so we cannot tail-call,
4880 // as our caller doesn't expect that.
4881 return false;
4882
4883 // Do not sibcall optimize vararg calls unless all arguments are passed via
4884 // registers.
4885 LLVMContext &C = *DAG.getContext();
4886 if (isVarArg && !Outs.empty()) {
4887 // Optimizing for varargs on Win64 is unlikely to be safe without
4888 // additional testing.
4889 if (IsCalleeWin64 || IsCallerWin64)
4890 return false;
4891
4892 SmallVector<CCValAssign, 16> ArgLocs;
4893 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4894
4895 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4896 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4897 if (!ArgLocs[i].isRegLoc())
4898 return false;
4899 }
4900
4901 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4902 // stack. Therefore, if it's not used by the call it is not safe to optimize
4903 // this into a sibcall.
4904 bool Unused = false;
4905 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4906 if (!Ins[i].Used) {
4907 Unused = true;
4908 break;
4909 }
4910 }
4911 if (Unused) {
4912 SmallVector<CCValAssign, 16> RVLocs;
4913 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4914 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4915 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4916 CCValAssign &VA = RVLocs[i];
4917 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4918 return false;
4919 }
4920 }
4921
4922 // Check that the call results are passed in the same way.
4923 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4924 RetCC_X86, RetCC_X86))
4925 return false;
4926 // The callee has to preserve all registers the caller needs to preserve.
4927 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4928 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4929 if (!CCMatch) {
4930 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4931 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4932 return false;
4933 }
4934
4935 unsigned StackArgsSize = 0;
4936
4937 // If the callee takes no arguments then go on to check the results of the
4938 // call.
4939 if (!Outs.empty()) {
4940 // Check if stack adjustment is needed. For now, do not do this if any
4941 // argument is passed on the stack.
4942 SmallVector<CCValAssign, 16> ArgLocs;
4943 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4944
4945 // Allocate shadow area for Win64
4946 if (IsCalleeWin64)
4947 CCInfo.AllocateStack(32, Align(8));
4948
4949 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4950 StackArgsSize = CCInfo.getNextStackOffset();
4951
4952 if (CCInfo.getNextStackOffset()) {
4953 // Check if the arguments are already laid out in the right way as
4954 // the caller's fixed stack objects.
4955 MachineFrameInfo &MFI = MF.getFrameInfo();
4956 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4957 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4958 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4959 CCValAssign &VA = ArgLocs[i];
4960 SDValue Arg = OutVals[i];
4961 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4962 if (VA.getLocInfo() == CCValAssign::Indirect)
4963 return false;
4964 if (!VA.isRegLoc()) {
4965 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4966 MFI, MRI, TII, VA))
4967 return false;
4968 }
4969 }
4970 }
4971
4972 bool PositionIndependent = isPositionIndependent();
4973 // If the tailcall address may be in a register, then make sure it's
4974 // possible to register allocate for it. In 32-bit, the call address can
4975 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4976 // callee-saved registers are restored. These happen to be the same
4977 // registers used to pass 'inreg' arguments so watch out for those.
4978 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4979 !isa<ExternalSymbolSDNode>(Callee)) ||
4980 PositionIndependent)) {
4981 unsigned NumInRegs = 0;
4982 // In PIC we need an extra register to formulate the address computation
4983 // for the callee.
4984 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4985
4986 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4987 CCValAssign &VA = ArgLocs[i];
4988 if (!VA.isRegLoc())
4989 continue;
4990 Register Reg = VA.getLocReg();
4991 switch (Reg) {
4992 default: break;
4993 case X86::EAX: case X86::EDX: case X86::ECX:
4994 if (++NumInRegs == MaxInRegs)
4995 return false;
4996 break;
4997 }
4998 }
4999 }
5000
5001 const MachineRegisterInfo &MRI = MF.getRegInfo();
5002 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5003 return false;
5004 }
5005
5006 bool CalleeWillPop =
5007 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5008 MF.getTarget().Options.GuaranteedTailCallOpt);
5009
5010 if (unsigned BytesToPop =
5011 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5012 // If we have bytes to pop, the callee must pop them.
5013 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5014 if (!CalleePopMatches)
5015 return false;
5016 } else if (CalleeWillPop && StackArgsSize > 0) {
5017 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5018 return false;
5019 }
5020
5021 return true;
5022}
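Source-level intuition for two of the checks above, compiled for 32-bit x86 (a hedged sketch; whether a compiler actually emits the sibcall also depends on optimization level): a cdecl-to-cdecl forwarding call can become a plain jmp, while forwarding to a stdcall callee cannot, because the callee would pop argument bytes that the caller's own caller does not expect to be popped.

// Both callees are assumed to be defined elsewhere.
int target(int a, int b);
int __attribute__((stdcall)) target_stdcall(int a, int b);

int forward_ok(int a, int b) {
  return target(a, b);          // caller-pop on both sides: sibcall possible
}

int forward_blocked(int a, int b) {
  return target_stdcall(a, b);  // callee pops 8 bytes: rejected by the checks above
}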
5023
5024FastISel *
5025X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5026 const TargetLibraryInfo *libInfo) const {
5027 return X86::createFastISel(funcInfo, libInfo);
5028}
5029
5030//===----------------------------------------------------------------------===//
5031// Other Lowering Hooks
5032//===----------------------------------------------------------------------===//
5033
5034static bool MayFoldLoad(SDValue Op, bool AssumeSingleUse = false) {
5035 return (AssumeSingleUse || Op.hasOneUse()) && ISD::isNormalLoad(Op.getNode());
5036}
5037
5038static bool MayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5039 bool AssumeSingleUse = false) {
5040 if (!MayFoldLoad(Op, AssumeSingleUse))
5041 return false;
5042
5043 // We cannot replace a wide volatile load with a broadcast-from-memory,
5044 // because that would narrow the load, which isn't legal for volatiles.
5045 const LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op.getNode());
5046 return !Ld->isVolatile() ||
5047 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5048}
5049
5050static bool MayFoldIntoStore(SDValue Op) {
5051 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5052}
5053
5054static bool MayFoldIntoZeroExtend(SDValue Op) {
5055 if (Op.hasOneUse()) {
5056 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5057 return (ISD::ZERO_EXTEND == Opcode);
5058 }
5059 return false;
5060}
5061
5062static bool isTargetShuffle(unsigned Opcode) {
5063 switch(Opcode) {
5064 default: return false;
5065 case X86ISD::BLENDI:
5066 case X86ISD::PSHUFB:
5067 case X86ISD::PSHUFD:
5068 case X86ISD::PSHUFHW:
5069 case X86ISD::PSHUFLW:
5070 case X86ISD::SHUFP:
5071 case X86ISD::INSERTPS:
5072 case X86ISD::EXTRQI:
5073 case X86ISD::INSERTQI:
5074 case X86ISD::VALIGN:
5075 case X86ISD::PALIGNR:
5076 case X86ISD::VSHLDQ:
5077 case X86ISD::VSRLDQ:
5078 case X86ISD::MOVLHPS:
5079 case X86ISD::MOVHLPS:
5080 case X86ISD::MOVSHDUP:
5081 case X86ISD::MOVSLDUP:
5082 case X86ISD::MOVDDUP:
5083 case X86ISD::MOVSS:
5084 case X86ISD::MOVSD:
5085 case X86ISD::MOVSH:
5086 case X86ISD::UNPCKL:
5087 case X86ISD::UNPCKH:
5088 case X86ISD::VBROADCAST:
5089 case X86ISD::VPERMILPI:
5090 case X86ISD::VPERMILPV:
5091 case X86ISD::VPERM2X128:
5092 case X86ISD::SHUF128:
5093 case X86ISD::VPERMIL2:
5094 case X86ISD::VPERMI:
5095 case X86ISD::VPPERM:
5096 case X86ISD::VPERMV:
5097 case X86ISD::VPERMV3:
5098 case X86ISD::VZEXT_MOVL:
5099 return true;
5100 }
5101}
5102
5103static bool isTargetShuffleVariableMask(unsigned Opcode) {
5104 switch (Opcode) {
5105 default: return false;
5106 // Target Shuffles.
5107 case X86ISD::PSHUFB:
5108 case X86ISD::VPERMILPV:
5109 case X86ISD::VPERMIL2:
5110 case X86ISD::VPPERM:
5111 case X86ISD::VPERMV:
5112 case X86ISD::VPERMV3:
5113 return true;
5114 // 'Faux' Target Shuffles.
5115 case ISD::OR:
5116 case ISD::AND:
5117 case X86ISD::ANDNP:
5118 return true;
5119 }
5120}
5121
5122static bool isTargetShuffleSplat(SDValue Op) {
5123 unsigned Opcode = Op.getOpcode();
5124 if (Opcode == ISD::EXTRACT_SUBVECTOR)
5125 return isTargetShuffleSplat(Op.getOperand(0));
5126 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
5127}
5128
5129SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5130 MachineFunction &MF = DAG.getMachineFunction();
5131 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5132 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5133 int ReturnAddrIndex = FuncInfo->getRAIndex();
5134
5135 if (ReturnAddrIndex == 0) {
5136 // Set up a frame object for the return address.
5137 unsigned SlotSize = RegInfo->getSlotSize();
5138 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5139 -(int64_t)SlotSize,
5140 false);
5141 FuncInfo->setRAIndex(ReturnAddrIndex);
5142 }
5143
5144 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5145}
5146
5147bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5148 bool hasSymbolicDisplacement) {
5149 // Offset should fit into 32 bit immediate field.
5150 if (!isInt<32>(Offset))
5151 return false;
5152
5153 // If we don't have a symbolic displacement - we don't have any extra
5154 // restrictions.
5155 if (!hasSymbolicDisplacement)
5156 return true;
5157
5158 // FIXME: Some tweaks might be needed for medium code model.
5159 if (M != CodeModel::Small && M != CodeModel::Kernel)
5160 return false;
5161
5162 // For the small code model we assume that the last object ends at least 16MB
5163 // before the 31-bit boundary. We may also accept pretty large negative constants
5164 // knowing that all objects are in the positive half of the address space.
5165 if (M == CodeModel::Small && Offset < 16*1024*1024)
5166 return true;
5167
5168 // For the kernel code model we know that all objects reside in the negative
5169 // half of the 32-bit address space. We do not accept negative offsets, since
5170 // they may be just out of range, but we may accept pretty large positive ones.
5171 if (M == CodeModel::Kernel && Offset >= 0)
5172 return true;
5173
5174 return false;
5175}
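Quick numeric checks for the two code-model rules above (a sketch with hypothetical helper names that restate only the final comparisons, assuming a symbolic displacement is present):

#include <cassert>
#include <cstdint>

bool smallModelOffsetOK(int64_t Offset) { return Offset < 16 * 1024 * 1024; }
bool kernelModelOffsetOK(int64_t Offset) { return Offset >= 0; }

int main() {
  assert(smallModelOffsetOK(8 * 1024 * 1024));    // well below the 16MB cushion
  assert(!smallModelOffsetOK(64 * 1024 * 1024));  // too close to the 2^31 boundary
  assert(!kernelModelOffsetOK(-4096));            // kernel model rejects negative offsets
  assert(kernelModelOffsetOK(1 << 20));           // large positive offsets are fine
  return 0;
}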
5176
5177/// Determines whether the callee is required to pop its own arguments.
5178/// Callee pop is necessary to support tail calls.
5179bool X86::isCalleePop(CallingConv::ID CallingConv,
5180 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5181 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5182 // can guarantee TCO.
5183 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5184 return true;
5185
5186 switch (CallingConv) {
5187 default:
5188 return false;
5189 case CallingConv::X86_StdCall:
5190 case CallingConv::X86_FastCall:
5191 case CallingConv::X86_ThisCall:
5192 case CallingConv::X86_VectorCall:
5193 return !is64Bit;
5194 }
5195}
5196
5197 /// Return true if the condition is a signed comparison operation.
5198static bool isX86CCSigned(unsigned X86CC) {
5199 switch (X86CC) {
5200 default:
5201 llvm_unreachable("Invalid integer condition!");
5202 case X86::COND_E:
5203 case X86::COND_NE:
5204 case X86::COND_B:
5205 case X86::COND_A:
5206 case X86::COND_BE:
5207 case X86::COND_AE:
5208 return false;
5209 case X86::COND_G:
5210 case X86::COND_GE:
5211 case X86::COND_L:
5212 case X86::COND_LE:
5213 return true;
5214 }
5215}
5216
5217static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5218 switch (SetCCOpcode) {
5219 default: llvm_unreachable("Invalid integer condition!");
5220 case ISD::SETEQ: return X86::COND_E;
5221 case ISD::SETGT: return X86::COND_G;
5222 case ISD::SETGE: return X86::COND_GE;
5223 case ISD::SETLT: return X86::COND_L;
5224 case ISD::SETLE: return X86::COND_LE;
5225 case ISD::SETNE: return X86::COND_NE;
5226 case ISD::SETULT: return X86::COND_B;
5227 case ISD::SETUGT: return X86::COND_A;
5228 case ISD::SETULE: return X86::COND_BE;
5229 case ISD::SETUGE: return X86::COND_AE;
5230 }
5231}
5232
5233/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5234/// condition code, returning the condition code and the LHS/RHS of the
5235/// comparison to make.
5236static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5237 bool isFP, SDValue &LHS, SDValue &RHS,
5238 SelectionDAG &DAG) {
5239 if (!isFP) {
5240 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5241 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5242 // X > -1 -> X == 0, jump !sign.
5243 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5244 return X86::COND_NS;
5245 }
5246 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5247 // X < 0 -> X == 0, jump on sign.
5248 return X86::COND_S;
5249 }
5250 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5251 // X >= 0 -> X == 0, jump on !sign.
5252 return X86::COND_NS;
5253 }
5254 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5255 // X < 1 -> X <= 0
5256 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5257 return X86::COND_LE;
5258 }
5259 }
5260
5261 return TranslateIntegerX86CC(SetCCOpcode);
5262 }
5263
5264 // First determine if it is required or is profitable to flip the operands.
5265
5266 // If LHS is a foldable load, but RHS is not, flip the condition.
5267 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5268 !ISD::isNON_EXTLoad(RHS.getNode())) {
5269 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5270 std::swap(LHS, RHS);
5271 }
5272
5273 switch (SetCCOpcode) {
5274 default: break;
5275 case ISD::SETOLT:
5276 case ISD::SETOLE:
5277 case ISD::SETUGT:
5278 case ISD::SETUGE:
5279 std::swap(LHS, RHS);
5280 break;
5281 }
5282
5283 // On a floating point condition, the flags are set as follows:
5284 // ZF PF CF op
5285 // 0 | 0 | 0 | X > Y
5286 // 0 | 0 | 1 | X < Y
5287 // 1 | 0 | 0 | X == Y
5288 // 1 | 1 | 1 | unordered
5289 switch (SetCCOpcode) {
5290  default: llvm_unreachable("Condcode should be pre-legalized away");
5291 case ISD::SETUEQ:
5292 case ISD::SETEQ: return X86::COND_E;
5293 case ISD::SETOLT: // flipped
5294 case ISD::SETOGT:
5295 case ISD::SETGT: return X86::COND_A;
5296 case ISD::SETOLE: // flipped
5297 case ISD::SETOGE:
5298 case ISD::SETGE: return X86::COND_AE;
5299 case ISD::SETUGT: // flipped
5300 case ISD::SETULT:
5301 case ISD::SETLT: return X86::COND_B;
5302 case ISD::SETUGE: // flipped
5303 case ISD::SETULE:
5304 case ISD::SETLE: return X86::COND_BE;
5305 case ISD::SETONE:
5306 case ISD::SETNE: return X86::COND_NE;
5307 case ISD::SETUO: return X86::COND_P;
5308 case ISD::SETO: return X86::COND_NP;
5309 case ISD::SETOEQ:
5310 case ISD::SETUNE: return X86::COND_INVALID;
5311 }
5312}
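
// Illustrative sketch (editorial, not part of the original source): a standalone
// model of the ZF/PF/CF table above, to make the mapping concrete. The flag
// model mirrors ucomiss/comiss behaviour; FPFlags and fpCompareFlags are names
// invented for this example.
#include <cassert>
#include <cmath>

struct FPFlags { bool ZF, PF, CF; };

static FPFlags fpCompareFlags(double X, double Y) {
  if (std::isnan(X) || std::isnan(Y)) return {true, true, true};  // unordered
  if (X > Y) return {false, false, false};
  if (X < Y) return {false, false, true};
  return {true, false, false};                                    // X == Y
}

// COND_A ("above") means CF == 0 && ZF == 0: only an ordered X > Y.
// COND_B ("below") means CF == 1:            X < Y *or* unordered.
// That is why the ordered SETOLT swaps LHS/RHS first and then uses COND_A,
// while the unordered SETULT can use COND_B directly.
int main() {
  assert(!fpCompareFlags(2.0, 1.0).CF && !fpCompareFlags(2.0, 1.0).ZF); // above
  assert(fpCompareFlags(1.0, 2.0).CF);                                  // below
  assert(fpCompareFlags(NAN, 1.0).CF && fpCompareFlags(NAN, 1.0).PF);   // unordered
  return 0;
}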
5313
5314/// Is there a floating point cmov for the specific X86 condition code?
5315/// The current x86 ISA includes the following FP cmov instructions:
5316/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5317static bool hasFPCMov(unsigned X86CC) {
5318 switch (X86CC) {
5319 default:
5320 return false;
5321 case X86::COND_B:
5322 case X86::COND_BE:
5323 case X86::COND_E:
5324 case X86::COND_P:
5325 case X86::COND_A:
5326 case X86::COND_AE:
5327 case X86::COND_NE:
5328 case X86::COND_NP:
5329 return true;
5330 }
5331}
5332
5333
5334bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5335 const CallInst &I,
5336 MachineFunction &MF,
5337 unsigned Intrinsic) const {
5338 Info.flags = MachineMemOperand::MONone;
5339 Info.offset = 0;
5340
5341 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5342 if (!IntrData) {
5343 switch (Intrinsic) {
5344 case Intrinsic::x86_aesenc128kl:
5345 case Intrinsic::x86_aesdec128kl:
5346 Info.opc = ISD::INTRINSIC_W_CHAIN;
5347 Info.ptrVal = I.getArgOperand(1);
5348 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5349 Info.align = Align(1);
5350 Info.flags |= MachineMemOperand::MOLoad;
5351 return true;
5352 case Intrinsic::x86_aesenc256kl:
5353 case Intrinsic::x86_aesdec256kl:
5354 Info.opc = ISD::INTRINSIC_W_CHAIN;
5355 Info.ptrVal = I.getArgOperand(1);
5356 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5357 Info.align = Align(1);
5358 Info.flags |= MachineMemOperand::MOLoad;
5359 return true;
5360 case Intrinsic::x86_aesencwide128kl:
5361 case Intrinsic::x86_aesdecwide128kl:
5362 Info.opc = ISD::INTRINSIC_W_CHAIN;
5363 Info.ptrVal = I.getArgOperand(0);
5364 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5365 Info.align = Align(1);
5366 Info.flags |= MachineMemOperand::MOLoad;
5367 return true;
5368 case Intrinsic::x86_aesencwide256kl:
5369 case Intrinsic::x86_aesdecwide256kl:
5370 Info.opc = ISD::INTRINSIC_W_CHAIN;
5371 Info.ptrVal = I.getArgOperand(0);
5372 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5373 Info.align = Align(1);
5374 Info.flags |= MachineMemOperand::MOLoad;
5375 return true;
5376 }
5377 return false;
5378 }
5379
5380 switch (IntrData->Type) {
5381 case TRUNCATE_TO_MEM_VI8:
5382 case TRUNCATE_TO_MEM_VI16:
5383 case TRUNCATE_TO_MEM_VI32: {
5384 Info.opc = ISD::INTRINSIC_VOID;
5385 Info.ptrVal = I.getArgOperand(0);
5386 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5387 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5388 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5389 ScalarVT = MVT::i8;
5390 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5391 ScalarVT = MVT::i16;
5392 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5393 ScalarVT = MVT::i32;
5394
5395 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5396 Info.align = Align(1);
5397 Info.flags |= MachineMemOperand::MOStore;
5398 break;
5399 }
5400 case GATHER:
5401 case GATHER_AVX2: {
5402 Info.opc = ISD::INTRINSIC_W_CHAIN;
5403 Info.ptrVal = nullptr;
5404 MVT DataVT = MVT::getVT(I.getType());
5405 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5406 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5407 IndexVT.getVectorNumElements());
5408 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5409 Info.align = Align(1);
5410 Info.flags |= MachineMemOperand::MOLoad;
5411 break;
5412 }
5413 case SCATTER: {
5414 Info.opc = ISD::INTRINSIC_VOID;
5415 Info.ptrVal = nullptr;
5416 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5417 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5418 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5419 IndexVT.getVectorNumElements());
5420 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5421 Info.align = Align(1);
5422 Info.flags |= MachineMemOperand::MOStore;
5423 break;
5424 }
5425 default:
5426 return false;
5427 }
5428
5429 return true;
5430}
5431
5432/// Returns true if the target can instruction select the
5433/// specified FP immediate natively. If false, the legalizer will
5434/// materialize the FP immediate as a load from a constant pool.
5435bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5436 bool ForCodeSize) const {
5437 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5438 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5439 return true;
5440 }
5441 return false;
5442}
5443
5444bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5445 ISD::LoadExtType ExtTy,
5446 EVT NewVT) const {
5447  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5448
5449  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5450  // relocations must target a movq or addq instruction: don't let the load shrink.
5451 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5452 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5453 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5454 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5455
5456 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5457 // those uses are extracted directly into a store, then the extract + store
5458 // can be store-folded. Therefore, it's probably not worth splitting the load.
5459 EVT VT = Load->getValueType(0);
5460 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5461 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5462 // Skip uses of the chain value. Result 0 of the node is the load value.
5463 if (UI.getUse().getResNo() != 0)
5464 continue;
5465
5466 // If this use is not an extract + store, it's probably worth splitting.
5467 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5468 UI->use_begin()->getOpcode() != ISD::STORE)
5469 return true;
5470 }
5471 // All non-chain uses are extract + store.
5472 return false;
5473 }
5474
5475 return true;
5476}
5477
5478/// Returns true if it is beneficial to convert a load of a constant
5479/// to just the constant itself.
5480bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5481 Type *Ty) const {
5482  assert(Ty->isIntegerTy());
5483
5484 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5485 if (BitSize == 0 || BitSize > 64)
5486 return false;
5487 return true;
5488}
5489
5490bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5491 // If we are using XMM registers in the ABI and the condition of the select is
5492 // a floating-point compare and we have blendv or conditional move, then it is
5493 // cheaper to select instead of doing a cross-register move and creating a
5494 // load that depends on the compare result.
5495 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5496 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5497}
5498
5499bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5500 // TODO: It might be a win to ease or lift this restriction, but the generic
5501 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5502 if (VT.isVector() && Subtarget.hasAVX512())
5503 return false;
5504
5505 return true;
5506}
5507
5508bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5509 SDValue C) const {
5510 // TODO: We handle scalars using custom code, but generic combining could make
5511 // that unnecessary.
5512 APInt MulC;
5513 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5514 return false;
5515
5516  // Find the type this will be legalized to. Otherwise we might prematurely
5517 // convert this to shl+add/sub and then still have to type legalize those ops.
5518 // Another choice would be to defer the decision for illegal types until
5519 // after type legalization. But constant splat vectors of i64 can't make it
5520 // through type legalization on 32-bit targets so we would need to special
5521 // case vXi64.
5522 while (getTypeAction(Context, VT) != TypeLegal)
5523 VT = getTypeToTransformTo(Context, VT);
5524
5525 // If vector multiply is legal, assume that's faster than shl + add/sub.
5526 // TODO: Multiply is a complex op with higher latency and lower throughput in
5527 // most implementations, so this check could be loosened based on type
5528 // and/or a CPU attribute.
5529 if (isOperationLegal(ISD::MUL, VT))
5530 return false;
5531
5532 // shl+add, shl+sub, shl+add+neg
5533 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5534 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5535}
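
// Illustrative sketch (editorial, not part of the original source): the
// "shl+add / shl+sub" test above, restated for a plain 64-bit constant. A
// multiply by C is decomposable when C sits next to a power of two.
#include <cstdint>

constexpr bool isPow2(uint64_t V) { return V != 0 && (V & (V - 1)) == 0; }

constexpr bool decomposableMul(int64_t C) {
  return isPow2(static_cast<uint64_t>(C + 1)) ||   // C = 2^k - 1: x*C = (x << k) - x
         isPow2(static_cast<uint64_t>(C - 1)) ||   // C = 2^k + 1: x*C = (x << k) + x
         isPow2(static_cast<uint64_t>(1 - C)) ||   // C = 1 - 2^k: x*C = x - (x << k)
         isPow2(static_cast<uint64_t>(-(C + 1)));  // C = -(2^k+1): x*C = -((x << k) + x)
}

static_assert(decomposableMul(7),   "7  = 8 - 1  -> shl 3, sub");
static_assert(decomposableMul(9),   "9  = 8 + 1  -> shl 3, add");
static_assert(decomposableMul(-3),  "-3 = 1 - 4  -> sub from shl 2");
static_assert(!decomposableMul(10), "10 needs a real multiply (or LEA tricks)");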
5536
5537bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5538 unsigned Index) const {
5539 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5540 return false;
5541
5542 // Mask vectors support all subregister combinations and operations that
5543 // extract half of vector.
5544 if (ResVT.getVectorElementType() == MVT::i1)
5545 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5546 (Index == ResVT.getVectorNumElements()));
5547
5548 return (Index % ResVT.getVectorNumElements()) == 0;
5549}
5550
5551bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5552 unsigned Opc = VecOp.getOpcode();
5553
5554 // Assume target opcodes can't be scalarized.
5555 // TODO - do we have any exceptions?
5556 if (Opc >= ISD::BUILTIN_OP_END)
5557 return false;
5558
5559 // If the vector op is not supported, try to convert to scalar.
5560 EVT VecVT = VecOp.getValueType();
5561 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5562 return true;
5563
5564 // If the vector op is supported, but the scalar op is not, the transform may
5565 // not be worthwhile.
5566 EVT ScalarVT = VecVT.getScalarType();
5567 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5568}
5569
5570bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5571 bool) const {
5572 // TODO: Allow vectors?
5573 if (VT.isVector())
5574 return false;
5575 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5576}
5577
5578bool X86TargetLowering::isCheapToSpeculateCttz() const {
5579 // Speculate cttz only if we can directly use TZCNT.
5580 return Subtarget.hasBMI();
5581}
5582
5583bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5584 // Speculate ctlz only if we can directly use LZCNT.
5585 return Subtarget.hasLZCNT();
5586}
5587
5588bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5589 const SelectionDAG &DAG,
5590 const MachineMemOperand &MMO) const {
5591 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5592 BitcastVT.getVectorElementType() == MVT::i1)
5593 return false;
5594
5595 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5596 return false;
5597
5598 // If both types are legal vectors, it's always ok to convert them.
5599 if (LoadVT.isVector() && BitcastVT.isVector() &&
5600 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5601 return true;
5602
5603 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5604}
5605
5606bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5607 const MachineFunction &MF) const {
5608 // Do not merge to float value size (128 bytes) if no implicit
5609 // float attribute is set.
5610 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
5611
5612 if (NoFloat) {
5613 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5614 return (MemVT.getSizeInBits() <= MaxIntSize);
5615 }
5616 // Make sure we don't merge greater than our preferred vector
5617 // width.
5618 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5619 return false;
5620
5621 return true;
5622}
5623
5624bool X86TargetLowering::isCtlzFast() const {
5625 return Subtarget.hasFastLZCNT();
5626}
5627
5628bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5629 const Instruction &AndI) const {
5630 return true;
5631}
5632
5633bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5634 EVT VT = Y.getValueType();
5635
5636 if (VT.isVector())
5637 return false;
5638
5639 if (!Subtarget.hasBMI())
5640 return false;
5641
5642 // There are only 32-bit and 64-bit forms for 'andn'.
5643 if (VT != MVT::i32 && VT != MVT::i64)
5644 return false;
5645
5646 return !isa<ConstantSDNode>(Y);
5647}
5648
5649bool X86TargetLowering::hasAndNot(SDValue Y) const {
5650 EVT VT = Y.getValueType();
5651
5652 if (!VT.isVector())
5653 return hasAndNotCompare(Y);
5654
5655 // Vector.
5656
5657 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5658 return false;
5659
5660 if (VT == MVT::v4i32)
5661 return true;
5662
5663 return Subtarget.hasSSE2();
5664}
5665
5666bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5667 return X.getValueType().isScalarInteger(); // 'bt'
5668}
5669
5670bool X86TargetLowering::
5671 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5672 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5673 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5674 SelectionDAG &DAG) const {
5675  // Does the baseline recommend not performing the fold by default?
5676 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5677 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5678 return false;
5679 // For scalars this transform is always beneficial.
5680 if (X.getValueType().isScalarInteger())
5681 return true;
5682 // If all the shift amounts are identical, then transform is beneficial even
5683 // with rudimentary SSE2 shifts.
5684 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5685 return true;
5686  // If we have AVX2, with its powerful shift operations, then it's also good.
5687 if (Subtarget.hasAVX2())
5688 return true;
5689 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5690 return NewShiftOpcode == ISD::SHL;
5691}
5692
5693bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5694 const SDNode *N, CombineLevel Level) const {
5695  assert(((N->getOpcode() == ISD::SHL &&
5696           N->getOperand(0).getOpcode() == ISD::SRL) ||
5697          (N->getOpcode() == ISD::SRL &&
5698           N->getOperand(0).getOpcode() == ISD::SHL)) &&
5699         "Expected shift-shift mask");
5700 EVT VT = N->getValueType(0);
5701 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5702 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5703 // Only fold if the shift values are equal - so it folds to AND.
5704 // TODO - we should fold if either is a non-uniform vector but we don't do
5705 // the fold for non-splats yet.
5706 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5707 }
5708 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5709}
5710
5711bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5712 EVT VT = Y.getValueType();
5713
5714 // For vectors, we don't have a preference, but we probably want a mask.
5715 if (VT.isVector())
5716 return false;
5717
5718 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5719 if (VT == MVT::i64 && !Subtarget.is64Bit())
5720 return false;
5721
5722 return true;
5723}
5724
5725bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5726 SDNode *N) const {
5727 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5728 !Subtarget.isOSWindows())
5729 return false;
5730 return true;
5731}
5732
5733bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5734 // Any legal vector type can be splatted more efficiently than
5735 // loading/spilling from memory.
5736 return isTypeLegal(VT);
5737}
5738
5739MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5740 MVT VT = MVT::getIntegerVT(NumBits);
5741 if (isTypeLegal(VT))
5742 return VT;
5743
5744 // PMOVMSKB can handle this.
5745 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5746 return MVT::v16i8;
5747
5748 // VPMOVMSKB can handle this.
5749 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5750 return MVT::v32i8;
5751
5752 // TODO: Allow 64-bit type for 32-bit target.
5753 // TODO: 512-bit types should be allowed, but make sure that those
5754 // cases are handled in combineVectorSizedSetCCEquality().
5755
5756 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5757}
5758
5759/// Val is the undef sentinel value or equal to the specified value.
5760static bool isUndefOrEqual(int Val, int CmpVal) {
5761 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5762}
5763
5764/// Return true if every element in Mask is the undef sentinel value or is
5765/// equal to the specified value.
5766static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5767 return llvm::all_of(Mask, [CmpVal](int M) {
5768 return (M == SM_SentinelUndef) || (M == CmpVal);
5769 });
5770}
5771
5772/// Val is either the undef or zero sentinel value.
5773static bool isUndefOrZero(int Val) {
5774 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5775}
5776
5777/// Return true if every element in Mask, beginning from position Pos and ending
5778/// in Pos+Size is the undef sentinel value.
5779static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5780 return llvm::all_of(Mask.slice(Pos, Size),
5781 [](int M) { return M == SM_SentinelUndef; });
5782}
5783
5784/// Return true if the mask creates a vector whose lower half is undefined.
5785static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5786 unsigned NumElts = Mask.size();
5787 return isUndefInRange(Mask, 0, NumElts / 2);
5788}
5789
5790/// Return true if the mask creates a vector whose upper half is undefined.
5791static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5792 unsigned NumElts = Mask.size();
5793 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5794}
5795
5796/// Return true if Val falls within the specified range [Low, Hi).
5797static bool isInRange(int Val, int Low, int Hi) {
5798 return (Val >= Low && Val < Hi);
5799}
5800
5801/// Return true if the value of any element in Mask falls within the specified
5802/// range [Low, Hi).
5803static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5804 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5805}
5806
5807/// Return true if the value of any element in Mask is the zero sentinel value.
5808static bool isAnyZero(ArrayRef<int> Mask) {
5809 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5810}
5811
5812/// Return true if the value of any element in Mask is the zero or undef
5813/// sentinel values.
5814static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5815 return llvm::any_of(Mask, [](int M) {
5816 return M == SM_SentinelZero || M == SM_SentinelUndef;
5817 });
5818}
5819
5820/// Return true if Val is undef or if its value falls within the
5821/// specified range [Low, Hi).
5822static bool isUndefOrInRange(int Val, int Low, int Hi) {
5823 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5824}
5825
5826/// Return true if every element in Mask is undef or if its value
5827/// falls within the specified range [Low, Hi).
5828static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5829 return llvm::all_of(
5830 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5831}
5832
5833/// Return true if Val is undef, zero or if its value falls within the
5834/// specified range [Low, Hi).
5835static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5836 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5837}
5838
5839/// Return true if every element in Mask is undef, zero or if its value
5840/// falls within the specified range [Low, Hi).
5841static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5842 return llvm::all_of(
5843 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5844}
5845
5846/// Return true if every element in Mask, beginning
5847/// from position Pos and ending in Pos + Size, falls within the specified
5848/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5849static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5850 unsigned Size, int Low, int Step = 1) {
5851 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5852 if (!isUndefOrEqual(Mask[i], Low))
5853 return false;
5854 return true;
5855}
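
// Illustrative sketch (editorial, not part of the original source): the
// sentinel-based mask predicates above, applied to a plain std::vector<int>.
// The sentinel values (-1 for undef, -2 for zero) are assumed here for the
// example; the real constants are defined elsewhere in this file.
#include <cstdio>
#include <vector>

constexpr int kUndef = -1;  // stands in for SM_SentinelUndef
constexpr int kZero  = -2;  // stands in for SM_SentinelZero

static bool sequentialOrUndef(const std::vector<int> &Mask, unsigned Pos,
                              unsigned Size, int Low, int Step = 1) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
    if (Mask[i] != kUndef && Mask[i] != Low)
      return false;
  return true;
}

int main() {
  // <0, u, 2, 3> matches the identity sequence 0,1,2,3 (undef is a wildcard).
  std::printf("%d\n", sequentialOrUndef({0, kUndef, 2, 3}, 0, 4, /*Low=*/0));
  // <0, 2, u, 3> does not: element 1 should have been 1.
  std::printf("%d\n", sequentialOrUndef({0, 2, kUndef, 3}, 0, 4, /*Low=*/0));
  return 0;
}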
5856
5857/// Return true if every element in Mask, beginning
5858/// from position Pos and ending in Pos+Size, falls within the specified
5859/// sequential range [Low, Low+Size), or is undef or zero.
5860static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5861 unsigned Size, int Low,
5862 int Step = 1) {
5863 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5864 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5865 return false;
5866 return true;
5867}
5868
5869/// Return true if every element in Mask, beginning
5870/// from position Pos and ending in Pos+Size is undef or is zero.
5871static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5872 unsigned Size) {
5873 return llvm::all_of(Mask.slice(Pos, Size),
5874 [](int M) { return isUndefOrZero(M); });
5875}
5876
5877/// Helper function to test whether a shuffle mask could be
5878/// simplified by widening the elements being shuffled.
5879///
5880/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5881/// leaves it in an unspecified state.
5882///
5883/// NOTE: This must handle normal vector shuffle masks and *target* vector
5884/// shuffle masks. The latter have the special property of a '-2' representing
5885/// a zero-ed lane of a vector.
5886static bool canWidenShuffleElements(ArrayRef<int> Mask,
5887 SmallVectorImpl<int> &WidenedMask) {
5888 WidenedMask.assign(Mask.size() / 2, 0);
5889 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5890 int M0 = Mask[i];
5891 int M1 = Mask[i + 1];
5892
5893    // If both elements are undef, it's trivial.
5894 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5895 WidenedMask[i / 2] = SM_SentinelUndef;
5896 continue;
5897 }
5898
5899 // Check for an undef mask and a mask value properly aligned to fit with
5900 // a pair of values. If we find such a case, use the non-undef mask's value.
5901 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5902 WidenedMask[i / 2] = M1 / 2;
5903 continue;
5904 }
5905 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5906 WidenedMask[i / 2] = M0 / 2;
5907 continue;
5908 }
5909
5910 // When zeroing, we need to spread the zeroing across both lanes to widen.
5911 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5912 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5913 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5914 WidenedMask[i / 2] = SM_SentinelZero;
5915 continue;
5916 }
5917 return false;
5918 }
5919
5920 // Finally check if the two mask values are adjacent and aligned with
5921 // a pair.
5922 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5923 WidenedMask[i / 2] = M0 / 2;
5924 continue;
5925 }
5926
5927 // Otherwise we can't safely widen the elements used in this shuffle.
5928 return false;
5929 }
5930  assert(WidenedMask.size() == Mask.size() / 2 &&
5931         "Incorrect size of mask after widening the elements!");
5932
5933 return true;
5934}
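
// Illustrative sketch (editorial, not part of the original source): the pairing
// rules above on plain ints, with -1 standing in for the undef sentinel and -2
// for the zero sentinel (values assumed for this example).
#include <cstdio>
#include <vector>

static bool widenMask(const std::vector<int> &M, std::vector<int> &Wide) {
  Wide.assign(M.size() / 2, 0);
  for (size_t i = 0; i + 1 < M.size(); i += 2) {
    int M0 = M[i], M1 = M[i + 1];
    if (M0 == -1 && M1 == -1)                 { Wide[i / 2] = -1;     continue; }
    if (M0 == -1 && M1 >= 0 && (M1 % 2) == 1) { Wide[i / 2] = M1 / 2; continue; }
    if (M1 == -1 && M0 >= 0 && (M0 % 2) == 0) { Wide[i / 2] = M0 / 2; continue; }
    if (M0 == -2 || M1 == -2) {               // zeroing must cover both halves
      if (M0 < 0 && M1 < 0) { Wide[i / 2] = -2; continue; }
      return false;
    }
    if ((M0 % 2) == 0 && M1 == M0 + 1)        { Wide[i / 2] = M0 / 2; continue; }
    return false;                             // not an aligned adjacent pair
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  std::printf("%d\n", widenMask({0, 1, 6, 7}, Wide)); // ok: widens to <0, 3>
  std::printf("%d\n", widenMask({0, 2, 4, 6}, Wide)); // fails: lanes not paired
  return 0;
}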
5935
5936static bool canWidenShuffleElements(ArrayRef<int> Mask,
5937 const APInt &Zeroable,
5938 bool V2IsZero,
5939 SmallVectorImpl<int> &WidenedMask) {
5940 // Create an alternative mask with info about zeroable elements.
5941 // Here we do not set undef elements as zeroable.
5942 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5943 if (V2IsZero) {
5944    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5945 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5946 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5947 ZeroableMask[i] = SM_SentinelZero;
5948 }
5949 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5950}
5951
5952static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5953 SmallVector<int, 32> WidenedMask;
5954 return canWidenShuffleElements(Mask, WidenedMask);
5955}
5956
5957// Attempt to narrow/widen shuffle mask until it matches the target number of
5958// elements.
5959static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5960 SmallVectorImpl<int> &ScaledMask) {
5961 unsigned NumSrcElts = Mask.size();
5962  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
5963         "Illegal shuffle scale factor");
5964
5965 // Narrowing is guaranteed to work.
5966 if (NumDstElts >= NumSrcElts) {
5967 int Scale = NumDstElts / NumSrcElts;
5968 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5969 return true;
5970 }
5971
5972 // We have to repeat the widening until we reach the target size, but we can
5973 // split out the first widening as it sets up ScaledMask for us.
5974 if (canWidenShuffleElements(Mask, ScaledMask)) {
5975 while (ScaledMask.size() > NumDstElts) {
5976 SmallVector<int, 16> WidenedMask;
5977 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5978 return false;
5979 ScaledMask = std::move(WidenedMask);
5980 }
5981 return true;
5982 }
5983
5984 return false;
5985}
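
// Illustrative sketch (editorial, not part of the original source): the
// "narrowing" direction used above. Each wide mask element M expands into
// Scale consecutive narrow elements M*Scale .. M*Scale+Scale-1, while negative
// sentinels are simply repeated; that is the behaviour assumed here for
// llvm::narrowShuffleMaskElts, restated over std::vector.
#include <vector>

static std::vector<int> narrowMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  Out.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int j = 0; j != Scale; ++j)
      Out.push_back(M < 0 ? M : M * Scale + j);
  return Out;
}

// narrowMask(2, {1, 0})  == {2, 3, 0, 1}    (a v2i64 swap viewed as v4i32)
// narrowMask(2, {-1, 1}) == {-1, -1, 2, 3}  (undef lanes stay undef)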
5986
5987/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5988bool X86::isZeroNode(SDValue Elt) {
5989 return isNullConstant(Elt) || isNullFPConstant(Elt);
5990}
5991
5992// Build a vector of constants.
5993// Use an UNDEF node if MaskElt == -1.
5994// Split 64-bit constants in the 32-bit mode.
5995static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5996 const SDLoc &dl, bool IsMask = false) {
5997
5998 SmallVector<SDValue, 32> Ops;
5999 bool Split = false;
6000
6001 MVT ConstVecVT = VT;
6002 unsigned NumElts = VT.getVectorNumElements();
6003 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6004 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6005 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6006 Split = true;
6007 }
6008
6009 MVT EltVT = ConstVecVT.getVectorElementType();
6010 for (unsigned i = 0; i < NumElts; ++i) {
6011 bool IsUndef = Values[i] < 0 && IsMask;
6012 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6013 DAG.getConstant(Values[i], dl, EltVT);
6014 Ops.push_back(OpNode);
6015 if (Split)
6016 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6017 DAG.getConstant(0, dl, EltVT));
6018 }
6019 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6020 if (Split)
6021 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6022 return ConstsNode;
6023}
6024
6025static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6026 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6027  assert(Bits.size() == Undefs.getBitWidth() &&
6028         "Unequal constant and undef arrays");
6029 SmallVector<SDValue, 32> Ops;
6030 bool Split = false;
6031
6032 MVT ConstVecVT = VT;
6033 unsigned NumElts = VT.getVectorNumElements();
6034 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6035 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6036 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6037 Split = true;
6038 }
6039
6040 MVT EltVT = ConstVecVT.getVectorElementType();
6041 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6042 if (Undefs[i]) {
6043 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6044 continue;
6045 }
6046 const APInt &V = Bits[i];
6047    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6048 if (Split) {
6049 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6050 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6051 } else if (EltVT == MVT::f32) {
6052 APFloat FV(APFloat::IEEEsingle(), V);
6053 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6054 } else if (EltVT == MVT::f64) {
6055 APFloat FV(APFloat::IEEEdouble(), V);
6056 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6057 } else {
6058 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6059 }
6060 }
6061
6062 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6063 return DAG.getBitcast(VT, ConstsNode);
6064}
6065
6066/// Returns a vector of specified type with all zero elements.
6067static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6068 SelectionDAG &DAG, const SDLoc &dl) {
6069  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6070          VT.getVectorElementType() == MVT::i1) &&
6071         "Unexpected vector type");
6072
6073 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6074 // type. This ensures they get CSE'd. But if the integer type is not
6075 // available, use a floating-point +0.0 instead.
6076 SDValue Vec;
6077 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6078 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6079 } else if (VT.isFloatingPoint()) {
6080 Vec = DAG.getConstantFP(+0.0, dl, VT);
6081 } else if (VT.getVectorElementType() == MVT::i1) {
6082    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6083           "Unexpected vector type");
6084 Vec = DAG.getConstant(0, dl, VT);
6085 } else {
6086 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6087 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6088 }
6089 return DAG.getBitcast(VT, Vec);
6090}
6091
6092static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6093 const SDLoc &dl, unsigned vectorWidth) {
6094 EVT VT = Vec.getValueType();
6095 EVT ElVT = VT.getVectorElementType();
6096 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6097 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6098 VT.getVectorNumElements() / Factor);
6099
6100 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6101 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6102  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6103
6104 // This is the index of the first element of the vectorWidth-bit chunk
6105  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6106 IdxVal &= ~(ElemsPerChunk - 1);
6107
6108 // If the input is a buildvector just emit a smaller one.
6109 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6110 return DAG.getBuildVector(ResultVT, dl,
6111 Vec->ops().slice(IdxVal, ElemsPerChunk));
6112
6113 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6114 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6115}
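
// Illustrative sketch (editorial, not part of the original source): the
// index-alignment trick used above. Because ElemsPerChunk is a power of two,
// clearing its low bits rounds an element index down to the first element of
// the 128-bit chunk that contains it.
constexpr unsigned alignToChunk(unsigned IdxVal, unsigned ElemsPerChunk) {
  return IdxVal & ~(ElemsPerChunk - 1);   // valid only when ElemsPerChunk == 2^k
}

// A v8f32 extracted in 128-bit (4 x f32) chunks:
static_assert(alignToChunk(5, 4) == 4, "element 5 lives in the chunk at index 4");
static_assert(alignToChunk(3, 4) == 0, "element 3 lives in the low chunk");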
6116
6117/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6118/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6119/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6120/// instructions or a simple subregister reference. Idx is an index in the
6121/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6122/// lowering EXTRACT_VECTOR_ELT operations easier.
6123static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6124 SelectionDAG &DAG, const SDLoc &dl) {
6125  assert((Vec.getValueType().is256BitVector() ||
6126          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6127 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6128}
6129
6130/// Generate a DAG to grab 256-bits from a 512-bit vector.
6131static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6132 SelectionDAG &DAG, const SDLoc &dl) {
6133  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6134 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6135}
6136
6137static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6138 SelectionDAG &DAG, const SDLoc &dl,
6139 unsigned vectorWidth) {
6140  assert((vectorWidth == 128 || vectorWidth == 256) &&
6141         "Unsupported vector width");
6142  // Inserting an UNDEF subvector leaves Result unchanged.
6143 if (Vec.isUndef())
6144 return Result;
6145 EVT VT = Vec.getValueType();
6146 EVT ElVT = VT.getVectorElementType();
6147 EVT ResultVT = Result.getValueType();
6148
6149 // Insert the relevant vectorWidth bits.
6150 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6151  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6152
6153 // This is the index of the first element of the vectorWidth-bit chunk
6154  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6155 IdxVal &= ~(ElemsPerChunk - 1);
6156
6157 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6158 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6159}
6160
6161/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6162/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6163/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6164/// simple superregister reference. Idx is an index in the 128 bits
6165/// we want. It need not be aligned to a 128-bit boundary. That makes
6166/// lowering INSERT_VECTOR_ELT operations easier.
6167static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6168 SelectionDAG &DAG, const SDLoc &dl) {
6169  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6170 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6171}
6172
6173/// Widen a vector to a larger size with the same scalar type, with the new
6174/// elements either zero or undef.
6175static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6176 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6177 const SDLoc &dl) {
6178  assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
6179         Vec.getValueType().getScalarType() == VT.getScalarType() &&
6180         "Unsupported vector widening type");
6181 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6182 : DAG.getUNDEF(VT);
6183 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6184 DAG.getIntPtrConstant(0, dl));
6185}
6186
6187/// Widen a vector to a larger size with the same scalar type, with the new
6188/// elements either zero or undef.
6189static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6190 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6191 const SDLoc &dl, unsigned WideSizeInBits) {
6192  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6193         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6194         "Unsupported vector widening type");
6195 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6196 MVT SVT = Vec.getSimpleValueType().getScalarType();
6197 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6198 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6199}
6200
6201// Helper function to collect subvector ops that are concatenated together,
6202// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
6203// The subvectors in Ops are guaranteed to be the same type.
6204static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
6205  assert(Ops.empty() && "Expected an empty ops vector");
6206
6207 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6208 Ops.append(N->op_begin(), N->op_end());
6209 return true;
6210 }
6211
6212 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6213 SDValue Src = N->getOperand(0);
6214 SDValue Sub = N->getOperand(1);
6215 const APInt &Idx = N->getConstantOperandAPInt(2);
6216 EVT VT = Src.getValueType();
6217 EVT SubVT = Sub.getValueType();
6218
6219 // TODO - Handle more general insert_subvector chains.
6220 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6221 Idx == (VT.getVectorNumElements() / 2)) {
6222 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6223 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6224 Src.getOperand(1).getValueType() == SubVT &&
6225 isNullConstant(Src.getOperand(2))) {
6226 Ops.push_back(Src.getOperand(1));
6227 Ops.push_back(Sub);
6228 return true;
6229 }
6230 // insert_subvector(x, extract_subvector(x, lo), hi)
6231 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6232 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6233 Ops.append(2, Sub);
6234 return true;
6235 }
6236 }
6237 }
6238
6239 return false;
6240}
6241
6242static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6243 const SDLoc &dl) {
6244 EVT VT = Op.getValueType();
6245 unsigned NumElems = VT.getVectorNumElements();
6246 unsigned SizeInBits = VT.getSizeInBits();
6247  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6248         "Can't split odd sized vector");
6249
6250 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6251 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6252 return std::make_pair(Lo, Hi);
6253}
6254
6255// Split a unary integer op into 2 half-sized ops.
6256static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6257 EVT VT = Op.getValueType();
6258
6259 // Make sure we only try to split 256/512-bit types to avoid creating
6260 // narrow vectors.
6261  assert((Op.getOperand(0).getValueType().is256BitVector() ||
6262          Op.getOperand(0).getValueType().is512BitVector()) &&
6263         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6264  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6265         VT.getVectorNumElements() &&
6266         "Unexpected VTs!");
6267
6268 SDLoc dl(Op);
6269
6270 // Extract the Lo/Hi vectors
6271 SDValue Lo, Hi;
6272 std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6273
6274 EVT LoVT, HiVT;
6275 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6276 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6277 DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6278 DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6279}
6280
6281/// Break a binary integer operation into 2 half sized ops and then
6282/// concatenate the result back.
6283static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6284 EVT VT = Op.getValueType();
6285
6286 // Sanity check that all the types match.
6287  assert(Op.getOperand(0).getValueType() == VT &&
6288         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6289  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6290
6291 SDLoc dl(Op);
6292
6293 // Extract the LHS Lo/Hi vectors
6294 SDValue LHS1, LHS2;
6295 std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6296
6297 // Extract the RHS Lo/Hi vectors
6298 SDValue RHS1, RHS2;
6299 std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6300
6301 EVT LoVT, HiVT;
6302 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6303 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6304 DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6305 DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6306}
6307
6308// Helper for splitting the operands of an operation to a legal target size
6309// and applying a function to each part.
6310// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6311// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6312// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6313// The argument Builder is a function that will be applied on each split part:
6314// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6315template <typename F>
6316SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6317 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6318 F Builder, bool CheckBWI = true) {
6319  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6320 unsigned NumSubs = 1;
6321 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6322 (!CheckBWI && Subtarget.useAVX512Regs())) {
6323 if (VT.getSizeInBits() > 512) {
6324 NumSubs = VT.getSizeInBits() / 512;
6325      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6326 }
6327 } else if (Subtarget.hasAVX2()) {
6328 if (VT.getSizeInBits() > 256) {
6329 NumSubs = VT.getSizeInBits() / 256;
6330      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6331 }
6332 } else {
6333 if (VT.getSizeInBits() > 128) {
6334 NumSubs = VT.getSizeInBits() / 128;
6335      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6336 }
6337 }
6338
6339 if (NumSubs == 1)
6340 return Builder(DAG, DL, Ops);
6341
6342 SmallVector<SDValue, 4> Subs;
6343 for (unsigned i = 0; i != NumSubs; ++i) {
6344 SmallVector<SDValue, 2> SubOps;
6345 for (SDValue Op : Ops) {
6346 EVT OpVT = Op.getValueType();
6347 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6348 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6349 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6350 }
6351 Subs.push_back(Builder(DAG, DL, SubOps));
6352 }
6353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6354}
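
// Illustrative sketch (editorial, not part of the original source): the
// split/apply/concat pattern above, shown on plain std::vector "registers".
// LegalWidth plays the role of the widest vector width the subtarget supports;
// all names are invented for the example, and the input size is assumed to be
// a multiple of LegalWidth (the real code asserts the equivalent).
#include <cstddef>
#include <functional>
#include <vector>

using Vec = std::vector<int>;

static Vec splitAndApply(const Vec &A, const Vec &B, std::size_t LegalWidth,
                         const std::function<Vec(const Vec &, const Vec &)> &Builder) {
  std::size_t NumSubs = A.size() > LegalWidth ? A.size() / LegalWidth : 1;
  std::size_t SubSize = A.size() / NumSubs;
  Vec Result;
  for (std::size_t i = 0; i != NumSubs; ++i) {
    Vec SubA(A.begin() + i * SubSize, A.begin() + (i + 1) * SubSize);
    Vec SubB(B.begin() + i * SubSize, B.begin() + (i + 1) * SubSize);
    Vec Part = Builder(SubA, SubB);   // the "legal width" operation
    Result.insert(Result.end(), Part.begin(), Part.end());
  }
  return Result;                      // concatenation of the per-part results
}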
6355
6356/// Insert an i1 subvector into an i1 vector.
6357static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6358 const X86Subtarget &Subtarget) {
6359
6360 SDLoc dl(Op);
6361 SDValue Vec = Op.getOperand(0);
6362 SDValue SubVec = Op.getOperand(1);
6363 SDValue Idx = Op.getOperand(2);
6364 unsigned IdxVal = Op.getConstantOperandVal(2);
6365
6366 // Inserting undef is a nop. We can just return the original vector.
6367 if (SubVec.isUndef())
6368 return Vec;
6369
6370 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6371 return Op;
6372
6373 MVT OpVT = Op.getSimpleValueType();
6374 unsigned NumElems = OpVT.getVectorNumElements();
6375 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6376
6377 // Extend to natively supported kshift.
6378 MVT WideOpVT = OpVT;
6379 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6380 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6381
6382 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6383 // if necessary.
6384 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6385 // May need to promote to a legal type.
6386 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6387 DAG.getConstant(0, dl, WideOpVT),
6388 SubVec, Idx);
6389 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6390 }
6391
6392 MVT SubVecVT = SubVec.getSimpleValueType();
6393 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6394  assert(IdxVal + SubVecNumElems <= NumElems &&
6395         IdxVal % SubVecVT.getSizeInBits() == 0 &&
6396         "Unexpected index value in INSERT_SUBVECTOR");
6397
6398 SDValue Undef = DAG.getUNDEF(WideOpVT);
6399
6400 if (IdxVal == 0) {
6401 // Zero lower bits of the Vec
6402 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6403 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6404 ZeroIdx);
6405 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6406 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6407 // Merge them together, SubVec should be zero extended.
6408 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6409 DAG.getConstant(0, dl, WideOpVT),
6410 SubVec, ZeroIdx);
6411 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6412 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6413 }
6414
6415 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6416 Undef, SubVec, ZeroIdx);
6417
6418 if (Vec.isUndef()) {
6419    assert(IdxVal != 0 && "Unexpected index");
6420 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6421 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6422 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6423 }
6424
6425 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6426    assert(IdxVal != 0 && "Unexpected index");
6427 // If upper elements of Vec are known undef, then just shift into place.
6428 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6429 [](SDValue V) { return V.isUndef(); })) {
6430 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6431 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6432 } else {
6433 NumElems = WideOpVT.getVectorNumElements();
6434 unsigned ShiftLeft = NumElems - SubVecNumElems;
6435 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6436 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6437 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6438 if (ShiftRight != 0)
6439 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6440 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6441 }
6442 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6443 }
6444
6445 // Simple case when we put subvector in the upper part
6446 if (IdxVal + SubVecNumElems == NumElems) {
6447 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6448 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6449 if (SubVecNumElems * 2 == NumElems) {
6450 // Special case, use legal zero extending insert_subvector. This allows
6451 // isel to optimize when bits are known zero.
6452 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6453 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6454 DAG.getConstant(0, dl, WideOpVT),
6455 Vec, ZeroIdx);
6456 } else {
6457 // Otherwise use explicit shifts to zero the bits.
6458 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6459 Undef, Vec, ZeroIdx);
6460 NumElems = WideOpVT.getVectorNumElements();
6461 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6462 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6463 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6464 }
6465 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6466 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6467 }
6468
6469 // Inserting into the middle is more complicated.
6470
6471 NumElems = WideOpVT.getVectorNumElements();
6472
6473 // Widen the vector if needed.
6474 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6475
6476 unsigned ShiftLeft = NumElems - SubVecNumElems;
6477 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6478
6479  // Do an optimization for the most frequently used types.
6480 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6481 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6482 Mask0.flipAllBits();
6483 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6484 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6485 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6486 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6487 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6488 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6489 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6490 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6491
6492 // Reduce to original width if needed.
6493 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6494 }
6495
6496 // Clear the upper bits of the subvector and move it to its insert position.
6497 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6498 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6499 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6500 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6501
6502 // Isolate the bits below the insertion point.
6503 unsigned LowShift = NumElems - IdxVal;
6504 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6505 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6506 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6507 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6508
6509 // Isolate the bits after the last inserted bit.
6510 unsigned HighShift = IdxVal + SubVecNumElems;
6511 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6512 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6513 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6514 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6515
6516 // Now OR all 3 pieces together.
6517 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6518 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6519
6520 // Reduce to original width if needed.
6521 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6522}
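
// Illustrative sketch (editorial, not part of the original source): the
// shift/OR dance above, restated on a plain 64-bit mask. Inserting a
// SubBits-wide field at position Idx means clearing that window in the old
// mask, moving the new bits into position, then ORing the pieces together.
// Assumes 0 < SubBits < 64.
#include <cstdint>

constexpr uint64_t insertBitField(uint64_t Vec, uint64_t Sub, unsigned Idx,
                                  unsigned SubBits) {
  return (Vec & ~(((uint64_t(1) << SubBits) - 1) << Idx)) |   // punch the hole
         ((Sub & ((uint64_t(1) << SubBits) - 1)) << Idx);     // slot the bits in
}

// Replacing bits [3:2] of 0b11111111 with 0b01 gives 0b11110111:
static_assert(insertBitField(0xFF, 0b01, /*Idx=*/2, /*SubBits=*/2) == 0xF7,
              "KSHIFTL/KSHIFTR plus OR performs the same job on k-registers");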
6523
6524static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6525 const SDLoc &dl) {
6526  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6527 EVT SubVT = V1.getValueType();
6528 EVT SubSVT = SubVT.getScalarType();
6529 unsigned SubNumElts = SubVT.getVectorNumElements();
6530 unsigned SubVectorWidth = SubVT.getSizeInBits();
6531 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6532 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6533 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6534}
6535
6536/// Returns a vector of specified type with all bits set.
6537/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6538/// Then bitcast to their original type, ensuring they get CSE'd.
6539static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6540  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6541         "Expected a 128/256/512-bit vector type");
6542
6543 APInt Ones = APInt::getAllOnesValue(32);
6544 unsigned NumElts = VT.getSizeInBits() / 32;
6545 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6546 return DAG.getBitcast(VT, Vec);
6547}
6548
6549// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6550static unsigned getOpcode_EXTEND(unsigned Opcode) {
6551 switch (Opcode) {
6552 case ISD::ANY_EXTEND:
6553 case ISD::ANY_EXTEND_VECTOR_INREG:
6554 return ISD::ANY_EXTEND;
6555 case ISD::ZERO_EXTEND:
6556 case ISD::ZERO_EXTEND_VECTOR_INREG:
6557 return ISD::ZERO_EXTEND;
6558 case ISD::SIGN_EXTEND:
6559 case ISD::SIGN_EXTEND_VECTOR_INREG:
6560 return ISD::SIGN_EXTEND;
6561 }
6562  llvm_unreachable("Unknown opcode");
6563}
6564
6565// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6566static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6567 switch (Opcode) {
6568 case ISD::ANY_EXTEND:
6569 case ISD::ANY_EXTEND_VECTOR_INREG:
6570 return ISD::ANY_EXTEND_VECTOR_INREG;
6571 case ISD::ZERO_EXTEND:
6572 case ISD::ZERO_EXTEND_VECTOR_INREG:
6573 return ISD::ZERO_EXTEND_VECTOR_INREG;
6574 case ISD::SIGN_EXTEND:
6575 case ISD::SIGN_EXTEND_VECTOR_INREG:
6576 return ISD::SIGN_EXTEND_VECTOR_INREG;
6577 }
6578  llvm_unreachable("Unknown opcode");
6579}
6580
6581static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6582 SDValue In, SelectionDAG &DAG) {
6583 EVT InVT = In.getValueType();
6584  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6585  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6586          ISD::ZERO_EXTEND == Opcode) &&
6587         "Unknown extension opcode");
6588
6589 // For 256-bit vectors, we only need the lower (128-bit) input half.
6590 // For 512-bit vectors, we only need the lower input half or quarter.
6591 if (InVT.getSizeInBits() > 128) {
6592    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6593           "Expected VTs to be the same size!");
6594 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6595 In = extractSubVector(In, 0, DAG, DL,
6596 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6597 InVT = In.getValueType();
6598 }
6599
6600 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6601 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6602
6603 return DAG.getNode(Opcode, DL, VT, In);
6604}
6605
6606// Match (xor X, -1) -> X.
6607// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6608// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6609static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6610 V = peekThroughBitcasts(V);
6611 if (V.getOpcode() == ISD::XOR &&
6612 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6613 return V.getOperand(0);
6614 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6615 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6616 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6617 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6618 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6619 Not, V.getOperand(1));
6620 }
6621 }
6622 SmallVector<SDValue, 2> CatOps;
6623 if (collectConcatOps(V.getNode(), CatOps)) {
6624 for (SDValue &CatOp : CatOps) {
6625 SDValue NotCat = IsNOT(CatOp, DAG);
6626 if (!NotCat) return SDValue();
6627 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6628 }
6629 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6630 }
6631 return SDValue();
6632}
6633
6634void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6635 bool Lo, bool Unary) {
6636  assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6637         "Illegal vector type to unpack");
6638  assert(Mask.empty() && "Expected an empty shuffle mask vector");
6639 int NumElts = VT.getVectorNumElements();
6640 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6641 for (int i = 0; i < NumElts; ++i) {
6642 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6643 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6644 Pos += (Unary ? 0 : NumElts * (i % 2));
6645 Pos += (Lo ? 0 : NumEltsInLane / 2);
6646 Mask.push_back(Pos);
6647 }
6648}
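// Worked example (illustrative): for VT = v8i16 the masks produced are
//   Lo, binary: <0, 8, 1, 9, 2, 10, 3, 11>   (matches PUNPCKLWD)
//   Hi, binary: <4, 12, 5, 13, 6, 14, 7, 15> (matches PUNPCKHWD)
//   Lo, unary:  <0, 0, 1, 1, 2, 2, 3, 3>
// For wider vector types the same pattern repeats per 128-bit lane.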
6649
6650/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6651/// imposed by AVX and specific to the unary pattern. Example:
6652/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6653/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6654void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6655 bool Lo) {
6656 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6657 int NumElts = VT.getVectorNumElements();
6658 for (int i = 0; i < NumElts; ++i) {
6659 int Pos = i / 2;
6660 Pos += (Lo ? 0 : NumElts / 2);
6661 Mask.push_back(Pos);
6662 }
6663}
6664
6665/// Returns a vector_shuffle node for an unpackl operation.
6666static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6667 SDValue V1, SDValue V2) {
6668 SmallVector<int, 8> Mask;
6669 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6670 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6671}
6672
6673/// Returns a vector_shuffle node for an unpackh operation.
6674static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6675 SDValue V1, SDValue V2) {
6676 SmallVector<int, 8> Mask;
6677 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6678 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6679}
6680
6681 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
6682/// This produces a shuffle where the low element of V2 is swizzled into the
6683/// zero/undef vector, landing at element Idx.
6684/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6685static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6686 bool IsZero,
6687 const X86Subtarget &Subtarget,
6688 SelectionDAG &DAG) {
6689 MVT VT = V2.getSimpleValueType();
6690 SDValue V1 = IsZero
6691 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6692 int NumElems = VT.getVectorNumElements();
6693 SmallVector<int, 16> MaskVec(NumElems);
6694 for (int i = 0; i != NumElems; ++i)
6695 // If this is the insertion idx, put the low elt of V2 here.
6696 MaskVec[i] = (i == Idx) ? NumElems : i;
6697 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6698}
6699
6700static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6701 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6702 Ptr.getOpcode() == X86ISD::WrapperRIP)
6703 Ptr = Ptr.getOperand(0);
6704
6705 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6706 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6707 return nullptr;
6708
6709 return CNode->getConstVal();
6710}
6711
6712static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6713 if (!Load || !ISD::isNormalLoad(Load))
6714 return nullptr;
6715 return getTargetConstantFromBasePtr(Load->getBasePtr());
6716}
6717
6718static const Constant *getTargetConstantFromNode(SDValue Op) {
6719 Op = peekThroughBitcasts(Op);
6720 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6721}
6722
6723const Constant *
6724X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6725 assert(LD && "Unexpected null LoadSDNode");
6726 return getTargetConstantFromNode(LD);
6727}
6728
6729// Extract raw constant bits from constant pools.
6730static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6731 APInt &UndefElts,
6732 SmallVectorImpl<APInt> &EltBits,
6733 bool AllowWholeUndefs = true,
6734 bool AllowPartialUndefs = true) {
6735 assert(EltBits.empty() && "Expected an empty EltBits vector");
6736
6737 Op = peekThroughBitcasts(Op);
6738
6739 EVT VT = Op.getValueType();
6740 unsigned SizeInBits = VT.getSizeInBits();
6741 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6742 unsigned NumElts = SizeInBits / EltSizeInBits;
6743
6744 // Bitcast a source array of element bits to the target size.
6745 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6746 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6747 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6748 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6749 "Constant bit sizes don't match");
6750
6751 // Don't split if we don't allow undef bits.
6752 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6753 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6754 return false;
6755
6756 // If we're already the right size, don't bother bitcasting.
6757 if (NumSrcElts == NumElts) {
6758 UndefElts = UndefSrcElts;
6759 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6760 return true;
6761 }
6762
6763 // Extract all the undef/constant element data and pack into single bitsets.
6764 APInt UndefBits(SizeInBits, 0);
6765 APInt MaskBits(SizeInBits, 0);
6766
6767 for (unsigned i = 0; i != NumSrcElts; ++i) {
6768 unsigned BitOffset = i * SrcEltSizeInBits;
6769 if (UndefSrcElts[i])
6770 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6771 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6772 }
6773
6774 // Split the undef/constant single bitset data into the target elements.
6775 UndefElts = APInt(NumElts, 0);
6776 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6777
6778 for (unsigned i = 0; i != NumElts; ++i) {
6779 unsigned BitOffset = i * EltSizeInBits;
6780 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6781
6782 // Only treat an element as UNDEF if all bits are UNDEF.
6783 if (UndefEltBits.isAllOnesValue()) {
6784 if (!AllowWholeUndefs)
6785 return false;
6786 UndefElts.setBit(i);
6787 continue;
6788 }
6789
6790 // If only some bits are UNDEF then treat them as zero (or bail if not
6791 // supported).
6792 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6793 return false;
6794
6795 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6796 }
6797 return true;
6798 };
6799
6800 // Collect constant bits and insert into mask/undef bit masks.
6801 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6802 unsigned UndefBitIndex) {
6803 if (!Cst)
6804 return false;
6805 if (isa<UndefValue>(Cst)) {
6806 Undefs.setBit(UndefBitIndex);
6807 return true;
6808 }
6809 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6810 Mask = CInt->getValue();
6811 return true;
6812 }
6813 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6814 Mask = CFP->getValueAPF().bitcastToAPInt();
6815 return true;
6816 }
6817 return false;
6818 };
6819
6820 // Handle UNDEFs.
6821 if (Op.isUndef()) {
6822 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6823 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6824 return CastBitData(UndefSrcElts, SrcEltBits);
6825 }
6826
6827 // Extract scalar constant bits.
6828 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6829 APInt UndefSrcElts = APInt::getNullValue(1);
6830 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6831 return CastBitData(UndefSrcElts, SrcEltBits);
6832 }
6833 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6834 APInt UndefSrcElts = APInt::getNullValue(1);
6835 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6836 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6837 return CastBitData(UndefSrcElts, SrcEltBits);
6838 }
6839
6840 // Extract constant bits from build vector.
6841 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6842 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6843 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6844
6845 APInt UndefSrcElts(NumSrcElts, 0);
6846 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6847 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6848 const SDValue &Src = Op.getOperand(i);
6849 if (Src.isUndef()) {
6850 UndefSrcElts.setBit(i);
6851 continue;
6852 }
6853 auto *Cst = cast<ConstantSDNode>(Src);
6854 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6855 }
6856 return CastBitData(UndefSrcElts, SrcEltBits);
6857 }
6858 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6859 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6860 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6861
6862 APInt UndefSrcElts(NumSrcElts, 0);
6863 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6864 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6865 const SDValue &Src = Op.getOperand(i);
6866 if (Src.isUndef()) {
6867 UndefSrcElts.setBit(i);
6868 continue;
6869 }
6870 auto *Cst = cast<ConstantFPSDNode>(Src);
6871 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6872 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6873 }
6874 return CastBitData(UndefSrcElts, SrcEltBits);
6875 }
6876
6877 // Extract constant bits from constant pool vector.
6878 if (auto *Cst = getTargetConstantFromNode(Op)) {
6879 Type *CstTy = Cst->getType();
6880 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6881 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6882 return false;
6883
6884 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6885 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6886
6887 APInt UndefSrcElts(NumSrcElts, 0);
6888 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6889 for (unsigned i = 0; i != NumSrcElts; ++i)
6890 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6891 UndefSrcElts, i))
6892 return false;
6893
6894 return CastBitData(UndefSrcElts, SrcEltBits);
6895 }
6896
6897 // Extract constant bits from a broadcasted constant pool scalar.
6898 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6899 EltSizeInBits <= VT.getScalarSizeInBits()) {
6900 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6901 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6902 return false;
6903
6904 SDValue Ptr = MemIntr->getBasePtr();
6905 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6906 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6907 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6908
6909 APInt UndefSrcElts(NumSrcElts, 0);
6910 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6911 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6912 if (UndefSrcElts[0])
6913 UndefSrcElts.setBits(0, NumSrcElts);
6914 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6915 return CastBitData(UndefSrcElts, SrcEltBits);
6916 }
6917 }
6918 }
6919
6920 // Extract constant bits from a subvector broadcast.
6921 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6922 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6923 SDValue Ptr = MemIntr->getBasePtr();
6924 // The source constant may be larger than the subvector broadcast, so
6925 // ensure we extract the correct subvector constants.
6926 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6927 Type *CstTy = Cst->getType();
6928 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6929 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
6930 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
6931 (SizeInBits % SubVecSizeInBits) != 0)
6932 return false;
6933 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
6934 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
6935 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
6936 APInt UndefSubElts(NumSubElts, 0);
6937 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6938 APInt(CstEltSizeInBits, 0));
6939 for (unsigned i = 0; i != NumSubElts; ++i) {
6940 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6941 UndefSubElts, i))
6942 return false;
6943 for (unsigned j = 1; j != NumSubVecs; ++j)
6944 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6945 }
6946 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6947 UndefSubElts);
6948 return CastBitData(UndefSubElts, SubEltBits);
6949 }
6950 }
6951
6952 // Extract a rematerialized scalar constant insertion.
6953 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6954 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6955 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6956 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6957 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6958
6959 APInt UndefSrcElts(NumSrcElts, 0);
6960 SmallVector<APInt, 64> SrcEltBits;
6961 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6962 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6963 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6964 return CastBitData(UndefSrcElts, SrcEltBits);
6965 }
6966
6967 // Insert constant bits from a base and sub vector sources.
6968 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6969 // If we bitcast to larger elements we might lose track of undefs - don't
6970 // allow any, to be safe.
6971 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6972 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6973
6974 APInt UndefSrcElts, UndefSubElts;
6975 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6976 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6977 UndefSubElts, EltSubBits,
6978 AllowWholeUndefs && AllowUndefs,
6979 AllowPartialUndefs && AllowUndefs) &&
6980 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6981 UndefSrcElts, EltSrcBits,
6982 AllowWholeUndefs && AllowUndefs,
6983 AllowPartialUndefs && AllowUndefs)) {
6984 unsigned BaseIdx = Op.getConstantOperandVal(2);
6985 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6986 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6987 EltSrcBits[BaseIdx + i] = EltSubBits[i];
6988 return CastBitData(UndefSrcElts, EltSrcBits);
6989 }
6990 }
6991
6992 // Extract constant bits from a subvector's source.
6993 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6994 // TODO - support extract_subvector through bitcasts.
6995 if (EltSizeInBits != VT.getScalarSizeInBits())
6996 return false;
6997
6998 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6999 UndefElts, EltBits, AllowWholeUndefs,
7000 AllowPartialUndefs)) {
7001 EVT SrcVT = Op.getOperand(0).getValueType();
7002 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7003 unsigned NumSubElts = VT.getVectorNumElements();
7004 unsigned BaseIdx = Op.getConstantOperandVal(1);
7005 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7006 if ((BaseIdx + NumSubElts) != NumSrcElts)
7007 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7008 if (BaseIdx != 0)
7009 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7010 return true;
7011 }
7012 }
7013
7014 // Extract constant bits from shuffle node sources.
7015 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7016 // TODO - support shuffle through bitcasts.
7017 if (EltSizeInBits != VT.getScalarSizeInBits())
7018 return false;
7019
7020 ArrayRef<int> Mask = SVN->getMask();
7021 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7022 llvm::any_of(Mask, [](int M) { return M < 0; }))
7023 return false;
7024
7025 APInt UndefElts0, UndefElts1;
7026 SmallVector<APInt, 32> EltBits0, EltBits1;
7027 if (isAnyInRange(Mask, 0, NumElts) &&
7028 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7029 UndefElts0, EltBits0, AllowWholeUndefs,
7030 AllowPartialUndefs))
7031 return false;
7032 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7033 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7034 UndefElts1, EltBits1, AllowWholeUndefs,
7035 AllowPartialUndefs))
7036 return false;
7037
7038 UndefElts = APInt::getNullValue(NumElts);
7039 for (int i = 0; i != (int)NumElts; ++i) {
7040 int M = Mask[i];
7041 if (M < 0) {
7042 UndefElts.setBit(i);
7043 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
7044 } else if (M < (int)NumElts) {
7045 if (UndefElts0[M])
7046 UndefElts.setBit(i);
7047 EltBits.push_back(EltBits0[M]);
7048 } else {
7049 if (UndefElts1[M - NumElts])
7050 UndefElts.setBit(i);
7051 EltBits.push_back(EltBits1[M - NumElts]);
7052 }
7053 }
7054 return true;
7055 }
7056
7057 return false;
7058}
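// Worked example (illustrative): for a v2i64 constant build_vector
// <0x00000000FFFFFFFF, 0> queried with EltSizeInBits = 32, the 128 constant
// bits are repacked into NumElts = 4 lanes, giving
//   EltBits  = { 0xFFFFFFFF, 0x0, 0x0, 0x0 },  UndefElts = 0b0000.
// If the first i64 element were undef instead, splitting to i32 would mark
// two whole target elements undef (requires AllowWholeUndefs), whereas
// merging to a single 128-bit element would leave it only partially undef
// (requires AllowPartialUndefs, with the undef bits treated as zero).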
7059
7060namespace llvm {
7061namespace X86 {
7062bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7063 APInt UndefElts;
7064 SmallVector<APInt, 16> EltBits;
7065 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7066 UndefElts, EltBits, true,
7067 AllowPartialUndefs)) {
7068 int SplatIndex = -1;
7069 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7070 if (UndefElts[i])
7071 continue;
7072 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7073 SplatIndex = -1;
7074 break;
7075 }
7076 SplatIndex = i;
7077 }
7078 if (0 <= SplatIndex) {
7079 SplatVal = EltBits[SplatIndex];
7080 return true;
7081 }
7082 }
7083
7084 return false;
7085}
7086} // namespace X86
7087} // namespace llvm
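// Usage sketch (illustrative; the surrounding names are placeholders): a
// caller that only wants to act on a uniform constant operand could do:
//   APInt SplatVal;
//   if (X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/false) &&
//       SplatVal.isAllOnesValue()) {
//     // Op is a splat of -1 across all defined elements.
//   }
// Undef elements are skipped while locating the splat value, so a vector
// that is entirely undef (or has no agreeing defined element) returns false.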
7088
7089static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7090 unsigned MaskEltSizeInBits,
7091 SmallVectorImpl<uint64_t> &RawMask,
7092 APInt &UndefElts) {
7093 // Extract the raw target constant bits.
7094 SmallVector<APInt, 64> EltBits;
7095 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7096 EltBits, /* AllowWholeUndefs */ true,
7097 /* AllowPartialUndefs */ false))
7098 return false;
7099
7100 // Insert the extracted elements into the mask.
7101 for (const APInt &Elt : EltBits)
7102 RawMask.push_back(Elt.getZExtValue());
7103
7104 return true;
7105}
7106
7107/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7108/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7109/// Note: This ignores saturation, so inputs must be checked first.
7110static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7111 bool Unary, unsigned NumStages = 1) {
7112 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7113 unsigned NumElts = VT.getVectorNumElements();
7114 unsigned NumLanes = VT.getSizeInBits() / 128;
7115 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7116 unsigned Offset = Unary ? 0 : NumElts;
7117 unsigned Repetitions = 1u << (NumStages - 1);
7118 unsigned Increment = 1u << NumStages;
7119 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7120
7121 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7122 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7123 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7124 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7125 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7126 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7127 }
7128 }
7129}
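// Worked example (illustrative): for VT = v16i8 (a 128-bit PACKSSWB/PACKUSWB
// of two v8i16 inputs viewed as bytes) with NumStages = 1:
//   binary: <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>
//   unary:  <0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14>
// i.e. the even bytes of each input, which is exactly a truncation shuffle
// when saturation cannot occur. NumStages = 2 compacts by 4 instead of 2.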
7130
7131// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7132static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7133 APInt &DemandedLHS, APInt &DemandedRHS) {
7134 int NumLanes = VT.getSizeInBits() / 128;
7135 int NumElts = DemandedElts.getBitWidth();
7136 int NumInnerElts = NumElts / 2;
7137 int NumEltsPerLane = NumElts / NumLanes;
7138 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7139
7140 DemandedLHS = APInt::getNullValue(NumInnerElts);
7141 DemandedRHS = APInt::getNullValue(NumInnerElts);
7142
7143 // Map DemandedElts to the packed operands.
7144 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7145 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7146 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7147 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7148 if (DemandedElts[OuterIdx])
7149 DemandedLHS.setBit(InnerIdx);
7150 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7151 DemandedRHS.setBit(InnerIdx);
7152 }
7153 }
7154}
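// Worked example (illustrative): for a 256-bit PACK producing v32i8
// (NumLanes = 2, 16 output bytes per lane, 8 per operand per lane):
// demanded output element 20 sits in lane 1, position 4 (the low half of
// that lane), so DemandedLHS bit 12 is set; demanded output element 28
// (lane 1, position 12) maps to DemandedRHS bit 12.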
7155
7156// Split the demanded elts of a HADD/HSUB node between its operands.
7157static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7158 APInt &DemandedLHS, APInt &DemandedRHS) {
7159 int NumLanes = VT.getSizeInBits() / 128;
7160 int NumElts = DemandedElts.getBitWidth();
7161 int NumEltsPerLane = NumElts / NumLanes;
7162 int HalfEltsPerLane = NumEltsPerLane / 2;
7163
7164 DemandedLHS = APInt::getNullValue(NumElts);
7165 DemandedRHS = APInt::getNullValue(NumElts);
7166
7167 // Map DemandedElts to the horizontal operands.
7168 for (int Idx = 0; Idx != NumElts; ++Idx) {
7169 if (!DemandedElts[Idx])
7170 continue;
7171 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7172 int LocalIdx = Idx % NumEltsPerLane;
7173 if (LocalIdx < HalfEltsPerLane) {
7174 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7175 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7176 } else {
7177 LocalIdx -= HalfEltsPerLane;
7178 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7179 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7180 }
7181 }
7182}
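// Worked example (illustrative): for a 256-bit HADD of v8i32
// (NumEltsPerLane = 4, HalfEltsPerLane = 2): demanding output element 1
// (lane 0, low half) requires LHS elements 2 and 3, while demanding output
// element 6 (lane 1, high half) requires RHS elements 4 and 5.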
7183
7184/// Calculates the shuffle mask corresponding to the target-specific opcode.
7185/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7186/// operands in \p Ops, and returns true.
7187/// Sets \p IsUnary to true if only one source is used. Note that this will set
7188/// IsUnary for shuffles which use a single input multiple times, and in those
7189/// cases it will adjust the mask to only have indices within that single input.
7190/// It is an error to call this with non-empty Mask/Ops vectors.
7191static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7192 SmallVectorImpl<SDValue> &Ops,
7193 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7194 unsigned NumElems = VT.getVectorNumElements();
7195 unsigned MaskEltSize = VT.getScalarSizeInBits();
7196 SmallVector<uint64_t, 32> RawMask;
7197 APInt RawUndefs;
7198 uint64_t ImmN;
7199
7200 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7201 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7202
7203 IsUnary = false;
7204 bool IsFakeUnary = false;
7205 switch (N->getOpcode()) {
7206 case X86ISD::BLENDI:
7207 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7208 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7209 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7210 DecodeBLENDMask(NumElems, ImmN, Mask);
7211 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7212 break;
7213 case X86ISD::SHUFP:
7214 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7215 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7216 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7217 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7218 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7219 break;
7220 case X86ISD::INSERTPS:
7221 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7222 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7223 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7224 DecodeINSERTPSMask(ImmN, Mask);
7225 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7226 break;
7227 case X86ISD::EXTRQI:
7228 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7229 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7230 isa<ConstantSDNode>(N->getOperand(2))) {
7231 int BitLen = N->getConstantOperandVal(1);
7232 int BitIdx = N->getConstantOperandVal(2);
7233 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7234 IsUnary = true;
7235 }
7236 break;
7237 case X86ISD::INSERTQI:
7238 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7239 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7240 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7241 isa<ConstantSDNode>(N->getOperand(3))) {
7242 int BitLen = N->getConstantOperandVal(2);
7243 int BitIdx = N->getConstantOperandVal(3);
7244 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7245 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7246 }
7247 break;
7248 case X86ISD::UNPCKH:
7249 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7250 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7251 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7252 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7253 break;
7254 case X86ISD::UNPCKL:
7255 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7256 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7257 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7258 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7259 break;
7260 case X86ISD::MOVHLPS:
7261 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7262 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7263 DecodeMOVHLPSMask(NumElems, Mask);
7264 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7265 break;
7266 case X86ISD::MOVLHPS:
7267 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7268 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7269 DecodeMOVLHPSMask(NumElems, Mask);
7270 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7271 break;
7272 case X86ISD::VALIGN:
7273 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7274 "Only 32-bit and 64-bit elements are supported!");
7275 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7276 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7277 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7278 DecodeVALIGNMask(NumElems, ImmN, Mask);
7279 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7280 Ops.push_back(N->getOperand(1));
7281 Ops.push_back(N->getOperand(0));
7282 break;
7283 case X86ISD::PALIGNR:
7284 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7285 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7286 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7287 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7288 DecodePALIGNRMask(NumElems, ImmN, Mask);
7289 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7290 Ops.push_back(N->getOperand(1));
7291 Ops.push_back(N->getOperand(0));
7292 break;
7293 case X86ISD::VSHLDQ:
7294 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7295 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7296 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7297 DecodePSLLDQMask(NumElems, ImmN, Mask);
7298 IsUnary = true;
7299 break;
7300 case X86ISD::VSRLDQ:
7301 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7302 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7303 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7304 DecodePSRLDQMask(NumElems, ImmN, Mask);
7305 IsUnary = true;
7306 break;
7307 case X86ISD::PSHUFD:
7308 case X86ISD::VPERMILPI:
7309 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7310 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7311 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7312 IsUnary = true;
7313 break;
7314 case X86ISD::PSHUFHW:
7315 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7316 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7317 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7318 IsUnary = true;
7319 break;
7320 case X86ISD::PSHUFLW:
7321 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7322 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7323 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7324 IsUnary = true;
7325 break;
7326 case X86ISD::VZEXT_MOVL:
7327 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7328 DecodeZeroMoveLowMask(NumElems, Mask);
7329 IsUnary = true;
7330 break;
7331 case X86ISD::VBROADCAST:
7332 // We only decode broadcasts of same-sized vectors; peeking through to
7333 // extracted subvectors is likely to cause hasOneUse issues with
7334 // SimplifyDemandedBits etc.
7335 if (N->getOperand(0).getValueType() == VT) {
7336 DecodeVectorBroadcast(NumElems, Mask);
7337 IsUnary = true;
7338 break;
7339 }
7340 return false;
7341 case X86ISD::VPERMILPV: {
7342 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7343 IsUnary = true;
7344 SDValue MaskNode = N->getOperand(1);
7345 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7346 RawUndefs)) {
7347 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7348 break;
7349 }
7350 return false;
7351 }
7352 case X86ISD::PSHUFB: {
7353 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7354 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7355 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7356 IsUnary = true;
7357 SDValue MaskNode = N->getOperand(1);
7358 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7359 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7360 break;
7361 }
7362 return false;
7363 }
7364 case X86ISD::VPERMI:
7365 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7366 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7367 DecodeVPERMMask(NumElems, ImmN, Mask);
7368 IsUnary = true;
7369 break;
7370 case X86ISD::MOVSS:
7371 case X86ISD::MOVSD:
7372 case X86ISD::MOVSH:
7373 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7374 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7375 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7376 break;
7377 case X86ISD::VPERM2X128:
7378 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7379 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7380 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7381 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7382 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7383 break;
7384 case X86ISD::SHUF128:
7385 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7386 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7387 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7388 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7389 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7390 break;
7391 case X86ISD::MOVSLDUP:
7392 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7393 DecodeMOVSLDUPMask(NumElems, Mask);
7394 IsUnary = true;
7395 break;
7396 case X86ISD::MOVSHDUP:
7397 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7398 DecodeMOVSHDUPMask(NumElems, Mask);
7399 IsUnary = true;
7400 break;
7401 case X86ISD::MOVDDUP:
7402 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7403 DecodeMOVDDUPMask(NumElems, Mask);
7404 IsUnary = true;
7405 break;
7406 case X86ISD::VPERMIL2: {
7407 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7408 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7409 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7410 SDValue MaskNode = N->getOperand(2);
7411 SDValue CtrlNode = N->getOperand(3);
7412 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7413 unsigned CtrlImm = CtrlOp->getZExtValue();
7414 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7415 RawUndefs)) {
7416 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7417 Mask);
7418 break;
7419 }
7420 }
7421 return false;
7422 }
7423 case X86ISD::VPPERM: {
7424 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7425 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7426 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7427 SDValue MaskNode = N->getOperand(2);
7428 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7429 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7430 break;
7431 }
7432 return false;
7433 }
7434 case X86ISD::VPERMV: {
7435 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7436 IsUnary = true;
7437 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7438 Ops.push_back(N->getOperand(1));
7439 SDValue MaskNode = N->getOperand(0);
7440 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7441 RawUndefs)) {
7442 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7443 break;
7444 }
7445 return false;
7446 }
7447 case X86ISD::VPERMV3: {
7448 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7449 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7450 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7451 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7452 Ops.push_back(N->getOperand(0));
7453 Ops.push_back(N->getOperand(2));
7454 SDValue MaskNode = N->getOperand(1);
7455 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7456 RawUndefs)) {
7457 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7458 break;
7459 }
7460 return false;
7461 }
7462 default: llvm_unreachable("unknown target shuffle node");
7463 }
7464
7465 // Empty mask indicates the decode failed.
7466 if (Mask.empty())
7467 return false;
7468
7469 // Check if we're getting a shuffle mask with zero'd elements.
7470 if (!AllowSentinelZero && isAnyZero(Mask))
7471 return false;
7472
7473 // If we have a fake unary shuffle, the shuffle mask is spread across two
7474 // inputs that are actually the same node. Re-map the mask to always point
7475 // into the first input.
7476 if (IsFakeUnary)
7477 for (int &M : Mask)
7478 if (M >= (int)Mask.size())
7479 M -= Mask.size();
7480
7481 // If we didn't already add operands in the opcode-specific code, default to
7482 // adding 1 or 2 operands starting at 0.
7483 if (Ops.empty()) {
7484 Ops.push_back(N->getOperand(0));
7485 if (!IsUnary || IsFakeUnary)
7486 Ops.push_back(N->getOperand(1));
7487 }
7488
7489 return true;
7490}
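// Usage sketch (illustrative; N, Ops and ShuffleMask are placeholder names):
// decoding an X86ISD::UNPCKL node with VT = v4i32 might look like:
//   SmallVector<SDValue, 2> Ops;
//   SmallVector<int, 16> ShuffleMask;
//   bool IsUnary;
//   if (getTargetShuffleMask(N, MVT::v4i32, /*AllowSentinelZero=*/true, Ops,
//                            ShuffleMask, IsUnary)) {
//     // ShuffleMask == <0, 4, 1, 5>; Ops holds the two inputs, and when both
//     // operands are the same node the mask is remapped into the first input.
//   }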
7491
7492 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
7493static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7494 SmallVectorImpl<SDValue> &Ops,
7495 SmallVectorImpl<int> &Mask) {
7496 bool IsUnary;
7497 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7498}
7499
7500/// Compute whether each element of a shuffle is zeroable.
7501///
7502/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7503/// Either it is an undef element in the shuffle mask, the element of the input
7504/// referenced is undef, or the element of the input referenced is known to be
7505/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7506/// as many lanes with this technique as possible to simplify the remaining
7507/// shuffle.
7508static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7509 SDValue V1, SDValue V2,
7510 APInt &KnownUndef, APInt &KnownZero) {
7511 int Size = Mask.size();
7512 KnownUndef = KnownZero = APInt::getNullValue(Size);
7513
7514 V1 = peekThroughBitcasts(V1);
7515 V2 = peekThroughBitcasts(V2);
7516
7517 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7518 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7519
7520 int VectorSizeInBits = V1.getValueSizeInBits();
7521 int ScalarSizeInBits = VectorSizeInBits / Size;
7522 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7523
7524 for (int i = 0; i < Size; ++i) {
7525 int M = Mask[i];
7526 // Handle the easy cases.
7527 if (M < 0) {
7528 KnownUndef.setBit(i);
7529 continue;
7530 }
7531 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7532 KnownZero.setBit(i);
7533 continue;
7534 }
7535
7536 // Determine shuffle input and normalize the mask.
7537 SDValue V = M < Size ? V1 : V2;
7538 M %= Size;
7539
7540 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7541 if (V.getOpcode() != ISD::BUILD_VECTOR)
7542 continue;
7543
7544 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
7545 // the (larger) source element must be UNDEF/ZERO.
7546 if ((Size % V.getNumOperands()) == 0) {
7547 int Scale = Size / V->getNumOperands();
7548 SDValue Op = V.getOperand(M / Scale);
7549 if (Op.isUndef())
7550 KnownUndef.setBit(i);
7551 if (X86::isZeroNode(Op))
7552 KnownZero.setBit(i);
7553 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7554 APInt Val = Cst->getAPIntValue();
7555 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7556 if (Val == 0)
7557 KnownZero.setBit(i);
7558 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7559 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7560 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7561 if (Val == 0)
7562 KnownZero.setBit(i);
7563 }
7564 continue;
7565 }
7566
7567 // If the BUILD_VECTOR has more elements, then all the (smaller) source
7568 // elements must be UNDEF or ZERO.
7569 if ((V.getNumOperands() % Size) == 0) {
7570 int Scale = V->getNumOperands() / Size;
7571 bool AllUndef = true;
7572 bool AllZero = true;
7573 for (int j = 0; j < Scale; ++j) {
7574 SDValue Op = V.getOperand((M * Scale) + j);
7575 AllUndef &= Op.isUndef();
7576 AllZero &= X86::isZeroNode(Op);
7577 }
7578 if (AllUndef)
7579 KnownUndef.setBit(i);
7580 if (AllZero)
7581 KnownZero.setBit(i);
7582 continue;
7583 }
7584 }
7585}
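// Worked example (illustrative): for Mask = <0, -1, 4, 7> with
// V1 = build_vector <X, Y, Z, W> and V2 = zeroinitializer, element 1 is
// reported in KnownUndef (undef mask index) and elements 2 and 3 in
// KnownZero (they reference the all-zeros V2). Elements drawn from a
// BUILD_VECTOR operand that is itself undef or a zero constant are
// classified the same way, including across bitcasts of differing element
// widths as handled above.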
7586
7587/// Decode a target shuffle mask and inputs and see if any values are
7588/// known to be undef or zero from their inputs.
7589/// Returns true if the target shuffle mask was decoded.
7590/// FIXME: Merge this with computeZeroableShuffleElements?
7591static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7592 SmallVectorImpl<SDValue> &Ops,
7593 APInt &KnownUndef, APInt &KnownZero) {
7594 bool IsUnary;
7595 if (!isTargetShuffle(N.getOpcode()))
7596 return false;
7597
7598 MVT VT = N.getSimpleValueType();
7599 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7600 return false;
7601
7602 int Size = Mask.size();
7603 SDValue V1 = Ops[0];
7604 SDValue V2 = IsUnary ? V1 : Ops[1];
7605 KnownUndef = KnownZero = APInt::getNullValue(Size);
7606
7607 V1 = peekThroughBitcasts(V1);
7608 V2 = peekThroughBitcasts(V2);
7609
7610 assert((VT.getSizeInBits() % Size) == 0 &&
7611 "Illegal split of shuffle value type");
7612 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7613
7614 // Extract known constant input data.
7615 APInt UndefSrcElts[2];
7616 SmallVector<APInt, 32> SrcEltBits[2];
7617 bool IsSrcConstant[2] = {
7618 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7619 SrcEltBits[0], true, false),
7620 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7621 SrcEltBits[1], true, false)};
7622
7623 for (int i = 0; i < Size; ++i) {
7624 int M = Mask[i];
7625
7626 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7627 if (M < 0) {
7628 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7629 if (SM_SentinelUndef == M)
7630 KnownUndef.setBit(i);
7631 if (SM_SentinelZero == M)
7632 KnownZero.setBit(i);
7633 continue;
7634 }
7635
7636 // Determine shuffle input and normalize the mask.
7637 unsigned SrcIdx = M / Size;
7638 SDValue V = M < Size ? V1 : V2;
7639 M %= Size;
7640
7641 // We are referencing an UNDEF input.
7642 if (V.isUndef()) {
7643 KnownUndef.setBit(i);
7644 continue;
7645 }
7646
7647 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7648 // TODO: We currently only set UNDEF for integer types - floats use the same
7649 // registers as vectors and many of the scalar folded loads rely on the
7650 // SCALAR_TO_VECTOR pattern.
7651 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7652 (Size % V.getValueType().getVectorNumElements()) == 0) {
7653 int Scale = Size / V.getValueType().getVectorNumElements();
7654 int Idx = M / Scale;
7655 if (Idx != 0 && !VT.isFloatingPoint())
7656 KnownUndef.setBit(i);
7657 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7658 KnownZero.setBit(i);
7659 continue;
7660 }
7661
7662 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7663 // base vectors.
7664 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7665 SDValue Vec = V.getOperand(0);
7666 int NumVecElts = Vec.getValueType().getVectorNumElements();
7667 if (Vec.isUndef() && Size == NumVecElts) {
7668 int Idx = V.getConstantOperandVal(2);
7669 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7670 if (M < Idx || (Idx + NumSubElts) <= M)
7671 KnownUndef.setBit(i);
7672 }
7673 continue;
7674 }
7675
7676 // Attempt to extract from the source's constant bits.
7677 if (IsSrcConstant[SrcIdx]) {
7678 if (UndefSrcElts[SrcIdx][M])
7679 KnownUndef.setBit(i);
7680 else if (SrcEltBits[SrcIdx][M] == 0)
7681 KnownZero.setBit(i);
7682 }
7683 }
7684
7685 assert(VT.getVectorNumElements() == (unsigned)Size &&
7686 "Different mask size from vector size!");
7687 return true;
7688}
7689
7690// Replace target shuffle mask elements with known undef/zero sentinels.
7691static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7692 const APInt &KnownUndef,
7693 const APInt &KnownZero,
7694 bool ResolveKnownZeros = true) {
7695 unsigned NumElts = Mask.size();
7696 assert(KnownUndef.getBitWidth() == NumElts &&
7697 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7698
7699 for (unsigned i = 0; i != NumElts; ++i) {
7700 if (KnownUndef[i])
7701 Mask[i] = SM_SentinelUndef;
7702 else if (ResolveKnownZeros && KnownZero[i])
7703 Mask[i] = SM_SentinelZero;
7704 }
7705}
7706
7707// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7708static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7709 APInt &KnownUndef,
7710 APInt &KnownZero) {
7711 unsigned NumElts = Mask.size();
7712 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7713
7714 for (unsigned i = 0; i != NumElts; ++i) {
7715 int M = Mask[i];
7716 if (SM_SentinelUndef == M)
7717 KnownUndef.setBit(i);
7718 if (SM_SentinelZero == M)
7719 KnownZero.setBit(i);
7720 }
7721}
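// Worked example (illustrative): Mask = <0, SM_SentinelUndef,
// SM_SentinelZero, 3> yields KnownUndef = 0b0010 and KnownZero = 0b0100;
// resolveTargetShuffleFromZeroables above performs the inverse rewrite.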
7722
7723// Forward declaration (for getFauxShuffleMask recursive check).
7724// TODO: Use DemandedElts variant.
7725static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7726 SmallVectorImpl<int> &Mask,
7727 const SelectionDAG &DAG, unsigned Depth,
7728 bool ResolveKnownElts);
7729
7730// Attempt to decode ops that could be represented as a shuffle mask.
7731 // The decoded shuffle mask may contain a different number of elements than the
7732// destination value type.
7733static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7734 SmallVectorImpl<int> &Mask,
7735 SmallVectorImpl<SDValue> &Ops,
7736 const SelectionDAG &DAG, unsigned Depth,
7737 bool ResolveKnownElts) {
7738 Mask.clear();
7739 Ops.clear();
7740
7741 MVT VT = N.getSimpleValueType();
7742 unsigned NumElts = VT.getVectorNumElements();
7743 unsigned NumSizeInBits = VT.getSizeInBits();
7744 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7745 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7746 return false;
7747 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7748 unsigned NumSizeInBytes = NumSizeInBits / 8;
7749 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7750
7751 unsigned Opcode = N.getOpcode();
7752 switch (Opcode) {
7753 case ISD::VECTOR_SHUFFLE: {
7754 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7755 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7756 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7757 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7758 Ops.push_back(N.getOperand(0));
7759 Ops.push_back(N.getOperand(1));
7760 return true;
7761 }
7762 return false;
7763 }
7764 case ISD::AND:
7765 case X86ISD::ANDNP: {
7766 // Attempt to decode as a per-byte mask.
7767 APInt UndefElts;
7768 SmallVector<APInt, 32> EltBits;
7769 SDValue N0 = N.getOperand(0);
7770 SDValue N1 = N.getOperand(1);
7771 bool IsAndN = (X86ISD::ANDNP == Opcode);
7772 uint64_t ZeroMask = IsAndN ? 255 : 0;
7773 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7774 return false;
7775 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7776 if (UndefElts[i]) {
7777 Mask.push_back(SM_SentinelUndef);
7778 continue;
7779 }
7780 const APInt &ByteBits = EltBits[i];
7781 if (ByteBits != 0 && ByteBits != 255)
7782 return false;
7783 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7784 }
7785 Ops.push_back(IsAndN ? N1 : N0);
7786 return true;
7787 }
7788 case ISD::OR: {
7789 // Handle OR(SHUFFLE,SHUFFLE) cases where, per element, one source is zero
7790 // and the other provides a valid shuffle index.
7791 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7792 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7793 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7794 return false;
7795 SmallVector<int, 64> SrcMask0, SrcMask1;
7796 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7797 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7798 true) ||
7799 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7800 true))
7801 return false;
7802
7803 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7804 SmallVector<int, 64> Mask0, Mask1;
7805 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7806 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7807 for (int i = 0; i != (int)MaskSize; ++i) {
7808 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
7809 // loops converting between OR and BLEND shuffles due to
7810 // canWidenShuffleElements merging away undef elements, meaning we
7811 // fail to recognise the OR as the undef element isn't known zero.
7812 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7813 Mask.push_back(SM_SentinelZero);
7814 else if (Mask1[i] == SM_SentinelZero)
7815 Mask.push_back(i);
7816 else if (Mask0[i] == SM_SentinelZero)
7817 Mask.push_back(i + MaskSize);
7818 else
7819 return false;
7820 }
7821 Ops.push_back(N0);
7822 Ops.push_back(N1);
7823 return true;
7824 }
7825 case ISD::INSERT_SUBVECTOR: {
7826 SDValue Src = N.getOperand(0);
7827 SDValue Sub = N.getOperand(1);
7828 EVT SubVT = Sub.getValueType();
7829 unsigned NumSubElts = SubVT.getVectorNumElements();
7830 if (!N->isOnlyUserOf(Sub.getNode()))
7831 return false;
7832 uint64_t InsertIdx = N.getConstantOperandVal(2);
7833 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7834 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7835 Sub.getOperand(0).getValueType() == VT) {
7836 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7837 for (int i = 0; i != (int)NumElts; ++i)
7838 Mask.push_back(i);
7839 for (int i = 0; i != (int)NumSubElts; ++i)
7840 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7841 Ops.push_back(Src);
7842 Ops.push_back(Sub.getOperand(0));
7843 return true;
7844 }
7845 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7846 SmallVector<int, 64> SubMask;
7847 SmallVector<SDValue, 2> SubInputs;
7848 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7849 SubMask, DAG, Depth + 1, ResolveKnownElts))
7850 return false;
7851
7852 // Subvector shuffle inputs must not be larger than the subvector.
7853 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7854 return SubVT.getFixedSizeInBits() <
7855 SubInput.getValueSizeInBits().getFixedSize();
7856 }))
7857 return false;
7858
7859 if (SubMask.size() != NumSubElts) {
7860 assert(((SubMask.size() % NumSubElts) == 0 ||
7861 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7862 if ((NumSubElts % SubMask.size()) == 0) {
7863 int Scale = NumSubElts / SubMask.size();
7864 SmallVector<int,64> ScaledSubMask;
7865 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7866 SubMask = ScaledSubMask;
7867 } else {
7868 int Scale = SubMask.size() / NumSubElts;
7869 NumSubElts = SubMask.size();
7870 NumElts *= Scale;
7871 InsertIdx *= Scale;
7872 }
7873 }
7874 Ops.push_back(Src);
7875 Ops.append(SubInputs.begin(), SubInputs.end());
7876 if (ISD::isBuildVectorAllZeros(Src.getNode()))
7877 Mask.append(NumElts, SM_SentinelZero);
7878 else
7879 for (int i = 0; i != (int)NumElts; ++i)
7880 Mask.push_back(i);
7881 for (int i = 0; i != (int)NumSubElts; ++i) {
7882 int M = SubMask[i];
7883 if (0 <= M) {
7884 int InputIdx = M / NumSubElts;
7885 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7886 }
7887 Mask[i + InsertIdx] = M;
7888 }
7889 return true;
7890 }
7891 case X86ISD::PINSRB:
7892 case X86ISD::PINSRW:
7893 case ISD::SCALAR_TO_VECTOR:
7894 case ISD::INSERT_VECTOR_ELT: {
7895 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7896 // vector, for matching src/dst vector types.
7897 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7898
7899 unsigned DstIdx = 0;
7900 if (Opcode != ISD::SCALAR_TO_VECTOR) {
7901 // Check we have an in-range constant insertion index.
7902 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7903 N.getConstantOperandAPInt(2).uge(NumElts))
7904 return false;
7905 DstIdx = N.getConstantOperandVal(2);
7906
7907 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7908 if (X86::isZeroNode(Scl)) {
7909 Ops.push_back(N.getOperand(0));
7910 for (unsigned i = 0; i != NumElts; ++i)
7911 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7912 return true;
7913 }
7914 }
7915
7916 // Peek through trunc/aext/zext.
7917 // TODO: aext shouldn't require SM_SentinelZero padding.
7918 // TODO: handle shift of scalars.
7919 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7920 while (Scl.getOpcode() == ISD::TRUNCATE ||
7921 Scl.getOpcode() == ISD::ANY_EXTEND ||
7922 Scl.getOpcode() == ISD::ZERO_EXTEND) {
7923 Scl = Scl.getOperand(0);
7924 MinBitsPerElt =
7925 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7926 }
7927 if ((MinBitsPerElt % 8) != 0)
7928 return false;
7929
7930 // Attempt to find the source vector the scalar was extracted from.
7931 SDValue SrcExtract;
7932 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7933 Scl.getOpcode() == X86ISD::PEXTRW ||
7934 Scl.getOpcode() == X86ISD::PEXTRB) &&
7935 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7936 SrcExtract = Scl;
7937 }
7938 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7939 return false;
7940
7941 SDValue SrcVec = SrcExtract.getOperand(0);
7942 EVT SrcVT = SrcVec.getValueType();
7943 if (!SrcVT.getScalarType().isByteSized())
7944 return false;
7945 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7946 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7947 unsigned DstByte = DstIdx * NumBytesPerElt;
7948 MinBitsPerElt =
7949 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7950
7951 // Create 'identity' byte level shuffle mask and then add inserted bytes.
7952 if (Opcode == ISD::SCALAR_TO_VECTOR) {
7953 Ops.push_back(SrcVec);
7954 Mask.append(NumSizeInBytes, SM_SentinelUndef);
7955 } else {
7956 Ops.push_back(SrcVec);
7957 Ops.push_back(N.getOperand(0));
7958 for (int i = 0; i != (int)NumSizeInBytes; ++i)
7959 Mask.push_back(NumSizeInBytes + i);
7960 }
7961
7962 unsigned MinBytesPerElts = MinBitsPerElt / 8;
7963 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7964 for (unsigned i = 0; i != MinBytesPerElts; ++i)
7965 Mask[DstByte + i] = SrcByte + i;
7966 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7967 Mask[DstByte + i] = SM_SentinelZero;
7968 return true;
7969 }
7970 case X86ISD::PACKSS:
7971 case X86ISD::PACKUS: {
7972 SDValue N0 = N.getOperand(0);
7973 SDValue N1 = N.getOperand(1);
7974 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7975 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7976 "Unexpected input value type");
7977
7978 APInt EltsLHS, EltsRHS;
7979 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7980
7981 // If we know input saturation won't happen (or we don't care about particular
7982 // lanes), we can treat this as a truncation shuffle.
7983 bool Offset0 = false, Offset1 = false;
7984 if (Opcode == X86ISD::PACKSS) {
7985 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7986 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7987 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7988 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7989 return false;
7990 // We can't easily fold ASHR into a shuffle, but if it was feeding a
7991 // PACKSS then it was likely being used for sign-extension for a
7992 // truncation, so just peek through and adjust the mask accordingly.
7993 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7994 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7995 Offset0 = true;
7996 N0 = N0.getOperand(0);
7997 }
7998 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7999 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8000 Offset1 = true;
8001 N1 = N1.getOperand(0);
8002 }
8003 } else {
8004 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8005 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
8006 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8007 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
8008 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8009 return false;
8010 }
8011
8012 bool IsUnary = (N0 == N1);
8013
8014 Ops.push_back(N0);
8015 if (!IsUnary)
8016 Ops.push_back(N1);
8017
8018 createPackShuffleMask(VT, Mask, IsUnary);
8019
8020 if (Offset0 || Offset1) {
8021 for (int &M : Mask)
8022 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8023 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8024 ++M;
8025 }
8026 return true;
8027 }
8028 case X86ISD::VTRUNC: {
8029 SDValue Src = N.getOperand(0);
8030 EVT SrcVT = Src.getValueType();
8031 // Truncated source must be a simple vector.
8032 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8033 (SrcVT.getScalarSizeInBits() % 8) != 0)
8034 return false;
8035 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8036 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8037 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8038 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8039 for (unsigned i = 0; i != NumSrcElts; ++i)
8040 Mask.push_back(i * Scale);
8041 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8042 Ops.push_back(Src);
8043 return true;
8044 }
8045 case X86ISD::VSHLI:
8046 case X86ISD::VSRLI: {
8047 uint64_t ShiftVal = N.getConstantOperandVal(1);
8048 // Out of range bit shifts are guaranteed to be zero.
8049 if (NumBitsPerElt <= ShiftVal) {
8050 Mask.append(NumElts, SM_SentinelZero);
8051 return true;
8052 }
8053
8054 // We can only decode 'whole byte' bit shifts as shuffles.
8055 if ((ShiftVal % 8) != 0)
8056 break;
8057
8058 uint64_t ByteShift = ShiftVal / 8;
8059 Ops.push_back(N.getOperand(0));
8060
8061 // Clear mask to all zeros and insert the shifted byte indices.
8062 Mask.append(NumSizeInBytes, SM_SentinelZero);
8063
8064 if (X86ISD::VSHLI == Opcode) {
8065 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8066 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8067 Mask[i + j] = i + j - ByteShift;
8068 } else {
8069 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8070 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8071 Mask[i + j - ByteShift] = i + j;
8072 }
8073 return true;
8074 }
8075 case X86ISD::VROTLI:
8076 case X86ISD::VROTRI: {
8077 // We can only decode 'whole byte' bit rotates as shuffles.
8078 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8079 if ((RotateVal % 8) != 0)
8080 return false;
8081 Ops.push_back(N.getOperand(0));
8082 int Offset = RotateVal / 8;
8083 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8084 for (int i = 0; i != (int)NumElts; ++i) {
8085 int BaseIdx = i * NumBytesPerElt;
8086 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8087 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8088 }
8089 }
8090 return true;
8091 }
8092 case X86ISD::VBROADCAST: {
8093 SDValue Src = N.getOperand(0);
8094 if (!Src.getSimpleValueType().isVector()) {
8095 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8096 !isNullConstant(Src.getOperand(1)) ||
8097 Src.getOperand(0).getValueType().getScalarType() !=
8098 VT.getScalarType())
8099 return false;
8100 Src = Src.getOperand(0);
8101 }
8102 Ops.push_back(Src);
8103 Mask.append(NumElts, 0);
8104 return true;
8105 }
8106 case ISD::ZERO_EXTEND:
8107 case ISD::ANY_EXTEND:
8108 case ISD::ZERO_EXTEND_VECTOR_INREG:
8109 case ISD::ANY_EXTEND_VECTOR_INREG: {
8110 SDValue Src = N.getOperand(0);
8111 EVT SrcVT = Src.getValueType();
8112
8113 // Extended source must be a simple vector.
8114 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8115 (SrcVT.getScalarSizeInBits() % 8) != 0)
8116 return false;
8117
8118 bool IsAnyExtend =
8119 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8120 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8121 IsAnyExtend, Mask);
8122 Ops.push_back(Src);
8123 return true;
8124 }
8125 }
8126
8127 return false;
8128}
8129
8130/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8131static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8132 SmallVectorImpl<int> &Mask) {
8133 int MaskWidth = Mask.size();
8134 SmallVector<SDValue, 16> UsedInputs;
8135 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8136 int lo = UsedInputs.size() * MaskWidth;
8137 int hi = lo + MaskWidth;
8138
8139 // Strip UNDEF input usage.
8140 if (Inputs[i].isUndef())
8141 for (int &M : Mask)
8142 if ((lo <= M) && (M < hi))
8143 M = SM_SentinelUndef;
8144
8145 // Check for unused inputs.
8146 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8147 for (int &M : Mask)
8148 if (lo <= M)
8149 M -= MaskWidth;
8150 continue;
8151 }
8152
8153 // Check for repeated inputs.
8154 bool IsRepeat = false;
8155 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8156 if (UsedInputs[j] != Inputs[i])
8157 continue;
8158 for (int &M : Mask)
8159 if (lo <= M)
8160 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8161 IsRepeat = true;
8162 break;
8163 }
8164 if (IsRepeat)
8165 continue;
8166
8167 UsedInputs.push_back(Inputs[i]);
8168 }
8169 Inputs = UsedInputs;
8170}
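// Worked example (hypothetical inputs, for illustration only): with
// Inputs = {A, B, A} and a mask width of 4, a mask {0, 9, 8, SM_SentinelUndef}
// never references B, so B is dropped and indices >= 4 shift down by 4,
// giving {0, 5, 4, SM_SentinelUndef}; the second copy of A is then detected
// as a repeat and remapped onto the first, leaving Inputs = {A} and
// Mask = {0, 1, 0, SM_SentinelUndef}.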
8171
8172/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8173/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8174/// Returns true if the target shuffle mask was decoded.
8175static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8176 SmallVectorImpl<SDValue> &Inputs,
8177 SmallVectorImpl<int> &Mask,
8178 APInt &KnownUndef, APInt &KnownZero,
8179 const SelectionDAG &DAG, unsigned Depth,
8180 bool ResolveKnownElts) {
8181 EVT VT = Op.getValueType();
8182 if (!VT.isSimple() || !VT.isVector())
8183 return false;
8184
8185 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8186 if (ResolveKnownElts)
8187 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8188 return true;
8189 }
8190 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8191 ResolveKnownElts)) {
8192 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8193 return true;
8194 }
8195 return false;
8196}
8197
8198static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8199 SmallVectorImpl<int> &Mask,
8200 const SelectionDAG &DAG, unsigned Depth = 0,
8201 bool ResolveKnownElts = true) {
8202 EVT VT = Op.getValueType();
8203 if (!VT.isSimple() || !VT.isVector())
8204 return false;
8205
8206 APInt KnownUndef, KnownZero;
8207 unsigned NumElts = Op.getValueType().getVectorNumElements();
8208 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
8209 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8210 KnownZero, DAG, Depth, ResolveKnownElts);
8211}
8212
8213// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8214static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8215 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8216 SelectionDAG &DAG) {
8217 assert((Opcode == X86ISD::VBROADCAST_LOAD ||(static_cast<void> (0))
8218 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&(static_cast<void> (0))
8219 "Unknown broadcast load type")(static_cast<void> (0));
8220
8221  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8222 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8223 return SDValue();
8224
8225 SDValue Ptr =
8226 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8227 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8228 SDValue Ops[] = {Mem->getChain(), Ptr};
8229 SDValue BcstLd = DAG.getMemIntrinsicNode(
8230 Opcode, DL, Tys, Ops, MemVT,
8231 DAG.getMachineFunction().getMachineMemOperand(
8232 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8233 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8234 return BcstLd;
8235}
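// Usage sketch (hypothetical caller, for illustration only): given an existing
// simple v2f64 LoadSDNode 'Ld', a 256-bit broadcast of its low 128 bits could
// be formed by reusing its memory operand:
//   if (SDValue Bcst = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
//                                        MVT::v4f64, MVT::v2f64, Ld,
//                                        /*Offset=*/0, DAG))
//     return Bcst;
// Atomic, volatile and non-temporal memops are rejected, so the helper can be
// handed any MemSDNode candidate and will simply return an empty SDValue().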
8236
8237/// Returns the scalar element that will make up the i'th
8238/// element of the result of the vector shuffle.
8239static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8240 SelectionDAG &DAG, unsigned Depth) {
8241 if (Depth >= SelectionDAG::MaxRecursionDepth)
8242 return SDValue(); // Limit search depth.
8243
8244 EVT VT = Op.getValueType();
8245 unsigned Opcode = Op.getOpcode();
8246 unsigned NumElems = VT.getVectorNumElements();
8247
8248 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8249 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8250 int Elt = SV->getMaskElt(Index);
8251
8252 if (Elt < 0)
8253 return DAG.getUNDEF(VT.getVectorElementType());
8254
8255 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8256 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8257 }
8258
8259 // Recurse into target specific vector shuffles to find scalars.
8260 if (isTargetShuffle(Opcode)) {
8261 MVT ShufVT = VT.getSimpleVT();
8262 MVT ShufSVT = ShufVT.getVectorElementType();
8263 int NumElems = (int)ShufVT.getVectorNumElements();
8264 SmallVector<int, 16> ShuffleMask;
8265 SmallVector<SDValue, 16> ShuffleOps;
8266 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8267 ShuffleMask))
8268 return SDValue();
8269
8270 int Elt = ShuffleMask[Index];
8271 if (Elt == SM_SentinelZero)
8272 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8273 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8274 if (Elt == SM_SentinelUndef)
8275 return DAG.getUNDEF(ShufSVT);
8276
8277 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range")(static_cast<void> (0));
8278 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8279 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8280 }
8281
8282 // Recurse into insert_subvector base/sub vector to find scalars.
8283 if (Opcode == ISD::INSERT_SUBVECTOR) {
8284 SDValue Vec = Op.getOperand(0);
8285 SDValue Sub = Op.getOperand(1);
8286 uint64_t SubIdx = Op.getConstantOperandVal(2);
8287 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8288
8289 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8290 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8291 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8292 }
8293
8294 // Recurse into concat_vectors sub vector to find scalars.
8295 if (Opcode == ISD::CONCAT_VECTORS) {
8296 EVT SubVT = Op.getOperand(0).getValueType();
8297 unsigned NumSubElts = SubVT.getVectorNumElements();
8298 uint64_t SubIdx = Index / NumSubElts;
8299 uint64_t SubElt = Index % NumSubElts;
8300 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8301 }
8302
8303 // Recurse into extract_subvector src vector to find scalars.
8304 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8305 SDValue Src = Op.getOperand(0);
8306 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8307 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8308 }
8309
8310 // We only peek through bitcasts of the same vector width.
8311 if (Opcode == ISD::BITCAST) {
8312 SDValue Src = Op.getOperand(0);
8313 EVT SrcVT = Src.getValueType();
8314 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8315 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8316 return SDValue();
8317 }
8318
8319 // Actual nodes that may contain scalar elements
8320
8321 // For insert_vector_elt - either return the index matching scalar or recurse
8322 // into the base vector.
8323 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8324 isa<ConstantSDNode>(Op.getOperand(2))) {
8325 if (Op.getConstantOperandAPInt(2) == Index)
8326 return Op.getOperand(1);
8327 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8328 }
8329
8330 if (Opcode == ISD::SCALAR_TO_VECTOR)
8331 return (Index == 0) ? Op.getOperand(0)
8332 : DAG.getUNDEF(VT.getVectorElementType());
8333
8334 if (Opcode == ISD::BUILD_VECTOR)
8335 return Op.getOperand(Index);
8336
8337 return SDValue();
8338}
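// Illustrative example (hypothetical DAG, for illustration only): for
//   t = insert_vector_elt (build_vector a, b, c, d), x, 2
// getShuffleScalarElt(t, 2, DAG, 0) returns x, while getShuffleScalarElt(t, 1,
// DAG, 0) recurses into the base build_vector and returns b. Nodes that cannot
// be resolved (or recursion beyond MaxRecursionDepth) yield an empty SDValue().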
8339
8340// Use PINSRB/PINSRW/PINSRD to create a build vector.
8341static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8342 unsigned NumNonZero, unsigned NumZero,
8343 SelectionDAG &DAG,
8344 const X86Subtarget &Subtarget) {
8345 MVT VT = Op.getSimpleValueType();
8346 unsigned NumElts = VT.getVectorNumElements();
8347 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||(static_cast<void> (0))
8348 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&(static_cast<void> (0))
8349 "Illegal vector insertion")(static_cast<void> (0));
8350
8351 SDLoc dl(Op);
8352 SDValue V;
8353 bool First = true;
8354
8355 for (unsigned i = 0; i < NumElts; ++i) {
8356 bool IsNonZero = NonZeroMask[i];
8357 if (!IsNonZero)
8358 continue;
8359
8360    // If the build vector contains zeros or our first insertion is not at the
8361    // first index, then insert into a zero vector to break any register
8362    // dependency; else use SCALAR_TO_VECTOR.
8363 if (First) {
8364 First = false;
8365 if (NumZero || 0 != i)
8366 V = getZeroVector(VT, Subtarget, DAG, dl);
8367 else {
8368 assert(0 == i && "Expected insertion into zero-index")(static_cast<void> (0));
8369 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8370 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8371 V = DAG.getBitcast(VT, V);
8372 continue;
8373 }
8374 }
8375 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8376 DAG.getIntPtrConstant(i, dl));
8377 }
8378
8379 return V;
8380}
8381
8382/// Custom lower build_vector of v16i8.
8383static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8384 unsigned NumNonZero, unsigned NumZero,
8385 SelectionDAG &DAG,
8386 const X86Subtarget &Subtarget) {
8387 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8388 return SDValue();
8389
8390 // SSE4.1 - use PINSRB to insert each byte directly.
8391 if (Subtarget.hasSSE41())
8392 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8393 Subtarget);
8394
8395 SDLoc dl(Op);
8396 SDValue V;
8397
8398 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8399 for (unsigned i = 0; i < 16; i += 2) {
8400 bool ThisIsNonZero = NonZeroMask[i];
8401 bool NextIsNonZero = NonZeroMask[i + 1];
8402 if (!ThisIsNonZero && !NextIsNonZero)
8403 continue;
8404
8405    // FIXME: Investigate combining the first 4 bytes as an i32 instead.
8406 SDValue Elt;
8407 if (ThisIsNonZero) {
8408 if (NumZero || NextIsNonZero)
8409 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8410 else
8411 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8412 }
8413
8414 if (NextIsNonZero) {
8415 SDValue NextElt = Op.getOperand(i + 1);
8416 if (i == 0 && NumZero)
8417 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8418 else
8419 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8420 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8421 DAG.getConstant(8, dl, MVT::i8));
8422 if (ThisIsNonZero)
8423 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8424 else
8425 Elt = NextElt;
8426 }
8427
8428    // If our first insertion is not at the first index or zeros are needed, then
8429    // insert into a zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8430    // elements undefined).
8431 if (!V) {
8432 if (i != 0 || NumZero)
8433 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8434 else {
8435 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8436 V = DAG.getBitcast(MVT::v8i16, V);
8437 continue;
8438 }
8439 }
8440 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8441 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8442 DAG.getIntPtrConstant(i / 2, dl));
8443 }
8444
8445 return DAG.getBitcast(MVT::v16i8, V);
8446}
8447
8448/// Custom lower build_vector of v8i16.
8449static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8450 unsigned NumNonZero, unsigned NumZero,
8451 SelectionDAG &DAG,
8452 const X86Subtarget &Subtarget) {
8453 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8454 return SDValue();
8455
8456  // Use PINSRW to insert each word directly.
8457 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8458 Subtarget);
8459}
8460
8461/// Custom lower build_vector of v4i32 or v4f32.
8462static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8463 const X86Subtarget &Subtarget) {
8464 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8465 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8466 // Because we're creating a less complicated build vector here, we may enable
8467 // further folding of the MOVDDUP via shuffle transforms.
8468 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8469 Op.getOperand(0) == Op.getOperand(2) &&
8470 Op.getOperand(1) == Op.getOperand(3) &&
8471 Op.getOperand(0) != Op.getOperand(1)) {
8472 SDLoc DL(Op);
8473 MVT VT = Op.getSimpleValueType();
8474 MVT EltVT = VT.getVectorElementType();
8475 // Create a new build vector with the first 2 elements followed by undef
8476 // padding, bitcast to v2f64, duplicate, and bitcast back.
8477 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8478 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8479 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8480 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8481 return DAG.getBitcast(VT, Dup);
8482 }
8483
8484 // Find all zeroable elements.
8485 std::bitset<4> Zeroable, Undefs;
8486 for (int i = 0; i < 4; ++i) {
8487 SDValue Elt = Op.getOperand(i);
8488 Undefs[i] = Elt.isUndef();
8489 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8490 }
8491 assert(Zeroable.size() - Zeroable.count() > 1 &&(static_cast<void> (0))
8492 "We expect at least two non-zero elements!")(static_cast<void> (0));
8493
8494 // We only know how to deal with build_vector nodes where elements are either
8495 // zeroable or extract_vector_elt with constant index.
8496 SDValue FirstNonZero;
8497 unsigned FirstNonZeroIdx;
8498 for (unsigned i = 0; i < 4; ++i) {
8499 if (Zeroable[i])
8500 continue;
8501 SDValue Elt = Op.getOperand(i);
8502 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8503 !isa<ConstantSDNode>(Elt.getOperand(1)))
8504 return SDValue();
8505 // Make sure that this node is extracting from a 128-bit vector.
8506 MVT VT = Elt.getOperand(0).getSimpleValueType();
8507 if (!VT.is128BitVector())
8508 return SDValue();
8509 if (!FirstNonZero.getNode()) {
8510 FirstNonZero = Elt;
8511 FirstNonZeroIdx = i;
8512 }
8513 }
8514
8515 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!")(static_cast<void> (0));
8516 SDValue V1 = FirstNonZero.getOperand(0);
8517 MVT VT = V1.getSimpleValueType();
8518
8519 // See if this build_vector can be lowered as a blend with zero.
8520 SDValue Elt;
8521 unsigned EltMaskIdx, EltIdx;
8522 int Mask[4];
8523 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8524 if (Zeroable[EltIdx]) {
8525 // The zero vector will be on the right hand side.
8526 Mask[EltIdx] = EltIdx+4;
8527 continue;
8528 }
8529
8530 Elt = Op->getOperand(EltIdx);
8531    // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
8532 EltMaskIdx = Elt.getConstantOperandVal(1);
8533 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8534 break;
8535 Mask[EltIdx] = EltIdx;
8536 }
8537
8538 if (EltIdx == 4) {
8539 // Let the shuffle legalizer deal with blend operations.
8540 SDValue VZeroOrUndef = (Zeroable == Undefs)
8541 ? DAG.getUNDEF(VT)
8542 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8543 if (V1.getSimpleValueType() != VT)
8544 V1 = DAG.getBitcast(VT, V1);
8545 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8546 }
8547
8548 // See if we can lower this build_vector to a INSERTPS.
8549 if (!Subtarget.hasSSE41())
8550 return SDValue();
8551
8552 SDValue V2 = Elt.getOperand(0);
8553 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8554 V1 = SDValue();
8555
8556 bool CanFold = true;
8557 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8558 if (Zeroable[i])
8559 continue;
8560
8561 SDValue Current = Op->getOperand(i);
8562 SDValue SrcVector = Current->getOperand(0);
8563 if (!V1.getNode())
8564 V1 = SrcVector;
8565 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8566 }
8567
8568 if (!CanFold)
8569 return SDValue();
8570
8571 assert(V1.getNode() && "Expected at least two non-zero elements!")(static_cast<void> (0));
8572 if (V1.getSimpleValueType() != MVT::v4f32)
8573 V1 = DAG.getBitcast(MVT::v4f32, V1);
8574 if (V2.getSimpleValueType() != MVT::v4f32)
8575 V2 = DAG.getBitcast(MVT::v4f32, V2);
8576
8577 // Ok, we can emit an INSERTPS instruction.
8578 unsigned ZMask = Zeroable.to_ulong();
8579
8580 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8581 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!")(static_cast<void> (0));
8582 SDLoc DL(Op);
8583 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8584 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8585 return DAG.getBitcast(VT, Result);
8586}
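// Note on the INSERTPS immediate (hypothetical values, for illustration only):
// the 8-bit mask packs the source lane in bits 7:6, the destination lane in
// bits 5:4 and the zeroed lanes in bits 3:0. E.g. inserting element 2 of V2
// into lane 1 of V1 while zeroing lane 3 encodes as
//   (2 << 6) | (1 << 4) | 0x8  ==  0x98
// which is exactly how InsertPSMask is assembled from EltMaskIdx, EltIdx and
// ZMask above.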
8587
8588/// Return a vector logical shift node.
8589static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8590 SelectionDAG &DAG, const TargetLowering &TLI,
8591 const SDLoc &dl) {
8592 assert(VT.is128BitVector() && "Unknown type for VShift")(static_cast<void> (0));
8593 MVT ShVT = MVT::v16i8;
8594 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8595 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8596 assert(NumBits % 8 == 0 && "Only support byte sized shifts")(static_cast<void> (0));
8597 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8598 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8599}
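// Usage sketch (hypothetical operands, for illustration only): shifting a
// v2i64 value left by one full 64-bit element becomes a whole-byte VSHLDQ:
//   SDValue Shifted = getVShift(/*isLeft=*/true, MVT::v2i64, SrcOp,
//                               /*NumBits=*/64, DAG, TLI, dl);
// i.e. SrcOp is bitcast to v16i8, shifted by 64 / 8 = 8 bytes with
// X86ISD::VSHLDQ, and bitcast back to v2i64.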
8600
8601static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8602 SelectionDAG &DAG) {
8603
8604  // Check if the scalar load can be widened into a vector load, and if the
8605  // address is "base + cst", see if the cst can be "absorbed" into the
8606  // shuffle mask.
8607 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8608 SDValue Ptr = LD->getBasePtr();
8609 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8610 return SDValue();
8611 EVT PVT = LD->getValueType(0);
8612 if (PVT != MVT::i32 && PVT != MVT::f32)
8613 return SDValue();
8614
8615 int FI = -1;
8616 int64_t Offset = 0;
8617 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8618 FI = FINode->getIndex();
8619 Offset = 0;
8620 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8621 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8622 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8623 Offset = Ptr.getConstantOperandVal(1);
8624 Ptr = Ptr.getOperand(0);
8625 } else {
8626 return SDValue();
8627 }
8628
8629 // FIXME: 256-bit vector instructions don't require a strict alignment,
8630 // improve this code to support it better.
8631 Align RequiredAlign(VT.getSizeInBits() / 8);
8632 SDValue Chain = LD->getChain();
8633 // Make sure the stack object alignment is at least 16 or 32.
8634 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8635 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8636 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8637 if (MFI.isFixedObjectIndex(FI)) {
8638        // Can't change the alignment. FIXME: It's possible to compute the
8639        // exact stack offset and reference FI + adjust offset instead; if
8640        // someone *really* cares about this, that's the way to implement it.
8641 return SDValue();
8642 } else {
8643 MFI.setObjectAlignment(FI, RequiredAlign);
8644 }
8645 }
8646
8647    // (Offset % 16 or 32) must be a multiple of 4. The address is then
8648    // Ptr + (Offset & ~15).
8649 if (Offset < 0)
8650 return SDValue();
8651 if ((Offset % RequiredAlign.value()) & 3)
8652 return SDValue();
8653 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8654 if (StartOffset) {
8655 SDLoc DL(Ptr);
8656 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8657 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8658 }
8659
8660 int EltNo = (Offset - StartOffset) >> 2;
8661 unsigned NumElems = VT.getVectorNumElements();
8662
8663 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8664 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8665 LD->getPointerInfo().getWithOffset(StartOffset));
8666
8667 SmallVector<int, 8> Mask(NumElems, EltNo);
8668
8669 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8670 }
8671
8672 return SDValue();
8673}
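// Worked example (hypothetical frame object, for illustration only): for an
// i32 load from a 16-byte-aligned stack slot at Offset = 8 and VT = v4i32,
// StartOffset becomes 8 & ~15 = 0, so the whole slot is loaded as one v4i32
// and EltNo = (8 - 0) >> 2 = 2 selects the third lane, which the final
// shuffle then splats across the vector.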
8674
8675// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8676static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8677 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8678 auto *BaseLd = cast<LoadSDNode>(Elt);
8679 if (!BaseLd->isSimple())
8680 return false;
8681 Ld = BaseLd;
8682 ByteOffset = 0;
8683 return true;
8684 }
8685
8686 switch (Elt.getOpcode()) {
8687 case ISD::BITCAST:
8688 case ISD::TRUNCATE:
8689 case ISD::SCALAR_TO_VECTOR:
8690 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8691 case ISD::SRL:
8692 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8693 uint64_t Idx = IdxC->getZExtValue();
8694 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8695 ByteOffset += Idx / 8;
8696 return true;
8697 }
8698 }
8699 break;
8700 case ISD::EXTRACT_VECTOR_ELT:
8701 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8702 SDValue Src = Elt.getOperand(0);
8703 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8704 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8705 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8706 findEltLoadSrc(Src, Ld, ByteOffset)) {
8707 uint64_t Idx = IdxC->getZExtValue();
8708 ByteOffset += Idx * (SrcSizeInBits / 8);
8709 return true;
8710 }
8711 }
8712 break;
8713 }
8714
8715 return false;
8716}
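// Worked example (hypothetical element, for illustration only): for an element
// of the form (trunc (srl (load i64 %p), 32)), the recursion walks through the
// TRUNCATE and SRL nodes, so Ld ends up pointing at the i64 load and
// ByteOffset accumulates 32 / 8 = 4, i.e. the element reads the high half of
// the loaded value.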
8717
8718/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8719/// elements can be replaced by a single large load which has the same value as
8720/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8721///
8722/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8723static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8724 const SDLoc &DL, SelectionDAG &DAG,
8725 const X86Subtarget &Subtarget,
8726 bool IsAfterLegalize) {
8727 if ((VT.getScalarSizeInBits() % 8) != 0)
8728 return SDValue();
8729
8730 unsigned NumElems = Elts.size();
8731
8732 int LastLoadedElt = -1;
8733 APInt LoadMask = APInt::getNullValue(NumElems);
8734 APInt ZeroMask = APInt::getNullValue(NumElems);
8735 APInt UndefMask = APInt::getNullValue(NumElems);
8736
8737 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8738 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8739
8740 // For each element in the initializer, see if we've found a load, zero or an
8741 // undef.
8742 for (unsigned i = 0; i < NumElems; ++i) {
8743 SDValue Elt = peekThroughBitcasts(Elts[i]);
8744 if (!Elt.getNode())
8745 return SDValue();
8746 if (Elt.isUndef()) {
8747 UndefMask.setBit(i);
8748 continue;
8749 }
8750 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8751 ZeroMask.setBit(i);
8752 continue;
8753 }
8754
8755 // Each loaded element must be the correct fractional portion of the
8756 // requested vector load.
8757 unsigned EltSizeInBits = Elt.getValueSizeInBits();
8758 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8759 return SDValue();
8760
8761 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8762 return SDValue();
8763 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8764 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8765 return SDValue();
8766
8767 LoadMask.setBit(i);
8768 LastLoadedElt = i;
8769 }
8770 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +(static_cast<void> (0))
8771 LoadMask.countPopulation()) == NumElems &&(static_cast<void> (0))
8772 "Incomplete element masks")(static_cast<void> (0));
8773
8774 // Handle Special Cases - all undef or undef/zero.
8775 if (UndefMask.countPopulation() == NumElems)
8776 return DAG.getUNDEF(VT);
8777 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8778 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8779 : DAG.getConstantFP(0.0, DL, VT);
8780
8781 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8782 int FirstLoadedElt = LoadMask.countTrailingZeros();
8783 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8784 EVT EltBaseVT = EltBase.getValueType();
8785 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&(static_cast<void> (0))
8786 "Register/Memory size mismatch")(static_cast<void> (0));
8787 LoadSDNode *LDBase = Loads[FirstLoadedElt];
8788 assert(LDBase && "Did not find base load for merging consecutive loads")(static_cast<void> (0));
8789 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8790 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8791 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8792 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8793 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected")(static_cast<void> (0));
8794
8795 // TODO: Support offsetting the base load.
8796 if (ByteOffsets[FirstLoadedElt] != 0)
8797 return SDValue();
8798
8799 // Check to see if the element's load is consecutive to the base load
8800 // or offset from a previous (already checked) load.
8801 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8802 LoadSDNode *Ld = Loads[EltIdx];
8803 int64_t ByteOffset = ByteOffsets[EltIdx];
8804 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8805 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8806 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8807 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8808 }
8809 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8810 EltIdx - FirstLoadedElt);
8811 };
8812
8813  // Consecutive loads can contain UNDEFs but not ZERO elements.
8814  // Consecutive loads with UNDEF and ZERO elements require an
8815  // additional shuffle stage to clear the ZERO elements.
8816 bool IsConsecutiveLoad = true;
8817 bool IsConsecutiveLoadWithZeros = true;
8818 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8819 if (LoadMask[i]) {
8820 if (!CheckConsecutiveLoad(LDBase, i)) {
8821 IsConsecutiveLoad = false;
8822 IsConsecutiveLoadWithZeros = false;
8823 break;
8824 }
8825 } else if (ZeroMask[i]) {
8826 IsConsecutiveLoad = false;
8827 }
8828 }
8829
8830 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8831 auto MMOFlags = LDBase->getMemOperand()->getFlags();
8832 assert(LDBase->isSimple() &&(static_cast<void> (0))
8833 "Cannot merge volatile or atomic loads.")(static_cast<void> (0));
8834 SDValue NewLd =
8835 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8836 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8837 MMOFlags);
8838 for (auto *LD : Loads)
8839 if (LD)
8840 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8841 return NewLd;
8842 };
8843
8844 // Check if the base load is entirely dereferenceable.
8845 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8846 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8847
8848 // LOAD - all consecutive load/undefs (must start/end with a load or be
8849 // entirely dereferenceable). If we have found an entire vector of loads and
8850 // undefs, then return a large load of the entire vector width starting at the
8851 // base pointer. If the vector contains zeros, then attempt to shuffle those
8852 // elements.
8853 if (FirstLoadedElt == 0 &&
8854 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8855 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8856 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8857 return SDValue();
8858
8859 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8860 // will lower to regular temporal loads and use the cache.
8861 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8862 VT.is256BitVector() && !Subtarget.hasInt256())
8863 return SDValue();
8864
8865 if (NumElems == 1)
8866 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8867
8868 if (!ZeroMask)
8869 return CreateLoad(VT, LDBase);
8870
8871 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8872 // vector and a zero vector to clear out the zero elements.
8873 if (!IsAfterLegalize && VT.isVector()) {
8874 unsigned NumMaskElts = VT.getVectorNumElements();
8875 if ((NumMaskElts % NumElems) == 0) {
8876 unsigned Scale = NumMaskElts / NumElems;
8877 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8878 for (unsigned i = 0; i < NumElems; ++i) {
8879 if (UndefMask[i])
8880 continue;
8881 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8882 for (unsigned j = 0; j != Scale; ++j)
8883 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8884 }
8885 SDValue V = CreateLoad(VT, LDBase);
8886 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8887 : DAG.getConstantFP(0.0, DL, VT);
8888 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8889 }
8890 }
8891 }
8892
8893 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8894 if (VT.is256BitVector() || VT.is512BitVector()) {
8895 unsigned HalfNumElems = NumElems / 2;
8896 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8897 EVT HalfVT =
8898 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8899 SDValue HalfLD =
8900 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8901 DAG, Subtarget, IsAfterLegalize);
8902 if (HalfLD)
8903 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8904 HalfLD, DAG.getIntPtrConstant(0, DL));
8905 }
8906 }
8907
8908 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8909 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8910 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
8911 LoadSizeInBits == 64) &&
8912 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8913 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8914 : MVT::getIntegerVT(LoadSizeInBits);
8915 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8916 // Allow v4f32 on SSE1 only targets.
8917 // FIXME: Add more isel patterns so we can just use VT directly.
8918 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8919 VecVT = MVT::v4f32;
8920 if (TLI.isTypeLegal(VecVT)) {
8921 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8922 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8923 SDValue ResNode = DAG.getMemIntrinsicNode(
8924 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8925 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8926 for (auto *LD : Loads)
8927 if (LD)
8928 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8929 return DAG.getBitcast(VT, ResNode);
8930 }
8931 }
8932
8933 // BROADCAST - match the smallest possible repetition pattern, load that
8934 // scalar/subvector element and then broadcast to the entire vector.
8935 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8936 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8937 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8938 unsigned RepeatSize = SubElems * BaseSizeInBits;
8939 unsigned ScalarSize = std::min(RepeatSize, 64u);
8940 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8941 continue;
8942
8943      // Don't attempt a 1:N subvector broadcast - it should be caught by
8944      // combineConcatVectorOps, else it will cause infinite loops.
8945 if (RepeatSize > ScalarSize && SubElems == 1)
8946 continue;
8947
8948 bool Match = true;
8949 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8950 for (unsigned i = 0; i != NumElems && Match; ++i) {
8951 if (!LoadMask[i])
8952 continue;
8953 SDValue Elt = peekThroughBitcasts(Elts[i]);
8954 if (RepeatedLoads[i % SubElems].isUndef())
8955 RepeatedLoads[i % SubElems] = Elt;
8956 else
8957 Match &= (RepeatedLoads[i % SubElems] == Elt);
8958 }
8959
8960 // We must have loads at both ends of the repetition.
8961 Match &= !RepeatedLoads.front().isUndef();
8962 Match &= !RepeatedLoads.back().isUndef();
8963 if (!Match)
8964 continue;
8965
8966 EVT RepeatVT =
8967 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8968 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8969 : EVT::getFloatingPointVT(ScalarSize);
8970 if (RepeatSize > ScalarSize)
8971 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8972 RepeatSize / ScalarSize);
8973 EVT BroadcastVT =
8974 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8975 VT.getSizeInBits() / ScalarSize);
8976 if (TLI.isTypeLegal(BroadcastVT)) {
8977 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8978 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
8979 SDValue Broadcast = RepeatLoad;
8980 if (RepeatSize > ScalarSize) {
8981 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8982 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8983 } else {
8984 if (!Subtarget.hasAVX2() &&
8985 !MayFoldLoadIntoBroadcastFromMem(
8986 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
8987 /*AssumeSingleUse=*/true))
8988 return SDValue();
8989 Broadcast =
8990 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8991 }
8992 return DAG.getBitcast(VT, Broadcast);
8993 }
8994 }
8995 }
8996 }
8997
8998 return SDValue();
8999}
9000
9001// Combine a vector op (shuffle etc.) that is equal to build_vector load1,
9002// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9003// are consecutive, non-overlapping, and in the right order.
9004static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9005 SelectionDAG &DAG,
9006 const X86Subtarget &Subtarget,
9007 bool IsAfterLegalize) {
9008 SmallVector<SDValue, 64> Elts;
9009 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9010 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9011 Elts.push_back(Elt);
9012 continue;
9013 }
9014 return SDValue();
9015 }
9016 assert(Elts.size() == VT.getVectorNumElements())(static_cast<void> (0));
9017 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9018 IsAfterLegalize);
9019}
9020
9021static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9022 unsigned SplatBitSize, LLVMContext &C) {
9023 unsigned ScalarSize = VT.getScalarSizeInBits();
9024 unsigned NumElm = SplatBitSize / ScalarSize;
9025
9026 SmallVector<Constant *, 32> ConstantVec;
9027 for (unsigned i = 0; i < NumElm; i++) {
9028 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9029 Constant *Const;
9030 if (VT.isFloatingPoint()) {
9031 if (ScalarSize == 16) {
9032 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9033 } else if (ScalarSize == 32) {
9034 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9035 } else {
9036 assert(ScalarSize == 64 && "Unsupported floating point scalar size")(static_cast<void> (0));
9037 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9038 }
9039 } else
9040 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9041 ConstantVec.push_back(Const);
9042 }
9043 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9044}
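// Worked example (hypothetical splat, for illustration only): for VT = v8i32
// and a 64-bit repeating pattern SplatValue = 0x0000000100000002 with
// SplatBitSize = 64, the loop extracts two 32-bit pieces (low bits first) and
// returns the constant vector <i32 2, i32 1>, which the caller then loads
// from the constant pool and broadcasts.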
9045
9046static bool isFoldableUseOfShuffle(SDNode *N) {
9047 for (auto *U : N->uses()) {
9048 unsigned Opc = U->getOpcode();
9049 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9050 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9051 return false;
9052 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9053 return false;
9054 if (isTargetShuffle(Opc))
9055 return true;
9056 if (Opc == ISD::BITCAST) // Ignore bitcasts
9057 return isFoldableUseOfShuffle(U);
9058 if (N->hasOneUse())
9059 return true;
9060 }
9061 return false;
9062}
9063
9064/// Attempt to use the vbroadcast instruction to generate a splat value
9065/// from a splat BUILD_VECTOR which uses:
9066/// a. A single scalar load, or a constant.
9067/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9068///
9069/// The VBROADCAST node is returned when a pattern is found,
9070/// or SDValue() otherwise.
9071static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9072 const X86Subtarget &Subtarget,
9073 SelectionDAG &DAG) {
9074 // VBROADCAST requires AVX.
9075 // TODO: Splats could be generated for non-AVX CPUs using SSE
9076 // instructions, but there's less potential gain for only 128-bit vectors.
9077 if (!Subtarget.hasAVX())
9078 return SDValue();
9079
9080 MVT VT = BVOp->getSimpleValueType(0);
9081 unsigned NumElts = VT.getVectorNumElements();
9082 SDLoc dl(BVOp);
9083
9084 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&(static_cast<void> (0))
9085 "Unsupported vector type for broadcast.")(static_cast<void> (0));
9086
9087 // See if the build vector is a repeating sequence of scalars (inc. splat).
9088 SDValue Ld;
9089 BitVector UndefElements;
9090 SmallVector<SDValue, 16> Sequence;
9091 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9092 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.")(static_cast<void> (0));
9093 if (Sequence.size() == 1)
9094 Ld = Sequence[0];
9095 }
9096
9097 // Attempt to use VBROADCASTM
9098 // From this pattern:
9099 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9100 // b. t1 = (build_vector t0 t0)
9101 //
9102 // Create (VBROADCASTM v2i1 X)
9103 if (!Sequence.empty() && Subtarget.hasCDI()) {
9104 // If not a splat, are the upper sequence values zeroable?
9105 unsigned SeqLen = Sequence.size();
9106 bool UpperZeroOrUndef =
9107 SeqLen == 1 ||
9108 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
9109 return !V || V.isUndef() || isNullConstant(V);
9110 });
9111 SDValue Op0 = Sequence[0];
9112 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9113 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9114 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9115 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9116 ? Op0.getOperand(0)
9117 : Op0.getOperand(0).getOperand(0);
9118 MVT MaskVT = BOperand.getSimpleValueType();
9119 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9120 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9121 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9122 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9123 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9124 unsigned Scale = 512 / VT.getSizeInBits();
9125 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9126 }
9127 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9128 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9129 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9130 return DAG.getBitcast(VT, Bcst);
9131 }
9132 }
9133 }
9134
9135 unsigned NumUndefElts = UndefElements.count();
9136 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9137 APInt SplatValue, Undef;
9138 unsigned SplatBitSize;
9139 bool HasUndef;
9140 // Check if this is a repeated constant pattern suitable for broadcasting.
9141 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9142 SplatBitSize > VT.getScalarSizeInBits() &&
9143 SplatBitSize < VT.getSizeInBits()) {
9144 // Avoid replacing with broadcast when it's a use of a shuffle
9145 // instruction to preserve the present custom lowering of shuffles.
9146 if (isFoldableUseOfShuffle(BVOp))
9147 return SDValue();
9148      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
9149 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9150 LLVMContext *Ctx = DAG.getContext();
9151 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9152 if (Subtarget.hasAVX()) {
9153 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9154 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9155 // Splatted value can fit in one INTEGER constant in constant pool.
9156 // Load the constant and broadcast it.
9157 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9158 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9159 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9160 SDValue CP = DAG.getConstantPool(C, PVT);
9161 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9162
9163 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9164 SDVTList Tys =
9165 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9166 SDValue Ops[] = {DAG.getEntryNode(), CP};
9167 MachinePointerInfo MPI =
9168 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9169 SDValue Brdcst = DAG.getMemIntrinsicNode(
9170 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9171 MachineMemOperand::MOLoad);
9172 return DAG.getBitcast(VT, Brdcst);
9173 }
9174 if (SplatBitSize > 64) {
9175 // Load the vector of constants and broadcast it.
9176 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9177 *Ctx);
9178 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9179 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9180 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9181 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9182 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9183 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9184 MachinePointerInfo MPI =
9185 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9186 return DAG.getMemIntrinsicNode(
9187 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9188 MachineMemOperand::MOLoad);
9189 }
9190 }
9191 }
9192
9193 // If we are moving a scalar into a vector (Ld must be set and all elements
9194 // but 1 are undef) and that operation is not obviously supported by
9195 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9196 // That's better than general shuffling and may eliminate a load to GPR and
9197 // move from scalar to vector register.
9198 if (!Ld || NumElts - NumUndefElts != 1)
9199 return SDValue();
9200 unsigned ScalarSize = Ld.getValueSizeInBits();
9201 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9202 return SDValue();
9203 }
9204
9205 bool ConstSplatVal =
9206 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9207 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9208
9209 // TODO: Handle broadcasts of non-constant sequences.
9210
9211 // Make sure that all of the users of a non-constant load are from the
9212 // BUILD_VECTOR node.
9213 // FIXME: Is the use count needed for non-constant, non-load case?
9214 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9215 return SDValue();
9216
9217 unsigned ScalarSize = Ld.getValueSizeInBits();
9218 bool IsGE256 = (VT.getSizeInBits() >= 256);
9219
9220 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9221 // instruction to save 8 or more bytes of constant pool data.
9222 // TODO: If multiple splats are generated to load the same constant,
9223 // it may be detrimental to overall size. There needs to be a way to detect
9224 // that condition to know if this is truly a size win.
9225 bool OptForSize = DAG.shouldOptForSize();
9226
9227 // Handle broadcasting a single constant scalar from the constant pool
9228 // into a vector.
9229 // On Sandybridge (no AVX2), it is still better to load a constant vector
9230 // from the constant pool and not to broadcast it from a scalar.
9231 // But override that restriction when optimizing for size.
9232 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9233 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9234 EVT CVT = Ld.getValueType();
9235 assert(!CVT.isVector() && "Must not broadcast a vector type")(static_cast<void> (0));
9236
9237 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9238 // For size optimization, also splat v2f64 and v2i64, and for size opt
9239 // with AVX2, also splat i8 and i16.
9240 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9241 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9242 (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
9243 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9244 const Constant *C = nullptr;
9245 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9246 C = CI->getConstantIntValue();
9247 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9248 C = CF->getConstantFPValue();
9249
9250 assert(C && "Invalid constant type")(static_cast<void> (0));
9251
9252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9253 SDValue CP =
9254 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9255 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9256
9257 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9258 SDValue Ops[] = {DAG.getEntryNode(), CP};
9259 MachinePointerInfo MPI =
9260 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9261 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9262 MPI, Alignment, MachineMemOperand::MOLoad);
9263 }
9264 }
9265
9266 // Handle AVX2 in-register broadcasts.
9267 if (!IsLoad && Subtarget.hasInt256() &&
9268 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9269 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9270
9271 // The scalar source must be a normal load.
9272 if (!IsLoad)
9273 return SDValue();
9274
9275 // Make sure the non-chain result is only used by this build vector.
9276 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9277 return SDValue();
9278
9279 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9280 (Subtarget.hasVLX() && ScalarSize == 64)) {
9281 auto *LN = cast<LoadSDNode>(Ld);
9282 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9283 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9284 SDValue BCast =
9285 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9286 LN->getMemoryVT(), LN->getMemOperand());
9287 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9288 return BCast;
9289 }
9290
9291  // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9292  // match double, since there is no vbroadcastsd xmm.
9293 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9294 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9295 auto *LN = cast<LoadSDNode>(Ld);
9296 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9297 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9298 SDValue BCast =
9299 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9300 LN->getMemoryVT(), LN->getMemOperand());
9301 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9302 return BCast;
9303 }
9304
9305 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9306 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9307
9308 // Unsupported broadcast.
9309 return SDValue();
9310}
9311
9312/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9313/// underlying vector and index.
9314///
9315/// Modifies \p ExtractedFromVec to the real vector and returns the real
9316/// index.
9317static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9318 SDValue ExtIdx) {
9319 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9320 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9321 return Idx;
9322
9323 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9324 // lowered this:
9325 // (extract_vector_elt (v8f32 %1), Constant<6>)
9326 // to:
9327 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9328 // (extract_subvector (v8f32 %0), Constant<4>),
9329 // undef)
9330 // Constant<0>)
9331 // In this case the vector is the extract_subvector expression and the index
9332 // is 2, as specified by the shuffle.
9333 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9334 SDValue ShuffleVec = SVOp->getOperand(0);
9335 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9336 assert(ShuffleVecVT.getVectorElementType() ==(static_cast<void> (0))
9337 ExtractedFromVec.getSimpleValueType().getVectorElementType())(static_cast<void> (0));
9338
9339 int ShuffleIdx = SVOp->getMaskElt(Idx);
9340 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9341 ExtractedFromVec = ShuffleVec;
9342 return ShuffleIdx;
9343 }
9344 return Idx;
9345}
9346
9347static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9348 MVT VT = Op.getSimpleValueType();
9349
9350 // Skip if insert_vec_elt is not supported.
9351 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9352 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9353 return SDValue();
9354
9355 SDLoc DL(Op);
9356 unsigned NumElems = Op.getNumOperands();
9357
9358 SDValue VecIn1;
9359 SDValue VecIn2;
9360 SmallVector<unsigned, 4> InsertIndices;
9361 SmallVector<int, 8> Mask(NumElems, -1);
9362
9363 for (unsigned i = 0; i != NumElems; ++i) {
9364 unsigned Opc = Op.getOperand(i).getOpcode();
9365
9366 if (Opc == ISD::UNDEF)
9367 continue;
9368
9369 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9370      // Quit if more than 1 element needs inserting.
9371 if (InsertIndices.size() > 1)
9372 return SDValue();
9373
9374 InsertIndices.push_back(i);
9375 continue;
9376 }
9377
9378 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9379 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9380
9381 // Quit if non-constant index.
9382 if (!isa<ConstantSDNode>(ExtIdx))
9383 return SDValue();
9384 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9385
9386 // Quit if extracted from vector of different type.
9387 if (ExtractedFromVec.getValueType() != VT)
9388 return SDValue();
9389
9390 if (!VecIn1.getNode())
9391 VecIn1 = ExtractedFromVec;
9392 else if (VecIn1 != ExtractedFromVec) {
9393 if (!VecIn2.getNode())
9394 VecIn2 = ExtractedFromVec;
9395 else if (VecIn2 != ExtractedFromVec)
9396        // Quit if there are more than 2 vectors to shuffle.
9397 return SDValue();
9398 }
9399
9400 if (ExtractedFromVec == VecIn1)
9401 Mask[i] = Idx;
9402 else if (ExtractedFromVec == VecIn2)
9403 Mask[i] = Idx + NumElems;
9404 }
9405
9406 if (!VecIn1.getNode())
9407 return SDValue();
9408
9409 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9410 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9411
9412 for (unsigned Idx : InsertIndices)
9413 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9414 DAG.getIntPtrConstant(Idx, DL));
9415
9416 return NV;
9417}
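// Illustrative example (hypothetical build_vector, for illustration only): for
//   build_vector (extract_elt A, 0), (extract_elt A, 1), x, (extract_elt A, 3)
// the mask becomes {0, 1, -1, 3}, index 2 is recorded in InsertIndices, and
// the result is a shuffle of A (with an undef second operand) followed by a
// single INSERT_VECTOR_ELT of x at index 2.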
9418
9419// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9420static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9421 const X86Subtarget &Subtarget) {
9422
9423 MVT VT = Op.getSimpleValueType();
9424 assert((VT.getVectorElementType() == MVT::i1) &&(static_cast<void> (0))
9425 "Unexpected type in LowerBUILD_VECTORvXi1!")(static_cast<void> (0));
9426
9427 SDLoc dl(Op);
9428 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9429 ISD::isBuildVectorAllOnes(Op.getNode()))
9430 return Op;
9431
9432 uint64_t Immediate = 0;
9433 SmallVector<unsigned, 16> NonConstIdx;
9434 bool IsSplat = true;
9435 bool HasConstElts = false;
9436 int SplatIdx = -1;
9437 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9438 SDValue In = Op.getOperand(idx);
9439 if (In.isUndef())
9440 continue;
9441 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9442 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9443 HasConstElts = true;
9444 } else {
9445 NonConstIdx.push_back(idx);
9446 }
9447 if (SplatIdx < 0)
9448 SplatIdx = idx;
9449 else if (In != Op.getOperand(SplatIdx))
9450 IsSplat = false;
9451 }
9452
9453  // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
9454 if (IsSplat) {
9455 // The build_vector allows the scalar element to be larger than the vector
9456 // element type. We need to mask it to use as a condition unless we know
9457 // the upper bits are zero.
9458 // FIXME: Use computeKnownBits instead of checking specific opcode?
9459 SDValue Cond = Op.getOperand(SplatIdx);
9460 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!")(static_cast<void> (0));
9461 if (Cond.getOpcode() != ISD::SETCC)
9462 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9463 DAG.getConstant(1, dl, MVT::i8));
9464
9465 // Perform the select in the scalar domain so we can use cmov.
9466 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9467 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9468 DAG.getAllOnesConstant(dl, MVT::i32),
9469 DAG.getConstant(0, dl, MVT::i32));
9470 Select = DAG.getBitcast(MVT::v32i1, Select);
9471 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9472 } else {
9473 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9474 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9475 DAG.getAllOnesConstant(dl, ImmVT),
9476 DAG.getConstant(0, dl, ImmVT));
9477 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9478 Select = DAG.getBitcast(VecVT, Select);
9479 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9480 DAG.getIntPtrConstant(0, dl));
9481 }
9482 }
9483
9484  // Insert the elements one by one.
9485 SDValue DstVec;
9486 if (HasConstElts) {
9487 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9488 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9489 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9490 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9491 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9492 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9493 } else {
9494 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9495 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9496 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9497 DstVec = DAG.getBitcast(VecVT, Imm);
9498 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9499 DAG.getIntPtrConstant(0, dl));
9500 }
9501 } else
9502 DstVec = DAG.getUNDEF(VT);
9503
9504 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9505 unsigned InsertIdx = NonConstIdx[i];
9506 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9507 Op.getOperand(InsertIdx),
9508 DAG.getIntPtrConstant(InsertIdx, dl));
9509 }
9510 return DstVec;
9511}
9512
9513LLVM_ATTRIBUTE_UNUSED__attribute__((__unused__)) static bool isHorizOp(unsigned Opcode) {
9514 switch (Opcode) {
9515 case X86ISD::PACKSS:
9516 case X86ISD::PACKUS:
9517 case X86ISD::FHADD:
9518 case X86ISD::FHSUB:
9519 case X86ISD::HADD:
9520 case X86ISD::HSUB:
9521 return true;
9522 }
9523 return false;
9524}
9525
9526/// This is a helper function of LowerToHorizontalOp().
9527/// This function checks that the input build_vector \p N implements a
9528/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9529/// may not match the layout of an x86 256-bit horizontal instruction.
9530/// In other words, if this returns true, then some extraction/insertion will
9531/// be required to produce a valid horizontal instruction.
9532///
9533/// Parameter \p Opcode defines the kind of horizontal operation to match.
9534/// For example, if \p Opcode is equal to ISD::ADD, then this function
9535/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9536/// is equal to ISD::SUB, then this function checks if this is a horizontal
9537/// arithmetic sub.
9538///
9539/// This function only analyzes elements of \p N whose indices are
9540/// in range [BaseIdx, LastIdx).
9541///
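/// For illustration, with \p Opcode == ISD::ADD and [BaseIdx, LastIdx) == [0, 4),
/// the first two analyzed elements are expected to look like
///   (add (extract_vector_elt V0, 0), (extract_vector_elt V0, 1))
///   (add (extract_vector_elt V0, 2), (extract_vector_elt V0, 3))
/// and the last two repeat the same extract-index pattern on V1.
///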
9542/// TODO: This function was originally used to match both real and fake partial
9543/// horizontal operations, but the index-matching logic is incorrect for that.
9544/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9545/// code because it is only used for partial h-op matching now?
9546static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9547 SelectionDAG &DAG,
9548 unsigned BaseIdx, unsigned LastIdx,
9549 SDValue &V0, SDValue &V1) {
9550 EVT VT = N->getValueType(0);
9551 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9552 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9553 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9554 "Invalid Vector in input!");
9555
9556 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9557 bool CanFold = true;
9558 unsigned ExpectedVExtractIdx = BaseIdx;
9559 unsigned NumElts = LastIdx - BaseIdx;
9560 V0 = DAG.getUNDEF(VT);
9561 V1 = DAG.getUNDEF(VT);
9562
9563 // Check if N implements a horizontal binop.
9564 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9565 SDValue Op = N->getOperand(i + BaseIdx);
9566
9567 // Skip UNDEFs.
9568 if (Op->isUndef()) {
9569 // Update the expected vector extract index.
9570 if (i * 2 == NumElts)
9571 ExpectedVExtractIdx = BaseIdx;
9572 ExpectedVExtractIdx += 2;
9573 continue;
9574 }
9575
9576 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9577
9578 if (!CanFold)
9579 break;
9580
9581 SDValue Op0 = Op.getOperand(0);
9582 SDValue Op1 = Op.getOperand(1);
9583
9584 // Try to match the following pattern:
9585 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9586 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9587 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9588 Op0.getOperand(0) == Op1.getOperand(0) &&
9589 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9590 isa<ConstantSDNode>(Op1.getOperand(1)));
9591 if (!CanFold)
9592 break;
9593
9594 unsigned I0 = Op0.getConstantOperandVal(1);
9595 unsigned I1 = Op1.getConstantOperandVal(1);
9596
9597 if (i * 2 < NumElts) {
9598 if (V0.isUndef()) {
9599 V0 = Op0.getOperand(0);
9600 if (V0.getValueType() != VT)
9601 return false;
9602 }
9603 } else {
9604 if (V1.isUndef()) {
9605 V1 = Op0.getOperand(0);
9606 if (V1.getValueType() != VT)
9607 return false;
9608 }
9609 if (i * 2 == NumElts)
9610 ExpectedVExtractIdx = BaseIdx;
9611 }
9612
9613 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9614 if (I0 == ExpectedVExtractIdx)
9615 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9616 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9617 // Try to match the following dag sequence:
9618 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9619 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9620 } else
9621 CanFold = false;
9622
9623 ExpectedVExtractIdx += 2;
9624 }
9625
9626 return CanFold;
9627}
9628
9629/// Emit a sequence of two 128-bit horizontal add/sub followed by
9630/// a concat_vector.
9631///
9632/// This is a helper function of LowerToHorizontalOp().
9633/// This function expects two 256-bit vectors called V0 and V1.
9634/// At first, each vector is split into two separate 128-bit vectors.
9635/// Then, the resulting 128-bit vectors are used to implement two
9636/// horizontal binary operations.
9637///
9638/// The kind of horizontal binary operation is defined by \p X86Opcode.
9639///
9640/// \p Mode specifies how the 128-bit halves of V0 and V1 are passed to the
9641/// two new horizontal binops.
9642/// When Mode is set, the first horizontal binop dag node takes as input the
9643/// lower 128 bits of V0 and the upper 128 bits of V0, and the second
9644/// horizontal binop dag node takes as input the lower 128 bits of V1
9645/// and the upper 128 bits of V1.
9646/// Example:
9647/// HADD V0_LO, V0_HI
9648/// HADD V1_LO, V1_HI
9649///
9650/// Otherwise, the first horizontal binop dag node takes as input the lower
9651/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
9652/// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
9653/// Example:
9654/// HADD V0_LO, V1_LO
9655/// HADD V0_HI, V1_HI
9656///
9657/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9658/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9659/// the upper 128-bits of the result.
9660static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9661 const SDLoc &DL, SelectionDAG &DAG,
9662 unsigned X86Opcode, bool Mode,
9663 bool isUndefLO, bool isUndefHI) {
9664 MVT VT = V0.getSimpleValueType();
9665 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9666 "Invalid nodes in input!");
9667
9668 unsigned NumElts = VT.getVectorNumElements();
9669 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9670 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9671 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9672 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9673 MVT NewVT = V0_LO.getSimpleValueType();
9674
9675 SDValue LO = DAG.getUNDEF(NewVT);
9676 SDValue HI = DAG.getUNDEF(NewVT);
9677
9678 if (Mode) {
9679 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9680 if (!isUndefLO && !V0->isUndef())
9681 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9682 if (!isUndefHI && !V1->isUndef())
9683 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9684 } else {
9685 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9686 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9687 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9688
9689 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9690 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9691 }
9692
9693 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9694}
9695
9696/// Returns true iff \p BV builds a vector whose result is equivalent to
9697/// the result of an ADDSUB/SUBADD operation.
9698/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9699/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9700/// \p Opnd0 and \p Opnd1.
9701static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9702 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9703 SDValue &Opnd0, SDValue &Opnd1,
9704 unsigned &NumExtracts,
9705 bool &IsSubAdd) {
9706
9707 MVT VT = BV->getSimpleValueType(0);
9708 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9709 return false;
9710
9711 unsigned NumElts = VT.getVectorNumElements();
9712 SDValue InVec0 = DAG.getUNDEF(VT);
9713 SDValue InVec1 = DAG.getUNDEF(VT);
9714
9715 NumExtracts = 0;
9716
9717 // Odd-numbered elements in the input build vector are obtained from
9718 // adding/subtracting two integer/float elements.
9719 // Even-numbered elements in the input build vector are obtained from
9720 // subtracting/adding two integer/float elements.
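// Illustrative sketch: a v4f32 ADDSUB build_vector recognized here looks like
//   (fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)
// while the SUBADD form adds in the even lanes and subtracts in the odd ones.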
9721 unsigned Opc[2] = {0, 0};
9722 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9723 SDValue Op = BV->getOperand(i);
9724
9725 // Skip 'undef' values.
9726 unsigned Opcode = Op.getOpcode();
9727 if (Opcode == ISD::UNDEF)
9728 continue;
9729
9730 // Early exit if we found an unexpected opcode.
9731 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9732 return false;
9733
9734 SDValue Op0 = Op.getOperand(0);
9735 SDValue Op1 = Op.getOperand(1);
9736
9737 // Try to match the following pattern:
9738 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9739 // Early exit if we cannot match that sequence.
9740 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9741 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9742 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9743 Op0.getOperand(1) != Op1.getOperand(1))
9744 return false;
9745
9746 unsigned I0 = Op0.getConstantOperandVal(1);
9747 if (I0 != i)
9748 return false;
9749
9750 // We found a valid add/sub node; make sure it's the same opcode as previous
9751 // elements for this parity.
9752 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9753 return false;
9754 Opc[i % 2] = Opcode;
9755
9756 // Update InVec0 and InVec1.
9757 if (InVec0.isUndef()) {
9758 InVec0 = Op0.getOperand(0);
9759 if (InVec0.getSimpleValueType() != VT)
9760 return false;
9761 }
9762 if (InVec1.isUndef()) {
9763 InVec1 = Op1.getOperand(0);
9764 if (InVec1.getSimpleValueType() != VT)
9765 return false;
9766 }
9767
9768 // Make sure that the input operands of each add/sub node always
9769 // come from the same pair of vectors.
9770 if (InVec0 != Op0.getOperand(0)) {
9771 if (Opcode == ISD::FSUB)
9772 return false;
9773
9774 // FADD is commutable. Try to commute the operands
9775 // and then test again.
9776 std::swap(Op0, Op1);
9777 if (InVec0 != Op0.getOperand(0))
9778 return false;
9779 }
9780
9781 if (InVec1 != Op1.getOperand(0))
9782 return false;
9783
9784 // Increment the number of extractions done.
9785 ++NumExtracts;
9786 }
9787
9788 // Ensure we have found an opcode for both parities and that they are
9789 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9790 // inputs are undef.
9791 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9792 InVec0.isUndef() || InVec1.isUndef())
9793 return false;
9794
9795 IsSubAdd = Opc[0] == ISD::FADD;
9796
9797 Opnd0 = InVec0;
9798 Opnd1 = InVec1;
9799 return true;
9800}
9801
9802/// Returns true if it is possible to fold MUL and an idiom that has already been
9803/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9804/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9805/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9806///
9807/// Prior to calling this function it should be known that there is some
9808/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9809/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9810/// before replacement of such SDNode with ADDSUB operation. Thus the number
9811/// of \p Opnd0 uses is expected to be equal to 2.
9812/// For example, this function may be called for the following IR:
9813/// %AB = fmul fast <2 x double> %A, %B
9814/// %Sub = fsub fast <2 x double> %AB, %C
9815/// %Add = fadd fast <2 x double> %AB, %C
9816/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9817/// <2 x i32> <i32 0, i32 3>
9818/// There is a def for %Addsub here, which potentially can be replaced by
9819/// X86ISD::ADDSUB operation:
9820/// %Addsub = X86ISD::ADDSUB %AB, %C
9821/// and such ADDSUB can further be replaced with FMADDSUB:
9822/// %Addsub = FMADDSUB %A, %B, %C.
9823///
9824/// The main reason why this method is called before the replacement of the
9825/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9826/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9827/// FMADDSUB is.
9828static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9829 SelectionDAG &DAG,
9830 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9831 unsigned ExpectedUses) {
9832 if (Opnd0.getOpcode() != ISD::FMUL ||
9833 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9834 return false;
9835
9836 // FIXME: These checks must match the similar ones in
9837 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9838 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9839 // or MUL + ADDSUB to FMADDSUB.
9840 const TargetOptions &Options = DAG.getTarget().Options;
9841 bool AllowFusion =
9842 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9843 if (!AllowFusion)
9844 return false;
9845
9846 Opnd2 = Opnd1;
9847 Opnd1 = Opnd0.getOperand(1);
9848 Opnd0 = Opnd0.getOperand(0);
9849
9850 return true;
9851}
9852
9853/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
9854/// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
9855/// X86ISD::FMSUBADD node.
9856static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9857 const X86Subtarget &Subtarget,
9858 SelectionDAG &DAG) {
9859 SDValue Opnd0, Opnd1;
9860 unsigned NumExtracts;
9861 bool IsSubAdd;
9862 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9863 IsSubAdd))
9864 return SDValue();
9865
9866 MVT VT = BV->getSimpleValueType(0);
9867 SDLoc DL(BV);
9868
9869 // Try to generate X86ISD::FMADDSUB node here.
9870 SDValue Opnd2;
9871 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9872 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9873 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9874 }
9875
9876 // We only support ADDSUB.
9877 if (IsSubAdd)
9878 return SDValue();
9879
9880 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9881 // the ADDSUB idiom has been successfully recognized. There are no known
9882 // X86 targets with 512-bit ADDSUB instructions!
9883 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9884 // recognition.
9885 if (VT.is512BitVector())
9886 return SDValue();
9887
9888 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9889}
9890
9891static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9892 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9893 // Initialize outputs to known values.
9894 MVT VT = BV->getSimpleValueType(0);
9895 HOpcode = ISD::DELETED_NODE;
9896 V0 = DAG.getUNDEF(VT);
9897 V1 = DAG.getUNDEF(VT);
9898
9899 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9900 // half of the result is calculated independently from the 128-bit halves of
9901 // the inputs, so that makes the index-checking logic below more complicated.
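// For example (illustrative), a 256-bit v8i32 HADD of A and B produces
//   { a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 }
// i.e. each 128-bit half of the result mixes the matching halves of both inputs.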
9902 unsigned NumElts = VT.getVectorNumElements();
9903 unsigned GenericOpcode = ISD::DELETED_NODE;
9904 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9905 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9906 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9907 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9908 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9909 // Ignore undef elements.
9910 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9911 if (Op.isUndef())
9912 continue;
9913
9914 // If there's an opcode mismatch, we're done.
9915 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9916 return false;
9917
9918 // Initialize horizontal opcode.
9919 if (HOpcode == ISD::DELETED_NODE) {
9920 GenericOpcode = Op.getOpcode();
9921 switch (GenericOpcode) {
9922 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9923 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9924 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9925 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9926 default: return false;
9927 }
9928 }
9929
9930 SDValue Op0 = Op.getOperand(0);
9931 SDValue Op1 = Op.getOperand(1);
9932 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9933 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9934 Op0.getOperand(0) != Op1.getOperand(0) ||
9935 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9936 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9937 return false;
9938
9939 // The source vector is chosen based on which 64-bit half of the
9940 // destination vector is being calculated.
9941 if (j < NumEltsIn64Bits) {
9942 if (V0.isUndef())
9943 V0 = Op0.getOperand(0);
9944 } else {
9945 if (V1.isUndef())
9946 V1 = Op0.getOperand(0);
9947 }
9948
9949 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9950 if (SourceVec != Op0.getOperand(0))
9951 return false;
9952
9953 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9954 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9955 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9956 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9957 (j % NumEltsIn64Bits) * 2;
9958 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9959 continue;
9960
9961 // If this is not a commutative op, this does not match.
9962 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9963 return false;
9964
9965 // Addition is commutative, so try swapping the extract indexes.
9966 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9967 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9968 continue;
9969
9970 // Extract indexes do not match horizontal requirement.
9971 return false;
9972 }
9973 }
9974 // We matched. Opcode and operands are returned by reference as arguments.
9975 return true;
9976}
9977
9978static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9979 SelectionDAG &DAG, unsigned HOpcode,
9980 SDValue V0, SDValue V1) {
9981 // If either input vector is not the same size as the build vector,
9982 // extract/insert the low bits to the correct size.
9983 // This is free (examples: zmm --> xmm, xmm --> ymm).
9984 MVT VT = BV->getSimpleValueType(0);
9985 unsigned Width = VT.getSizeInBits();
9986 if (V0.getValueSizeInBits() > Width)
9987 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9988 else if (V0.getValueSizeInBits() < Width)
9989 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9990
9991 if (V1.getValueSizeInBits() > Width)
9992 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9993 else if (V1.getValueSizeInBits() < Width)
9994 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9995
9996 unsigned NumElts = VT.getVectorNumElements();
9997 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9998 for (unsigned i = 0; i != NumElts; ++i)
9999 if (BV->getOperand(i).isUndef())
10000 DemandedElts.clearBit(i);
10001
10002 // If we don't need the upper xmm, then perform as a xmm hop.
10003 unsigned HalfNumElts = NumElts / 2;
10004 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10005 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10006 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10007 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10008 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10009 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10010 }
10011
10012 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10013}
10014
10015/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10016static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10017 const X86Subtarget &Subtarget,
10018 SelectionDAG &DAG) {
10019 // We need at least 2 non-undef elements to make this worthwhile by default.
10020 unsigned NumNonUndefs =
10021 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10022 if (NumNonUndefs < 2)
10023 return SDValue();
10024
10025 // There are 4 sets of horizontal math operations distinguished by type:
10026 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10027 // subtarget feature. Try to match those "native" patterns first.
10028 MVT VT = BV->getSimpleValueType(0);
10029 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10030 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10031 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10032 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10033 unsigned HOpcode;
10034 SDValue V0, V1;
10035 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10036 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10037 }
10038
10039 // Try harder to match 256-bit ops by using extract/concat.
10040 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10041 return SDValue();
10042
10043 // Count the number of UNDEF operands in each half of the input build_vector.
10044 unsigned NumElts = VT.getVectorNumElements();
10045 unsigned Half = NumElts / 2;
10046 unsigned NumUndefsLO = 0;
10047 unsigned NumUndefsHI = 0;
10048 for (unsigned i = 0, e = Half; i != e; ++i)
10049 if (BV->getOperand(i)->isUndef())
10050 NumUndefsLO++;
10051
10052 for (unsigned i = Half, e = NumElts; i != e; ++i)
10053 if (BV->getOperand(i)->isUndef())
10054 NumUndefsHI++;
10055
10056 SDLoc DL(BV);
10057 SDValue InVec0, InVec1;
10058 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10059 SDValue InVec2, InVec3;
10060 unsigned X86Opcode;
10061 bool CanFold = true;
10062
10063 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10064 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10065 InVec3) &&
10066 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10067 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10068 X86Opcode = X86ISD::HADD;
10069 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10070 InVec1) &&
10071 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10072 InVec3) &&
10073 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10074 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10075 X86Opcode = X86ISD::HSUB;
10076 else
10077 CanFold = false;
10078
10079 if (CanFold) {
10080 // Do not try to expand this build_vector into a pair of horizontal
10081 // add/sub if we can emit a pair of scalar add/sub.
10082 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10083 return SDValue();
10084
10085 // Convert this build_vector into a pair of horizontal binops followed by
10086 // a concat vector. We must adjust the outputs from the partial horizontal
10087 // matching calls above to account for undefined vector halves.
10088 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10089 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10090 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10091 bool isUndefLO = NumUndefsLO == Half;
10092 bool isUndefHI = NumUndefsHI == Half;
10093 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10094 isUndefHI);
10095 }
10096 }
10097
10098 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10099 VT == MVT::v16i16) {
10100 unsigned X86Opcode;
10101 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10102 X86Opcode = X86ISD::HADD;
10103 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10104 InVec1))
10105 X86Opcode = X86ISD::HSUB;
10106 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10107 InVec1))
10108 X86Opcode = X86ISD::FHADD;
10109 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10110 InVec1))
10111 X86Opcode = X86ISD::FHSUB;
10112 else
10113 return SDValue();
10114
10115 // Don't try to expand this build_vector into a pair of horizontal add/sub
10116 // if we can simply emit a pair of scalar add/sub.
10117 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10118 return SDValue();
10119
10120 // Convert this build_vector into two horizontal add/sub followed by
10121 // a concat vector.
10122 bool isUndefLO = NumUndefsLO == Half;
10123 bool isUndefHI = NumUndefsHI == Half;
10124 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10125 isUndefLO, isUndefHI);
10126 }
10127
10128 return SDValue();
10129}
10130
10131static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10132 SelectionDAG &DAG);
10133
10134/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10135/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10136/// just apply the bit operation to the vectors.
10137/// NOTE: It's not in our interest to start making a general purpose vectorizer
10138/// from this, but enough scalar bit operations are created from the later
10139/// legalization + scalarization stages to need basic support.
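/// A minimal sketch of the transform (assuming the AND form is legal for VT):
///   (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
///     --> (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))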
10140static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10141 const X86Subtarget &Subtarget,
10142 SelectionDAG &DAG) {
10143 SDLoc DL(Op);
10144 MVT VT = Op->getSimpleValueType(0);
10145 unsigned NumElems = VT.getVectorNumElements();
10146 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10147
10148 // Check that all elements have the same opcode.
10149 // TODO: Should we allow UNDEFS and if so how many?
10150 unsigned Opcode = Op->getOperand(0).getOpcode();
10151 for (unsigned i = 1; i < NumElems; ++i)
10152 if (Opcode != Op->getOperand(i).getOpcode())
10153 return SDValue();
10154
10155 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10156 bool IsShift = false;
10157 switch (Opcode) {
10158 default:
10159 return SDValue();
10160 case ISD::SHL:
10161 case ISD::SRL:
10162 case ISD::SRA:
10163 IsShift = true;
10164 break;
10165 case ISD::AND:
10166 case ISD::XOR:
10167 case ISD::OR:
10168 // Don't do this if the buildvector is a splat - we'd replace one
10169 // constant with an entire vector.
10170 if (Op->getSplatValue())
10171 return SDValue();
10172 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10173 return SDValue();
10174 break;
10175 }
10176
10177 SmallVector<SDValue, 4> LHSElts, RHSElts;
10178 for (SDValue Elt : Op->ops()) {
10179 SDValue LHS = Elt.getOperand(0);
10180 SDValue RHS = Elt.getOperand(1);
10181
10182 // We expect the canonicalized RHS operand to be the constant.
10183 if (!isa<ConstantSDNode>(RHS))
10184 return SDValue();
10185
10186 // Extend shift amounts.
10187 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10188 if (!IsShift)
10189 return SDValue();
10190 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10191 }
10192
10193 LHSElts.push_back(LHS);
10194 RHSElts.push_back(RHS);
10195 }
10196
10197 // Limit to shifts by uniform immediates.
10198 // TODO: Only accept vXi8/vXi64 special cases?
10199 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10200 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10201 return SDValue();
10202
10203 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10204 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10205 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10206
10207 if (!IsShift)
10208 return Res;
10209
10210 // Immediately lower the shift to ensure the constant build vector doesn't
10211 // get converted to a constant pool before the shift is lowered.
10212 return LowerShift(Res, Subtarget, DAG);
10213}
10214
10215/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10216/// functionality to do this, so it's all zeros, all ones, or some derivation
10217/// that is cheap to calculate.
10218static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10219 const X86Subtarget &Subtarget) {
10220 SDLoc DL(Op);
10221 MVT VT = Op.getSimpleValueType();
10222
10223 // Vectors containing all zeros can be matched by pxor and xorps.
10224 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10225 return Op;
10226
10227 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10228 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10229 // vpcmpeqd on 256-bit vectors.
10230 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10231 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10232 return Op;
10233
10234 return getOnesVector(VT, DAG, DL);
10235 }
10236
10237 return SDValue();
10238}
10239
10240/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10241/// from a vector of source values and a vector of extraction indices.
10242/// The vectors might be manipulated to match the type of the permute op.
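/// For illustration: without AVX, a v4i32 variable permute is performed as a
/// v16i8 PSHUFB, so each i32 index is scaled into four consecutive byte
/// indices (see the ScaleIndices lambda below).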
10243static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10244 SDLoc &DL, SelectionDAG &DAG,
10245 const X86Subtarget &Subtarget) {
10246 MVT ShuffleVT = VT;
10247 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10248 unsigned NumElts = VT.getVectorNumElements();
10249 unsigned SizeInBits = VT.getSizeInBits();
10250
10251 // Adjust IndicesVec to match VT size.
10252 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10253 "Illegal variable permute mask size");
10254 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10255 // Narrow/widen the indices vector to the correct size.
10256 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10257 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10258 NumElts * VT.getScalarSizeInBits());
10259 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10260 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10261 SDLoc(IndicesVec), SizeInBits);
10262 // Zero-extend the index elements within the vector.
10263 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10264 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10265 IndicesVT, IndicesVec);
10266 }
10267 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10268
10269 // Handle a SrcVec whose size doesn't match VT.
10270 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10271 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10272 // Handle larger SrcVec by treating it as a larger permute.
10273 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10274 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10275 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10276 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10277 Subtarget, DAG, SDLoc(IndicesVec));
10278 SDValue NewSrcVec =
10279 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10280 if (NewSrcVec)
10281 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10282 return SDValue();
10283 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10284 // Widen smaller SrcVec to match VT.
10285 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10286 } else
10287 return SDValue();
10288 }
10289
10290 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10291 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10292 EVT SrcVT = Idx.getValueType();
10293 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10294 uint64_t IndexScale = 0;
10295 uint64_t IndexOffset = 0;
10296
10297 // If we're scaling a smaller permute op, then we need to repeat the
10298 // indices, scaling and offsetting them as well.
10299 // e.g. v4i32 -> v16i8 (Scale = 4)
10300 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10301 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10302 for (uint64_t i = 0; i != Scale; ++i) {
10303 IndexScale |= Scale << (i * NumDstBits);
10304 IndexOffset |= i << (i * NumDstBits);
10305 }
10306
10307 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10308 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10309 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10310 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10311 return Idx;
10312 };
10313
10314 unsigned Opcode = 0;
10315 switch (VT.SimpleTy) {
10316 default:
10317 break;
10318 case MVT::v16i8:
10319 if (Subtarget.hasSSSE3())
10320 Opcode = X86ISD::PSHUFB;
10321 break;
10322 case MVT::v8i16:
10323 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10324 Opcode = X86ISD::VPERMV;
10325 else if (Subtarget.hasSSSE3()) {
10326 Opcode = X86ISD::PSHUFB;
10327 ShuffleVT = MVT::v16i8;
10328 }
10329 break;
10330 case MVT::v4f32:
10331 case MVT::v4i32:
10332 if (Subtarget.hasAVX()) {
10333 Opcode = X86ISD::VPERMILPV;
10334 ShuffleVT = MVT::v4f32;
10335 } else if (Subtarget.hasSSSE3()) {
10336 Opcode = X86ISD::PSHUFB;
10337 ShuffleVT = MVT::v16i8;
10338 }
10339 break;
10340 case MVT::v2f64:
10341 case MVT::v2i64:
10342 if (Subtarget.hasAVX()) {
10343 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10344 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10345 Opcode = X86ISD::VPERMILPV;
10346 ShuffleVT = MVT::v2f64;
10347 } else if (Subtarget.hasSSE41()) {
10348 // SSE41 can compare v2i64 - select between indices 0 and 1.
10349 return DAG.getSelectCC(
10350 DL, IndicesVec,
10351 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10352 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10353 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10354 ISD::CondCode::SETEQ);
10355 }
10356 break;
10357 case MVT::v32i8:
10358 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10359 Opcode = X86ISD::VPERMV;
10360 else if (Subtarget.hasXOP()) {
10361 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10362 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10363 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10364 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10365 return DAG.getNode(
10366 ISD::CONCAT_VECTORS, DL, VT,
10367 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10368 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10369 } else if (Subtarget.hasAVX()) {
10370 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10371 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10372 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10373 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10374 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10375 ArrayRef<SDValue> Ops) {
10376 // Permute Lo and Hi and then select based on index range.
10377 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
10378 // care about bit[7] as it's just an index vector.
10379 SDValue Idx = Ops[2];
10380 EVT VT = Idx.getValueType();
10381 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10382 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10383 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10384 ISD::CondCode::SETGT);
10385 };
10386 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10387 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10388 PSHUFBBuilder);
10389 }
10390 break;
10391 case MVT::v16i16:
10392 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10393 Opcode = X86ISD::VPERMV;
10394 else if (Subtarget.hasAVX()) {
10395 // Scale to v32i8 and perform as v32i8.
10396 IndicesVec = ScaleIndices(IndicesVec, 2);
10397 return DAG.getBitcast(
10398 VT, createVariablePermute(
10399 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10400 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10401 }
10402 break;
10403 case MVT::v8f32:
10404 case MVT::v8i32:
10405 if (Subtarget.hasAVX2())
10406 Opcode = X86ISD::VPERMV;
10407 else if (Subtarget.hasAVX()) {
10408 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10409 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10410 {0, 1, 2, 3, 0, 1, 2, 3});
10411 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10412 {4, 5, 6, 7, 4, 5, 6, 7});
10413 if (Subtarget.hasXOP())
10414 return DAG.getBitcast(
10415 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10416 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10417 // Permute Lo and Hi and then select based on index range.
10418 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10419 SDValue Res = DAG.getSelectCC(
10420 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10421 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10422 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10423 ISD::CondCode::SETGT);
10424 return DAG.getBitcast(VT, Res);
10425 }
10426 break;
10427 case MVT::v4i64:
10428 case MVT::v4f64:
10429 if (Subtarget.hasAVX512()) {
10430 if (!Subtarget.hasVLX()) {
10431 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10432 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10433 SDLoc(SrcVec));
10434 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10435 DAG, SDLoc(IndicesVec));
10436 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10437 DAG, Subtarget);
10438 return extract256BitVector(Res, 0, DAG, DL);
10439 }
10440 Opcode = X86ISD::VPERMV;
10441 } else if (Subtarget.hasAVX()) {
10442 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10443 SDValue LoLo =
10444 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10445 SDValue HiHi =
10446 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10447 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10448 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10449 if (Subtarget.hasXOP())
10450 return DAG.getBitcast(
10451 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10452 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10453 // Permute Lo and Hi and then select based on index range.
10454 // This works as VPERMILPD only uses index bit[1] to permute elements.
10455 SDValue Res = DAG.getSelectCC(
10456 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10457 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10458 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10459 ISD::CondCode::SETGT);
10460 return DAG.getBitcast(VT, Res);
10461 }
10462 break;
10463 case MVT::v64i8:
10464 if (Subtarget.hasVBMI())
10465 Opcode = X86ISD::VPERMV;
10466 break;
10467 case MVT::v32i16:
10468 if (Subtarget.hasBWI())
10469 Opcode = X86ISD::VPERMV;
10470 break;
10471 case MVT::v16f32:
10472 case MVT::v16i32:
10473 case MVT::v8f64:
10474 case MVT::v8i64:
10475 if (Subtarget.hasAVX512())
10476 Opcode = X86ISD::VPERMV;
10477 break;
10478 }
10479 if (!Opcode)
10480 return SDValue();
10481
10482 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10483 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10484 "Illegal variable permute shuffle type");
10485
10486 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10487 if (Scale > 1)
10488 IndicesVec = ScaleIndices(IndicesVec, Scale);
10489
10490 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10491 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10492
10493 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10494 SDValue Res = Opcode == X86ISD::VPERMV
10495 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10496 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10497 return DAG.getBitcast(VT, Res);
10498}
10499
10500// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10501// reasoned to be a permutation of a vector by indices in a non-constant vector.
10502// (build_vector (extract_elt V, (extract_elt I, 0)),
10503// (extract_elt V, (extract_elt I, 1)),
10504// ...
10505// ->
10506// (vpermv I, V)
10507//
10508// TODO: Handle undefs
10509// TODO: Utilize pshufb and zero mask blending to support more efficient
10510// construction of vectors with constant-0 elements.
10511static SDValue
10512LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10513 const X86Subtarget &Subtarget) {
10514 SDValue SrcVec, IndicesVec;
10515 // Check for a match of the permute source vector and permute index elements.
10516 // This is done by checking that the i-th build_vector operand is of the form:
10517 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10518 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10519 SDValue Op = V.getOperand(Idx);
10520 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10521 return SDValue();
10522
10523 // If this is the first extract encountered in V, set the source vector,
10524 // otherwise verify the extract is from the previously defined source
10525 // vector.
10526 if (!SrcVec)
10527 SrcVec = Op.getOperand(0);
10528 else if (SrcVec != Op.getOperand(0))
10529 return SDValue();
10530 SDValue ExtractedIndex = Op->getOperand(1);
10531 // Peek through extends.
10532 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10533 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10534 ExtractedIndex = ExtractedIndex.getOperand(0);
10535 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10536 return SDValue();
10537
10538 // If this is the first extract from the index vector candidate, set the
10539 // indices vector, otherwise verify the extract is from the previously
10540 // defined indices vector.
10541 if (!IndicesVec)
10542 IndicesVec = ExtractedIndex.getOperand(0);
10543 else if (IndicesVec != ExtractedIndex.getOperand(0))
10544 return SDValue();
10545
10546 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10547 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10548 return SDValue();
10549 }
10550
10551 SDLoc DL(V);
10552 MVT VT = V.getSimpleValueType();
10553 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10554}
10555
10556SDValue
10557X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10558 SDLoc dl(Op);
10559
10560 MVT VT = Op.getSimpleValueType();
10561 MVT EltVT = VT.getVectorElementType();
10562 unsigned NumElems = Op.getNumOperands();
10563
10564 // Lower vXi1 (predicate) build vectors separately.
10565 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10566 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10567
10568 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10569 return VectorConstant;
10570
10571 unsigned EVTBits = EltVT.getSizeInBits();
10572 APInt UndefMask = APInt::getNullValue(NumElems);
10573 APInt ZeroMask = APInt::getNullValue(NumElems);
10574 APInt NonZeroMask = APInt::getNullValue(NumElems);
10575 bool IsAllConstants = true;
10576 SmallSet<SDValue, 8> Values;
10577 unsigned NumConstants = NumElems;
10578 for (unsigned i = 0; i < NumElems; ++i) {
10579 SDValue Elt = Op.getOperand(i);
10580 if (Elt.isUndef()) {
10581 UndefMask.setBit(i);
10582 continue;
10583 }
10584 Values.insert(Elt);
10585 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10586 IsAllConstants = false;
10587 NumConstants--;
10588 }
10589 if (X86::isZeroNode(Elt)) {
10590 ZeroMask.setBit(i);
10591 } else {
10592 NonZeroMask.setBit(i);
10593 }
10594 }
10595
10596 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10597 if (NonZeroMask == 0) {
10598 assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
10599 return DAG.getUNDEF(VT);
10600 }
10601
10602 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10603
10604 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10605 // lowering to a smaller build vector and padding with undef/zero.
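// e.g. (illustrative) a v8i32 whose upper four elements are all zero can be
// lowered as a v4i32 build_vector that is then widened with a zeroed upper half.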
10606 if ((VT.is256BitVector() || VT.is512BitVector()) &&
10607 !isFoldableUseOfShuffle(BV)) {
10608 unsigned UpperElems = NumElems / 2;
10609 APInt UndefOrZeroMask = UndefMask | ZeroMask;
10610 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10611 if (NumUpperUndefsOrZeros >= UpperElems) {
10612 if (VT.is512BitVector() &&
10613 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10614 UpperElems = NumElems - (NumElems / 4);
10615 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10616 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10617 SDValue NewBV =
10618 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10619 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10620 }
10621 }
10622
10623 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10624 return AddSub;
10625 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10626 return HorizontalOp;
10627 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10628 return Broadcast;
10629 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10630 return BitOp;
10631
10632 unsigned NumZero = ZeroMask.countPopulation();
10633 unsigned NumNonZero = NonZeroMask.countPopulation();
10634
10635 // If we are inserting one variable into a vector of non-zero constants, try
10636 // to avoid loading each constant element as a scalar. Load the constants as a
10637 // vector and then insert the variable scalar element. If insertion is not
10638 // supported, fall back to a shuffle to get the scalar blended with the
10639 // constants. Insertion into a zero vector is handled as a special-case
10640 // somewhere below here.
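// For example (illustrative): <i32 1, i32 %x, i32 3, i32 4> becomes a load of
// the constant vector <1, undef, 3, 4> followed by an insertelement of %x at
// index 1 (or a shuffle when inserting into the high part of a wide vector).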
10641 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10642 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10643 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10644 // Create an all-constant vector. The variable element in the old
10645 // build vector is replaced by undef in the constant vector. Save the
10646 // variable scalar element and its index for use in the insertelement.
10647 LLVMContext &Context = *DAG.getContext();
10648 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10649 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10650 SDValue VarElt;
10651 SDValue InsIndex;
10652 for (unsigned i = 0; i != NumElems; ++i) {
10653 SDValue Elt = Op.getOperand(i);
10654 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10655 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10656 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10657 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10658 else if (!Elt.isUndef()) {
10659 assert(!VarElt.getNode() && !InsIndex.getNode() &&
10660 "Expected one variable element in this vector");
10661 VarElt = Elt;
10662 InsIndex = DAG.getVectorIdxConstant(i, dl);
10663 }
10664 }
10665 Constant *CV = ConstantVector::get(ConstVecOps);
10666 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10667
10668 // The constants we just created may not be legal (e.g., floating point). We
10669 // must lower the vector right here because we cannot guarantee that we'll
10670 // legalize it before loading it. This is also why we could not just create
10671 // a new build vector here. If the build vector contains illegal constants,
10672 // it could get split back up into a series of insert elements.
10673 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10674 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10675 MachineFunction &MF = DAG.getMachineFunction();
10676 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10677 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10678 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10679 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10680 if (InsertC < NumEltsInLow128Bits)
10681 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10682
10683 // There's no good way to insert into the high elements of a >128-bit
10684 // vector, so use shuffles to avoid an extract/insert sequence.
10685 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10686 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10687 SmallVector<int, 8> ShuffleMask;
10688 unsigned NumElts = VT.getVectorNumElements();
10689 for (unsigned i = 0; i != NumElts; ++i)
10690 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10691 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10692 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10693 }
10694
10695 // Special case for single non-zero, non-undef, element.
10696 if (NumNonZero == 1) {
10697 unsigned Idx = NonZeroMask.countTrailingZeros();
10698 SDValue Item = Op.getOperand(Idx);
10699
10700 // If we have a constant or non-constant insertion into the low element of
10701 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10702 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10703 // depending on what the source datatype is.
10704 if (Idx == 0) {
10705 if (NumZero == 0)
10706 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10707
10708 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
10709 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
10710 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
10711 assert((VT.is128BitVector() || VT.is256BitVector() ||
10712 VT.is512BitVector()) &&
10713 "Expected an SSE value type!");
10714 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10715 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
10716 // zero vector.
10717 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10718 }
10719
10720 // We can't directly insert an i8 or i16 into a vector, so zero extend
10721 // it to i32 first.
10722 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10723 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10724 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10725 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10726 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10727 return DAG.getBitcast(VT, Item);
10728 }
10729 }
10730
10731 // Is it a vector logical left shift?
10732 if (NumElems == 2 && Idx == 1 &&
10733 X86::isZeroNode(Op.getOperand(0)) &&
10734 !X86::isZeroNode(Op.getOperand(1))) {
10735 unsigned NumBits = VT.getSizeInBits();
10736 return getVShift(true, VT,
10737 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10738 VT, Op.getOperand(1)),
10739 NumBits/2, DAG, *this, dl);
10740 }
10741
10742 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10743 return SDValue();
10744
10745 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10746 // is a non-constant being inserted into an element other than the low one,
10747 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10748 // movd/movss) to move this into the low element, then shuffle it into
10749 // place.
10750 if (EVTBits == 32) {
10751 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10752 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10753 }
10754 }
10755
10756 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10757 if (Values.size() == 1) {
10758 if (EVTBits == 32) {
10759 // Instead of a shuffle like this:
10760 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10761 // Check if it's possible to issue this instead.
10762 // shuffle (vload ptr), undef, <1, 1, 1, 1>
10763 unsigned Idx = NonZeroMask.countTrailingZeros();
10764 SDValue Item = Op.getOperand(Idx);
10765 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10766 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10767 }
10768 return SDValue();
10769 }
10770
10771 // A vector full of immediates; various special cases are already
10772 // handled, so this is best done with a single constant-pool load.
10773 if (IsAllConstants)
10774 return SDValue();
10775
10776 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10777 return V;
10778
10779 // See if we can use a vector load to get all of the elements.
10780 {
10781 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10782 if (SDValue LD =
10783 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10784 return LD;
10785 }
10786
10787 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10788 // build_vector and broadcast it.
10789 // TODO: We could probably generalize this more.
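// Sketch of the idea: v8i32 <x, y, x, y, x, y, x, y> is rebuilt as the
// build_vector <x, y, undef, undef>, bitcast to v2i64, broadcast to v4i64
// with X86ISD::VBROADCAST, and finally bitcast back to v8i32.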
10790 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10791 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10792 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10793 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10794 // Make sure all the even/odd operands match.
10795 for (unsigned i = 2; i != NumElems; ++i)
10796 if (Ops[i % 2] != Op.getOperand(i))
10797 return false;
10798 return true;
10799 };
10800 if (CanSplat(Op, NumElems, Ops)) {
10801 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10802 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10803 // Create a new build vector and cast to v2i64/v2f64.
10804 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10805 DAG.getBuildVector(NarrowVT, dl, Ops));
10806 // Broadcast from v2i64/v2f64 and cast to final VT.
10807 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10808 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10809 NewBV));
10810 }
10811 }
10812
10813 // For AVX-length vectors, build the individual 128-bit pieces and use
10814 // shuffles to put them in place.
10815 if (VT.getSizeInBits() > 128) {
10816 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10817
10818 // Build both the lower and upper subvector.
10819 SDValue Lower =
10820 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10821 SDValue Upper = DAG.getBuildVector(
10822 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10823
10824 // Recreate the wider vector with the lower and upper part.
10825 return concatSubVectors(Lower, Upper, DAG, dl);
10826 }
10827
10828 // Let legalizer expand 2-wide build_vectors.
10829 if (EVTBits == 64) {
10830 if (NumNonZero == 1) {
10831 // One half is zero or undef.
10832 unsigned Idx = NonZeroMask.countTrailingZeros();
10833 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10834 Op.getOperand(Idx));
10835 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10836 }
10837 return SDValue();
10838 }
10839
10840 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10841 if (EVTBits == 8 && NumElems == 16)
10842 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10843 DAG, Subtarget))
10844 return V;
10845
10846 if (EltVT == MVT::i16 && NumElems == 8)
10847 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10848 DAG, Subtarget))
10849 return V;
10850
10851 // If element VT is 32 bits and the vector has 4 elements, try to generate an INSERTPS
10852 if (EVTBits == 32 && NumElems == 4)
10853 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10854 return V;
10855
10856 // If element VT is == 32 bits, turn it into a number of shuffles.
10857 if (NumElems == 4 && NumZero > 0) {
10858 SmallVector<SDValue, 8> Ops(NumElems);
10859 for (unsigned i = 0; i < 4; ++i) {
10860 bool isZero = !NonZeroMask[i];
10861 if (isZero)
10862 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10863 else
10864 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10865 }
10866
10867 for (unsigned i = 0; i < 2; ++i) {
10868 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10869 default: llvm_unreachable("Unexpected NonZero count");
10870 case 0:
10871 Ops[i] = Ops[i*2]; // Must be a zero vector.
10872 break;
10873 case 1:
10874 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10875 break;
10876 case 2:
10877 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10878 break;
10879 case 3:
10880 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10881 break;
10882 }
10883 }
10884
10885 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10886 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10887 int MaskVec[] = {
10888 Reverse1 ? 1 : 0,
10889 Reverse1 ? 0 : 1,
10890 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10891 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10892 };
10893 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10894 }
10895
10896 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10897
10898 // Check for a build vector built mostly from a shuffle plus a few insertions.
10899 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10900 return Sh;
10901
10902 // For SSE 4.1, use insertps to put the high elements into the low element.
10903 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
10904 SDValue Result;
10905 if (!Op.getOperand(0).isUndef())
10906 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10907 else
10908 Result = DAG.getUNDEF(VT);
10909
10910 for (unsigned i = 1; i < NumElems; ++i) {
10911 if (Op.getOperand(i).isUndef()) continue;
10912 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10913 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10914 }
10915 return Result;
10916 }
10917
10918 // Otherwise, expand into a number of unpckl*, start by extending each of
10919 // our (non-undef) elements to the full vector width with the element in the
10920 // bottom slot of the vector (which generates no code for SSE).
10921 SmallVector<SDValue, 8> Ops(NumElems);
10922 for (unsigned i = 0; i < NumElems; ++i) {
10923 if (!Op.getOperand(i).isUndef())
10924 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10925 else
10926 Ops[i] = DAG.getUNDEF(VT);
10927 }
10928
10929 // Next, we iteratively mix elements, e.g. for v4f32:
10930 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10931 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10932 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10933 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10934 // Generate scaled UNPCKL shuffle mask.
10935 SmallVector<int, 16> Mask;
10936 for(unsigned i = 0; i != Scale; ++i)
10937 Mask.push_back(i);
10938 for (unsigned i = 0; i != Scale; ++i)
10939 Mask.push_back(NumElems+i);
10940 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10941
10942 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10943 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10944 }
10945 return Ops[0];
10946}
10947
10948// 256-bit AVX can use the vinsertf128 instruction
10949// to create 256-bit vectors from two other 128-bit ones.
10950// TODO: Detect subvector broadcast here instead of DAG combine?
10951static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10952 const X86Subtarget &Subtarget) {
10953 SDLoc dl(Op);
10954 MVT ResVT = Op.getSimpleValueType();
10955
10956  assert((ResVT.is256BitVector() ||
10957          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10958
10959 unsigned NumOperands = Op.getNumOperands();
10960 unsigned NumZero = 0;
10961 unsigned NumNonZero = 0;
10962 unsigned NonZeros = 0;
10963 for (unsigned i = 0; i != NumOperands; ++i) {
10964 SDValue SubVec = Op.getOperand(i);
10965 if (SubVec.isUndef())
10966 continue;
10967 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10968 ++NumZero;
10969 else {
10970      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10971 NonZeros |= 1 << i;
10972 ++NumNonZero;
10973 }
10974 }
10975
10976 // If we have more than 2 non-zeros, build each half separately.
10977 if (NumNonZero > 2) {
10978 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10979 ArrayRef<SDUse> Ops = Op->ops();
10980 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10981 Ops.slice(0, NumOperands/2));
10982 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10983 Ops.slice(NumOperands/2));
10984 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10985 }
10986
10987 // Otherwise, build it up through insert_subvectors.
10988 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10989 : DAG.getUNDEF(ResVT);
10990
10991 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10992 unsigned NumSubElems = SubVT.getVectorNumElements();
10993 for (unsigned i = 0; i != NumOperands; ++i) {
10994 if ((NonZeros & (1 << i)) == 0)
10995 continue;
10996
10997 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10998 Op.getOperand(i),
10999 DAG.getIntPtrConstant(i * NumSubElems, dl));
11000 }
11001
11002 return Vec;
11003}
11004
11005// Lower a vXi1 CONCAT_VECTORS, e.g. a type promotion (by concatenating i1
11006// zeros) of the result of a node that already zeros all upper bits of the
11007// k-register.
11008// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11009static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11010 const X86Subtarget &Subtarget,
11011 SelectionDAG & DAG) {
11012 SDLoc dl(Op);
11013 MVT ResVT = Op.getSimpleValueType();
11014 unsigned NumOperands = Op.getNumOperands();
11015
11016  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11017         "Unexpected number of operands in CONCAT_VECTORS");
11018
11019 uint64_t Zeros = 0;
11020 uint64_t NonZeros = 0;
11021 for (unsigned i = 0; i != NumOperands; ++i) {
11022 SDValue SubVec = Op.getOperand(i);
11023 if (SubVec.isUndef())
11024 continue;
11025    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11026 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11027 Zeros |= (uint64_t)1 << i;
11028 else
11029 NonZeros |= (uint64_t)1 << i;
11030 }
11031
11032 unsigned NumElems = ResVT.getVectorNumElements();
11033
11034  // If we are inserting a non-zero vector and there are zeros in the LSBs and
11035  // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11036  // insert_subvector will give us two kshifts.
11037 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11038 Log2_64(NonZeros) != NumOperands - 1) {
11039 MVT ShiftVT = ResVT;
11040 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11041 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11042 unsigned Idx = Log2_64(NonZeros);
11043 SDValue SubVec = Op.getOperand(Idx);
11044 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11045 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11046 DAG.getUNDEF(ShiftVT), SubVec,
11047 DAG.getIntPtrConstant(0, dl));
11048 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11049 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11050 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11051 DAG.getIntPtrConstant(0, dl));
11052 }
11053
11054 // If there are zero or one non-zeros we can handle this very simply.
11055 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11056 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11057 if (!NonZeros)
11058 return Vec;
11059 unsigned Idx = Log2_64(NonZeros);
11060 SDValue SubVec = Op.getOperand(Idx);
11061 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11062 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11063 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11064 }
11065
11066 if (NumOperands > 2) {
11067 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11068 ArrayRef<SDUse> Ops = Op->ops();
11069 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11070 Ops.slice(0, NumOperands/2));
11071 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11072 Ops.slice(NumOperands/2));
11073 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11074 }
11075
11076  assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
11077
11078 if (ResVT.getVectorNumElements() >= 16)
11079 return Op; // The operation is legal with KUNPCK
11080
11081 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11082 DAG.getUNDEF(ResVT), Op.getOperand(0),
11083 DAG.getIntPtrConstant(0, dl));
11084 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11085 DAG.getIntPtrConstant(NumElems/2, dl));
11086}
11087
11088static SDValue LowerCONCAT_VECTORS(SDValue Op,
11089 const X86Subtarget &Subtarget,
11090 SelectionDAG &DAG) {
11091 MVT VT = Op.getSimpleValueType();
11092 if (VT.getVectorElementType() == MVT::i1)
11093 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11094
11095  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11096         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11097                                  Op.getNumOperands() == 4)));
11098
11099 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11100 // from two other 128-bit ones.
11101
11102 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11103 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11104}
11105
11106//===----------------------------------------------------------------------===//
11107// Vector shuffle lowering
11108//
11109// This is an experimental code path for lowering vector shuffles on x86. It is
11110// designed to handle arbitrary vector shuffles and blends, gracefully
11111// degrading performance as necessary. It works hard to recognize idiomatic
11112// shuffles and lower them to optimal instruction patterns without leaving
11113// a framework that allows reasonably efficient handling of all vector shuffle
11114// patterns.
11115//===----------------------------------------------------------------------===//
11116
11117/// Tiny helper function to identify a no-op mask.
11118///
11119/// This is a somewhat boring predicate function. It checks whether the mask
11120/// array input, which is assumed to be a single-input shuffle mask of the kind
11121/// used by the X86 shuffle instructions (not a fully general
11122/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11123/// in-place shuffle are 'no-op's.
11124static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11125 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11126    assert(Mask[i] >= -1 && "Out of bound mask element!");
11127 if (Mask[i] >= 0 && Mask[i] != i)
11128 return false;
11129 }
11130 return true;
11131}
11132
11133/// Test whether there are elements crossing LaneSizeInBits lanes in this
11134/// shuffle mask.
11135///
11136/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11137/// and we routinely test for these.
11138static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11139 unsigned ScalarSizeInBits,
11140 ArrayRef<int> Mask) {
11141  assert(LaneSizeInBits && ScalarSizeInBits &&
11142         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11143         "Illegal shuffle lane size");
11144 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11145 int Size = Mask.size();
11146 for (int i = 0; i < Size; ++i)
11147 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11148 return true;
11149 return false;
11150}
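
// Editor's note: a minimal standalone sketch of the lane-crossing test above,
// using std::vector in place of llvm::ArrayRef. The helper name laneCrossing
// and the sample masks are illustrative only, not part of the LLVM API.
#include <cassert>
#include <vector>

static bool laneCrossing(unsigned LaneBits, unsigned ScalarBits,
                         const std::vector<int> &Mask) {
  int LaneSize = LaneBits / ScalarBits; // elements per lane
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // v8i32 with 128-bit lanes => 4 elements per lane.
  std::vector<int> InLane = {0, 1, 2, 3, 4, 5, 6, 7}; // identity, stays in-lane
  std::vector<int> Cross  = {4, 5, 6, 7, 0, 1, 2, 3}; // swaps the two lanes
  assert(!laneCrossing(128, 32, InLane));
  assert(laneCrossing(128, 32, Cross));
  return 0;
}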
11151
11152/// Test whether there are elements crossing 128-bit lanes in this
11153/// shuffle mask.
11154static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11155 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11156}
11157
11158/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11159/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11160/// better support 'repeated mask + lane permute' style shuffles.
11161static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11162 unsigned ScalarSizeInBits,
11163 ArrayRef<int> Mask) {
11164  assert(LaneSizeInBits && ScalarSizeInBits &&
11165         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11166         "Illegal shuffle lane size");
11167 int NumElts = Mask.size();
11168 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11169 int NumLanes = NumElts / NumEltsPerLane;
11170 if (NumLanes > 1) {
11171 for (int i = 0; i != NumLanes; ++i) {
11172 int SrcLane = -1;
11173 for (int j = 0; j != NumEltsPerLane; ++j) {
11174 int M = Mask[(i * NumEltsPerLane) + j];
11175 if (M < 0)
11176 continue;
11177 int Lane = (M % NumElts) / NumEltsPerLane;
11178 if (SrcLane >= 0 && SrcLane != Lane)
11179 return true;
11180 SrcLane = Lane;
11181 }
11182 }
11183 }
11184 return false;
11185}
11186
11187/// Test whether a shuffle mask is equivalent within each sub-lane.
11188///
11189/// This checks a shuffle mask to see if it is performing the same
11190/// lane-relative shuffle in each sub-lane. This trivially implies
11191/// that it is also not lane-crossing. It may however involve a blend from the
11192/// same lane of a second vector.
11193///
11194/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11195/// non-trivial to compute in the face of undef lanes. The representation is
11196/// suitable for use with existing 128-bit shuffles as entries from the second
11197/// vector have been remapped to [LaneSize, 2*LaneSize).
11198static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11199 ArrayRef<int> Mask,
11200 SmallVectorImpl<int> &RepeatedMask) {
11201 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11202 RepeatedMask.assign(LaneSize, -1);
11203 int Size = Mask.size();
11204 for (int i = 0; i < Size; ++i) {
11205    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11206 if (Mask[i] < 0)
11207 continue;
11208 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11209 // This entry crosses lanes, so there is no way to model this shuffle.
11210 return false;
11211
11212 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11213 // Adjust second vector indices to start at LaneSize instead of Size.
11214 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11215 : Mask[i] % LaneSize + LaneSize;
11216 if (RepeatedMask[i % LaneSize] < 0)
11217 // This is the first non-undef entry in this slot of a 128-bit lane.
11218 RepeatedMask[i % LaneSize] = LocalM;
11219 else if (RepeatedMask[i % LaneSize] != LocalM)
11220 // Found a mismatch with the repeated mask.
11221 return false;
11222 }
11223 return true;
11224}
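
// Editor's note: a standalone sketch of the repeated-mask detection above with
// std:: containers; -1 stands in for SM_SentinelUndef and the sample mask is
// illustrative. Entries from the second source are remapped into
// [LaneSize, 2*LaneSize), exactly as the doc comment describes.
#include <cassert>
#include <vector>

static bool repeatedMask(int LaneSizeElts, const std::vector<int> &Mask,
                         std::vector<int> &Repeated) {
  Repeated.assign(LaneSizeElts, -1);
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSizeElts != i / LaneSizeElts)
      return false; // crosses lanes, cannot model as a repeated mask
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSizeElts
                                : Mask[i] % LaneSizeElts + LaneSizeElts;
    if (Repeated[i % LaneSizeElts] < 0)
      Repeated[i % LaneSizeElts] = LocalM; // first entry for this slot
    else if (Repeated[i % LaneSizeElts] != LocalM)
      return false;                        // lanes disagree
  }
  return true;
}

int main() {
  // v8i32 (two 128-bit lanes of 4): a per-lane blend of V1 and V2.
  std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15};
  std::vector<int> Rep;
  assert(repeatedMask(4, Mask, Rep));
  assert(Rep == std::vector<int>({0, 5, 2, 7})); // same shuffle in each lane
  return 0;
}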
11225
11226/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11227static bool
11228is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11229 SmallVectorImpl<int> &RepeatedMask) {
11230 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11231}
11232
11233static bool
11234is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11235 SmallVector<int, 32> RepeatedMask;
11236 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11237}
11238
11239/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11240static bool
11241is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11242 SmallVectorImpl<int> &RepeatedMask) {
11243 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11244}
11245
11246/// Test whether a target shuffle mask is equivalent within each sub-lane.
11247/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11248static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11249 unsigned EltSizeInBits,
11250 ArrayRef<int> Mask,
11251 SmallVectorImpl<int> &RepeatedMask) {
11252 int LaneSize = LaneSizeInBits / EltSizeInBits;
11253 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11254 int Size = Mask.size();
11255 for (int i = 0; i < Size; ++i) {
11256    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11257 if (Mask[i] == SM_SentinelUndef)
11258 continue;
11259 if (Mask[i] == SM_SentinelZero) {
11260 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11261 return false;
11262 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11263 continue;
11264 }
11265 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11266 // This entry crosses lanes, so there is no way to model this shuffle.
11267 return false;
11268
11269 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11270 // later vector indices to start at multiples of LaneSize instead of Size.
11271 int LaneM = Mask[i] / Size;
11272 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11273 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11274 // This is the first non-undef entry in this slot of a 128-bit lane.
11275 RepeatedMask[i % LaneSize] = LocalM;
11276 else if (RepeatedMask[i % LaneSize] != LocalM)
11277 // Found a mismatch with the repeated mask.
11278 return false;
11279 }
11280 return true;
11281}
11282
11283/// Test whether a target shuffle mask is equivalent within each sub-lane.
11284/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11285static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11286 ArrayRef<int> Mask,
11287 SmallVectorImpl<int> &RepeatedMask) {
11288 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11289 Mask, RepeatedMask);
11290}
11291
11292/// Checks whether the vector elements referenced by two shuffle masks are
11293/// equivalent.
11294static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11295 int Idx, int ExpectedIdx) {
11296  assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11297         ExpectedIdx < MaskSize && "Out of range element index");
11298 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11299 return false;
11300
11301 switch (Op.getOpcode()) {
11302 case ISD::BUILD_VECTOR:
11303 // If the values are build vectors, we can look through them to find
11304 // equivalent inputs that make the shuffles equivalent.
11305 // TODO: Handle MaskSize != Op.getNumOperands()?
11306 if (MaskSize == (int)Op.getNumOperands() &&
11307 MaskSize == (int)ExpectedOp.getNumOperands())
11308 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11309 break;
11310 case X86ISD::VBROADCAST:
11311 case X86ISD::VBROADCAST_LOAD:
11312 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11313 return (Op == ExpectedOp &&
11314 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11315 case X86ISD::HADD:
11316 case X86ISD::HSUB:
11317 case X86ISD::FHADD:
11318 case X86ISD::FHSUB:
11319 case X86ISD::PACKSS:
11320 case X86ISD::PACKUS:
11321 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11322 // TODO: Handle MaskSize != NumElts?
11323 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11324 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11325 MVT VT = Op.getSimpleValueType();
11326 int NumElts = VT.getVectorNumElements();
11327 if (MaskSize == NumElts) {
11328 int NumLanes = VT.getSizeInBits() / 128;
11329 int NumEltsPerLane = NumElts / NumLanes;
11330 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11331 bool SameLane =
11332 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11333 bool SameElt =
11334 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11335 return SameLane && SameElt;
11336 }
11337 }
11338 break;
11339 }
11340
11341 return false;
11342}
11343
11344/// Checks whether a shuffle mask is equivalent to an explicit list of
11345/// arguments.
11346///
11347/// This is a fast way to test a shuffle mask against a fixed pattern:
11348///
11349/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11350///
11351/// It returns true if the mask is exactly as wide as the argument list, and
11352/// each element of the mask is either -1 (signifying undef) or the value given
11353/// in the argument.
11354static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11355 SDValue V1 = SDValue(),
11356 SDValue V2 = SDValue()) {
11357 int Size = Mask.size();
11358 if (Size != (int)ExpectedMask.size())
11359 return false;
11360
11361 for (int i = 0; i < Size; ++i) {
11362    assert(Mask[i] >= -1 && "Out of bound mask element!");
11363 int MaskIdx = Mask[i];
11364 int ExpectedIdx = ExpectedMask[i];
11365 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11366 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11367 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11368 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11369 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11370 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11371 return false;
11372 }
11373 }
11374 return true;
11375}
11376
11377/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11378///
11379/// The masks must be exactly the same width.
11380///
11381/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11382/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11383///
11384/// SM_SentinelZero is accepted as a valid negative index but must match in
11385/// both.
11386static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11387 ArrayRef<int> ExpectedMask,
11388 SDValue V1 = SDValue(),
11389 SDValue V2 = SDValue()) {
11390 int Size = Mask.size();
11391 if (Size != (int)ExpectedMask.size())
11392 return false;
11393  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11394         "Illegal target shuffle mask");
11395
11396 // Check for out-of-range target shuffle mask indices.
11397 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11398 return false;
11399
11400 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11401 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11402 V1 = SDValue();
11403 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11404 V2 = SDValue();
11405
11406 for (int i = 0; i < Size; ++i) {
11407 int MaskIdx = Mask[i];
11408 int ExpectedIdx = ExpectedMask[i];
11409 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11410 continue;
11411 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11412 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11413 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11414 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11415 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11416 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11417 continue;
11418 }
11419 // TODO - handle SM_Sentinel equivalences.
11420 return false;
11421 }
11422 return true;
11423}
11424
11425// Attempt to create a shuffle mask from a VSELECT condition mask.
11426static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11427 SDValue Cond) {
11428 EVT CondVT = Cond.getValueType();
11429 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11430 unsigned NumElts = CondVT.getVectorNumElements();
11431
11432 APInt UndefElts;
11433 SmallVector<APInt, 32> EltBits;
11434 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11435 true, false))
11436 return false;
11437
11438 Mask.resize(NumElts, SM_SentinelUndef);
11439
11440 for (int i = 0; i != (int)NumElts; ++i) {
11441 Mask[i] = i;
11442 // Arbitrarily choose from the 2nd operand if the select condition element
11443 // is undef.
11444 // TODO: Can we do better by matching patterns such as even/odd?
11445 if (UndefElts[i] || EltBits[i].isNullValue())
11446 Mask[i] += NumElts;
11447 }
11448
11449 return true;
11450}
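
// Editor's note: a standalone sketch of how a constant VSELECT condition is
// folded into a blend-style shuffle mask, as the helper above does. Plain
// bools replace getTargetConstantBitsFromNode and the names are illustrative.
#include <cassert>
#include <vector>

// true  => condition lane is all-ones, pick element i from the first operand;
// false => condition lane is zero (or undef), pick i + NumElts from the second.
static std::vector<int> maskFromCondition(const std::vector<bool> &Cond) {
  int NumElts = (int)Cond.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts;
  return Mask;
}

int main() {
  // vselect <1,0,0,1>, V1, V2  ==>  shuffle <0,5,6,3> of V1, V2.
  assert(maskFromCondition({true, false, false, true}) ==
         std::vector<int>({0, 5, 6, 3}));
  return 0;
}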
11451
11452// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11453// instructions.
11454static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11455 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11456 return false;
11457
11458 SmallVector<int, 8> Unpcklwd;
11459 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11460 /* Unary = */ false);
11461 SmallVector<int, 8> Unpckhwd;
11462 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11463 /* Unary = */ false);
11464 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11465 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11466 return IsUnpackwdMask;
11467}
11468
11469static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11470 // Create 128-bit vector type based on mask size.
11471 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11472 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11473
11474 // We can't assume a canonical shuffle mask, so try the commuted version too.
11475 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11476 ShuffleVectorSDNode::commuteMask(CommutedMask);
11477
11478 // Match any of unary/binary or low/high.
11479 for (unsigned i = 0; i != 4; ++i) {
11480 SmallVector<int, 16> UnpackMask;
11481 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11482 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11483 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11484 return true;
11485 }
11486 return false;
11487}
11488
11489/// Return true if a shuffle mask chooses elements identically in its top and
11490/// bottom halves. For example, any splat mask has the same top and bottom
11491/// halves. If an element is undefined in only one half of the mask, the halves
11492/// are not considered identical.
11493static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11494  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11495 unsigned HalfSize = Mask.size() / 2;
11496 for (unsigned i = 0; i != HalfSize; ++i) {
11497 if (Mask[i] != Mask[i + HalfSize])
11498 return false;
11499 }
11500 return true;
11501}
11502
11503/// Get a 4-lane 8-bit shuffle immediate for a mask.
11504///
11505/// This helper function produces an 8-bit shuffle immediate corresponding to
11506/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11507/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11508/// example.
11509///
11510/// NB: We rely heavily on "undef" masks preserving the input lane.
11511static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11512  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11513  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11514  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11515  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11516  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11517
11518 // If the mask only uses one non-undef element, then fully 'splat' it to
11519 // improve later broadcast matching.
11520 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11521  assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11522
11523 int FirstElt = Mask[FirstIndex];
11524 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11525 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11526
11527 unsigned Imm = 0;
11528 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11529 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11530 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11531 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11532 return Imm;
11533}
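
// Editor's note: the 2-bits-per-lane immediate encoding produced above,
// sketched with std::vector and without the splat fast path. The sample masks
// are for illustration only.
#include <cassert>
#include <vector>

static unsigned v4ShuffleImm(const std::vector<int> &Mask) {
  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; // undef lanes keep their position
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

int main() {
  // Reverse <3,2,1,0> encodes as 0b00011011 == 0x1B (the classic PSHUFD imm).
  assert(v4ShuffleImm({3, 2, 1, 0}) == 0x1B);
  // Identity <0,1,2,3> encodes as 0b11100100 == 0xE4.
  assert(v4ShuffleImm({0, 1, 2, 3}) == 0xE4);
  return 0;
}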
11534
11535static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11536 SelectionDAG &DAG) {
11537 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11538}
11539
11540// The shuffle result has the form:
11541// 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements appear in
11542// ascending order and each zeroed element corresponds to a Zeroable mask
11543// element, as described in the computeZeroableShuffleElements function.
11544//
11545// The function looks for a sub-mask whose non-zero elements are in increasing
11546// order. If such a sub-mask exists, the function returns true.
11547static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11548 ArrayRef<int> Mask, const EVT &VectorType,
11549 bool &IsZeroSideLeft) {
11550 int NextElement = -1;
11551 // Check if the Mask's nonzero elements are in increasing order.
11552 for (int i = 0, e = Mask.size(); i < e; i++) {
11553 // Checks if the mask's zeros elements are built from only zeros.
11554    assert(Mask[i] >= -1 && "Out of bound mask element!");
11555 if (Mask[i] < 0)
11556 return false;
11557 if (Zeroable[i])
11558 continue;
11559 // Find the lowest non zero element
11560 if (NextElement < 0) {
11561 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11562 IsZeroSideLeft = NextElement != 0;
11563 }
11564 // Exit if the mask's non zero elements are not in increasing order.
11565 if (NextElement != Mask[i])
11566 return false;
11567 NextElement++;
11568 }
11569 return true;
11570}
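
// Editor's note: a standalone sketch of the VEXPAND precondition checked
// above: after skipping zeroable slots, the remaining mask entries must count
// up contiguously, starting at either 0 or NumElts. std::vector<bool> stands
// in for the APInt Zeroable and the sample mask is illustrative.
#include <cassert>
#include <vector>

static bool nonZeroElementsInOrder(const std::vector<bool> &Zeroable,
                                   const std::vector<int> &Mask, int NumElts,
                                   bool &IsZeroSideLeft) {
  int Next = -1;
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    if (Next < 0) {
      Next = Mask[i] != 0 ? NumElts : 0; // must start at 0 or at NumElts
      IsZeroSideLeft = Next != 0;
    }
    if (Next != Mask[i])
      return false;
    ++Next;
  }
  return true;
}

int main() {
  // v4 result <zero, v2[0], zero, v2[1]>: non-zero entries 4,5 are ascending.
  bool ZeroSideLeft = false;
  assert(nonZeroElementsInOrder({true, false, true, false}, {0, 4, 0, 5}, 4,
                                ZeroSideLeft));
  assert(ZeroSideLeft);
  return 0;
}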
11571
11572/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11573static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11574 ArrayRef<int> Mask, SDValue V1,
11575 SDValue V2, const APInt &Zeroable,
11576 const X86Subtarget &Subtarget,
11577 SelectionDAG &DAG) {
11578 int Size = Mask.size();
11579 int LaneSize = 128 / VT.getScalarSizeInBits();
11580 const int NumBytes = VT.getSizeInBits() / 8;
11581 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11582
11583  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11584         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11585         (Subtarget.hasBWI() && VT.is512BitVector()));
11586
11587 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11588 // Sign bit set in i8 mask means zero element.
11589 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11590
11591 SDValue V;
11592 for (int i = 0; i < NumBytes; ++i) {
11593 int M = Mask[i / NumEltBytes];
11594 if (M < 0) {
11595 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11596 continue;
11597 }
11598 if (Zeroable[i / NumEltBytes]) {
11599 PSHUFBMask[i] = ZeroMask;
11600 continue;
11601 }
11602
11603 // We can only use a single input of V1 or V2.
11604 SDValue SrcV = (M >= Size ? V2 : V1);
11605 if (V && V != SrcV)
11606 return SDValue();
11607 V = SrcV;
11608 M %= Size;
11609
11610 // PSHUFB can't cross lanes, ensure this doesn't happen.
11611 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11612 return SDValue();
11613
11614 M = M % LaneSize;
11615 M = M * NumEltBytes + (i % NumEltBytes);
11616 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11617 }
11618  assert(V && "Failed to find a source input");
11619
11620 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11621 return DAG.getBitcast(
11622 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11623 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11624}
11625
11626static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11627 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11628 const SDLoc &dl);
11629
11630// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND.
11631static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11632 const APInt &Zeroable,
11633 ArrayRef<int> Mask, SDValue &V1,
11634 SDValue &V2, SelectionDAG &DAG,
11635 const X86Subtarget &Subtarget) {
11636 bool IsLeftZeroSide = true;
11637 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11638 IsLeftZeroSide))
11639 return SDValue();
11640 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11641 MVT IntegerType =
11642 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11643 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11644 unsigned NumElts = VT.getVectorNumElements();
11645  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11646         "Unexpected number of vector elements");
11647 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11648 Subtarget, DAG, DL);
11649 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11650 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11651 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11652}
11653
11654static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11655 unsigned &UnpackOpcode, bool IsUnary,
11656 ArrayRef<int> TargetMask, const SDLoc &DL,
11657 SelectionDAG &DAG,
11658 const X86Subtarget &Subtarget) {
11659 int NumElts = VT.getVectorNumElements();
11660
11661 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11662 for (int i = 0; i != NumElts; i += 2) {
11663 int M1 = TargetMask[i + 0];
11664 int M2 = TargetMask[i + 1];
11665 Undef1 &= (SM_SentinelUndef == M1);
11666 Undef2 &= (SM_SentinelUndef == M2);
11667 Zero1 &= isUndefOrZero(M1);
11668 Zero2 &= isUndefOrZero(M2);
11669 }
11670  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11671         "Zeroable shuffle detected");
11672
11673 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11674 SmallVector<int, 64> Unpckl, Unpckh;
11675 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11676 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11677 (IsUnary ? V1 : V2))) {
11678 UnpackOpcode = X86ISD::UNPCKL;
11679 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11680 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11681 return true;
11682 }
11683
11684 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11685 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11686 (IsUnary ? V1 : V2))) {
11687 UnpackOpcode = X86ISD::UNPCKH;
11688 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11689 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11690 return true;
11691 }
11692
11693 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11694 if (IsUnary && (Zero1 || Zero2)) {
11695 // Don't bother if we can blend instead.
11696 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11697 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11698 return false;
11699
11700 bool MatchLo = true, MatchHi = true;
11701 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11702 int M = TargetMask[i];
11703
11704 // Ignore if the input is known to be zero or the index is undef.
11705 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11706 (M == SM_SentinelUndef))
11707 continue;
11708
11709 MatchLo &= (M == Unpckl[i]);
11710 MatchHi &= (M == Unpckh[i]);
11711 }
11712
11713 if (MatchLo || MatchHi) {
11714 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11715 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11716 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11717 return true;
11718 }
11719 }
11720
11721 // If a binary shuffle, commute and try again.
11722 if (!IsUnary) {
11723 ShuffleVectorSDNode::commuteMask(Unpckl);
11724 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11725 UnpackOpcode = X86ISD::UNPCKL;
11726 std::swap(V1, V2);
11727 return true;
11728 }
11729
11730 ShuffleVectorSDNode::commuteMask(Unpckh);
11731 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11732 UnpackOpcode = X86ISD::UNPCKH;
11733 std::swap(V1, V2);
11734 return true;
11735 }
11736 }
11737
11738 return false;
11739}
11740
11741// X86 has dedicated unpack instructions that can handle specific blend
11742// operations: UNPCKH and UNPCKL.
11743static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11744 ArrayRef<int> Mask, SDValue V1, SDValue V2,
11745 SelectionDAG &DAG) {
11746 SmallVector<int, 8> Unpckl;
11747 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11748 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11749 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11750
11751 SmallVector<int, 8> Unpckh;
11752 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11753 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11754 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11755
11756 // Commute and try again.
11757 ShuffleVectorSDNode::commuteMask(Unpckl);
11758 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11759 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11760
11761 ShuffleVectorSDNode::commuteMask(Unpckh);
11762 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11763 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11764
11765 return SDValue();
11766}
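
// Editor's note: the unpack masks matched above, reconstructed with plain
// loops so the expected patterns are visible. For a binary unpack, UNPCKL
// interleaves the low half of each 128-bit lane of both sources and UNPCKH the
// high half. The helper name makeUnpackMask is illustrative only.
#include <cassert>
#include <vector>

static std::vector<int> makeUnpackMask(int NumElts, int EltsPerLane, bool Lo) {
  std::vector<int> Mask;
  for (int Lane = 0; Lane != NumElts / EltsPerLane; ++Lane) {
    int Base = Lane * EltsPerLane + (Lo ? 0 : EltsPerLane / 2);
    for (int i = 0; i != EltsPerLane / 2; ++i) {
      Mask.push_back(Base + i);           // element from V1
      Mask.push_back(Base + i + NumElts); // matching element from V2
    }
  }
  return Mask;
}

int main() {
  // v4i32: UNPCKL = <0,4,1,5>, UNPCKH = <2,6,3,7>.
  assert(makeUnpackMask(4, 4, true) == std::vector<int>({0, 4, 1, 5}));
  assert(makeUnpackMask(4, 4, false) == std::vector<int>({2, 6, 3, 7}));
  // v8i32 stays sectored per 128-bit lane: UNPCKL = <0,8,1,9, 4,12,5,13>.
  assert(makeUnpackMask(8, 4, true) ==
         std::vector<int>({0, 8, 1, 9, 4, 12, 5, 13}));
  return 0;
}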
11767
11768/// Check if the mask can be mapped to a preliminary shuffle of 64-bit elements
11769/// (vperm) followed by a 256-bit unpack.
11770static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11771 ArrayRef<int> Mask, SDValue V1,
11772 SDValue V2, SelectionDAG &DAG) {
11773 SmallVector<int, 32> Unpckl, Unpckh;
11774 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11775 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11776
11777 unsigned UnpackOpcode;
11778 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11779 UnpackOpcode = X86ISD::UNPCKL;
11780 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11781 UnpackOpcode = X86ISD::UNPCKH;
11782 else
11783 return SDValue();
11784
11785 // This is a "natural" unpack operation (rather than the 128-bit sectored
11786 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11787 // input in order to use the x86 instruction.
11788 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11789 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11790 V1 = DAG.getBitcast(VT, V1);
11791 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11792}
11793
11794// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11795// source into the lower elements and zeroing the upper elements.
11796static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11797 ArrayRef<int> Mask, const APInt &Zeroable,
11798 const X86Subtarget &Subtarget) {
11799 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11800 return false;
11801
11802 unsigned NumElts = Mask.size();
11803 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11804 unsigned MaxScale = 64 / EltSizeInBits;
11805
11806 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11807 unsigned SrcEltBits = EltSizeInBits * Scale;
11808 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11809 continue;
11810 unsigned NumSrcElts = NumElts / Scale;
11811 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11812 continue;
11813 unsigned UpperElts = NumElts - NumSrcElts;
11814 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11815 continue;
11816 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11817 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11818 DstVT = MVT::getIntegerVT(EltSizeInBits);
11819 if ((NumSrcElts * EltSizeInBits) >= 128) {
11820 // ISD::TRUNCATE
11821 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11822 } else {
11823 // X86ISD::VTRUNC
11824 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11825 }
11826 return true;
11827 }
11828
11829 return false;
11830}
11831
11832// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11833// element padding to the final DstVT.
11834static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11835 const X86Subtarget &Subtarget,
11836 SelectionDAG &DAG, bool ZeroUppers) {
11837 MVT SrcVT = Src.getSimpleValueType();
11838 MVT DstSVT = DstVT.getScalarType();
11839 unsigned NumDstElts = DstVT.getVectorNumElements();
11840 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11841 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11842
11843 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11844 return SDValue();
11845
11846 // Perform a direct ISD::TRUNCATE if possible.
11847 if (NumSrcElts == NumDstElts)
11848 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11849
11850 if (NumSrcElts > NumDstElts) {
11851 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11852 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11853 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11854 }
11855
11856 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11857 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11858 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11859 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11860 DstVT.getSizeInBits());
11861 }
11862
11863 // Non-VLX targets must truncate from a 512-bit type, so we need to
11864 // widen, truncate and then possibly extract the original subvector.
11865 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11866 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11867 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11868 }
11869
11870 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11871 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11872 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11873 if (DstVT != TruncVT)
11874 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11875 DstVT.getSizeInBits());
11876 return Trunc;
11877}
11878
11879// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11880//
11881// An example is the following:
11882//
11883// t0: ch = EntryToken
11884// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11885// t25: v4i32 = truncate t2
11886// t41: v8i16 = bitcast t25
11887// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11888// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11889// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11890// t18: v2i64 = bitcast t51
11891//
11892// One can just use a single vpmovdw instruction. Without avx512vl we need to
11893// use the zmm variant and extract the lower subvector, padding with zeroes.
11894// TODO: Merge with lowerShuffleAsVTRUNC.
11895static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11896 SDValue V2, ArrayRef<int> Mask,
11897 const APInt &Zeroable,
11898 const X86Subtarget &Subtarget,
11899 SelectionDAG &DAG) {
11900  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11901 if (!Subtarget.hasAVX512())
11902 return SDValue();
11903
11904 unsigned NumElts = VT.getVectorNumElements();
11905 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11906 unsigned MaxScale = 64 / EltSizeInBits;
11907 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11908 unsigned NumSrcElts = NumElts / Scale;
11909 unsigned UpperElts = NumElts - NumSrcElts;
11910 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11911 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11912 continue;
11913
11914 SDValue Src = V1;
11915 if (!Src.hasOneUse())
11916 return SDValue();
11917
11918 Src = peekThroughOneUseBitcasts(Src);
11919 if (Src.getOpcode() != ISD::TRUNCATE ||
11920 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11921 return SDValue();
11922 Src = Src.getOperand(0);
11923
11924 // VPMOVWB is only available with avx512bw.
11925 MVT SrcVT = Src.getSimpleValueType();
11926 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11927 !Subtarget.hasBWI())
11928 return SDValue();
11929
11930 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11931 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11932 }
11933
11934 return SDValue();
11935}
11936
11937// Attempt to match binary shuffle patterns as a truncate.
11938static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11939 SDValue V2, ArrayRef<int> Mask,
11940 const APInt &Zeroable,
11941 const X86Subtarget &Subtarget,
11942 SelectionDAG &DAG) {
11943  assert((VT.is128BitVector() || VT.is256BitVector()) &&
11944         "Unexpected VTRUNC type");
11945 if (!Subtarget.hasAVX512())
11946 return SDValue();
11947
11948 unsigned NumElts = VT.getVectorNumElements();
11949 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11950 unsigned MaxScale = 64 / EltSizeInBits;
11951 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11952 // TODO: Support non-BWI VPMOVWB truncations?
11953 unsigned SrcEltBits = EltSizeInBits * Scale;
11954 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11955 continue;
11956
11957 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11958 // Bail if the V2 elements are undef.
11959 unsigned NumHalfSrcElts = NumElts / Scale;
11960 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11961 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11962 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11963 continue;
11964
11965 // The elements beyond the truncation must be undef/zero.
11966 unsigned UpperElts = NumElts - NumSrcElts;
11967 if (UpperElts > 0 &&
11968 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11969 continue;
11970 bool UndefUppers =
11971 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11972
11973 // As we're using both sources then we need to concat them together
11974 // and truncate from the double-sized src.
11975 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11976 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11977
11978 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11979 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11980 Src = DAG.getBitcast(SrcVT, Src);
11981 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11982 }
11983
11984 return SDValue();
11985}
11986
11987/// Check whether a compaction lowering can be done by dropping even
11988/// elements and compute how many times even elements must be dropped.
11989///
11990/// This handles shuffles which take every Nth element where N is a power of
11991/// two. Example shuffle masks:
11992///
11993/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11994/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11995/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11996/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11997/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11998/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11999///
12000/// Any of these lanes can of course be undef.
12001///
12002/// This routine only supports N <= 3.
12003/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12004/// for larger N.
12005///
12006/// \returns N above, or the number of times even elements must be dropped if
12007/// there is such a number. Otherwise returns zero.
12008static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12009 bool IsSingleInput) {
12010 // The modulus for the shuffle vector entries is based on whether this is
12011 // a single input or not.
12012 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12013  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12014         "We should only be called with masks with a power-of-2 size!");
12015
12016 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12017
12018 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12019 // and 2^3 simultaneously. This is because we may have ambiguity with
12020 // partially undef inputs.
12021 bool ViableForN[3] = {true, true, true};
12022
12023 for (int i = 0, e = Mask.size(); i < e; ++i) {
12024 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12025 // want.
12026 if (Mask[i] < 0)
12027 continue;
12028
12029 bool IsAnyViable = false;
12030 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12031 if (ViableForN[j]) {
12032 uint64_t N = j + 1;
12033
12034 // The shuffle mask must be equal to (i * 2^N) % M.
12035 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12036 IsAnyViable = true;
12037 else
12038 ViableForN[j] = false;
12039 }
12040 // Early exit if we exhaust the possible powers of two.
12041 if (!IsAnyViable)
12042 break;
12043 }
12044
12045 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12046 if (ViableForN[j])
12047 return j + 1;
12048
12049 // Return 0 as there is no viable power of two.
12050 return 0;
12051}
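
// Editor's note: a standalone version of the stride detection above, using the
// N = 1 and N = 2 sample masks from the doc comment. The helper name is
// illustrative; the early-exit bookkeeping of the original is omitted.
#include <cassert>
#include <cstdint>
#include <vector>

static int droppingEvenElements(const std::vector<int> &Mask,
                                bool IsSingleInput) {
  uint64_t ModMask = Mask.size() * (IsSingleInput ? 1 : 2) - 1;
  bool Viable[3] = {true, true, true}; // strides 2^1, 2^2, 2^3
  for (size_t i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes match any stride
    for (unsigned j = 0; j != 3; ++j)
      if (Viable[j] &&
          (uint64_t)Mask[i] != (((uint64_t)i << (j + 1)) & ModMask))
        Viable[j] = false;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0; // no viable power of two
}

int main() {
  // Every 2nd element of a single 16-element input: N = 1.
  std::vector<int> N1 = {0, 2, 4, 6, 8, 10, 12, 14,
                         0, 2, 4, 6, 8, 10, 12, 14};
  assert(droppingEvenElements(N1, /*IsSingleInput=*/true) == 1);
  // Every 4th element across both inputs: N = 2.
  std::vector<int> N2 = {0, 4, 8, 12, 16, 20, 24, 28,
                         0, 4, 8, 12, 16, 20, 24, 28};
  assert(droppingEvenElements(N2, /*IsSingleInput=*/false) == 2);
  return 0;
}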
12052
12053// X86 has dedicated pack instructions that can handle specific truncation
12054// operations: PACKSS and PACKUS.
12055// Checks for compaction shuffle masks if MaxStages > 1.
12056// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12057static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12058 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12059 const SelectionDAG &DAG,
12060 const X86Subtarget &Subtarget,
12061 unsigned MaxStages = 1) {
12062 unsigned NumElts = VT.getVectorNumElements();
12063 unsigned BitSize = VT.getScalarSizeInBits();
12064  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12065         "Illegal maximum compaction");
12066
12067 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12068 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12069 unsigned NumPackedBits = NumSrcBits - BitSize;
12070 N1 = peekThroughBitcasts(N1);
12071 N2 = peekThroughBitcasts(N2);
12072 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12073 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12074 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12075 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12076 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12077 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12078 return false;
12079 if (Subtarget.hasSSE41() || BitSize == 8) {
12080 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12081 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12082 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12083 V1 = N1;
12084 V2 = N2;
12085 SrcVT = PackVT;
12086 PackOpcode = X86ISD::PACKUS;
12087 return true;
12088 }
12089 }
12090 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12091 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12092 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12093 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12094 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12095 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12096 V1 = N1;
12097 V2 = N2;
12098 SrcVT = PackVT;
12099 PackOpcode = X86ISD::PACKSS;
12100 return true;
12101 }
12102 return false;
12103 };
12104
12105 // Attempt to match against wider and wider compaction patterns.
12106 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12107 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12108 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12109
12110 // Try binary shuffle.
12111 SmallVector<int, 32> BinaryMask;
12112 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12113 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
12114 if (MatchPACK(V1, V2, PackVT))
12115 return true;
12116
12117 // Try unary shuffle.
12118 SmallVector<int, 32> UnaryMask;
12119 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12120 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
12121 if (MatchPACK(V1, V1, PackVT))
12122 return true;
12123 }
12124
12125 return false;
12126}
12127
12128static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12129 SDValue V1, SDValue V2, SelectionDAG &DAG,
12130 const X86Subtarget &Subtarget) {
12131 MVT PackVT;
12132 unsigned PackOpcode;
12133 unsigned SizeBits = VT.getSizeInBits();
12134 unsigned EltBits = VT.getScalarSizeInBits();
12135 unsigned MaxStages = Log2_32(64 / EltBits);
12136 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12137 Subtarget, MaxStages))
12138 return SDValue();
12139
12140 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12141 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12142
12143 // Don't lower multi-stage packs on AVX512, truncation is better.
12144 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12145 return SDValue();
12146
12147 // Pack to the largest type possible:
12148 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12149 unsigned MaxPackBits = 16;
12150 if (CurrentEltBits > 16 &&
12151 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12152 MaxPackBits = 32;
12153
12154 // Repeatedly pack down to the target size.
12155 SDValue Res;
12156 for (unsigned i = 0; i != NumStages; ++i) {
12157 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12158 unsigned NumSrcElts = SizeBits / SrcEltBits;
12159 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12160 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12161 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12162 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12163 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12164 DAG.getBitcast(SrcVT, V2));
12165 V1 = V2 = Res;
12166 CurrentEltBits /= 2;
12167 }
12168  assert(Res && Res.getValueType() == VT &&
12169         "Failed to lower compaction shuffle");
12170 return Res;
12171}
12172
12173/// Try to emit a bitmask instruction for a shuffle.
12174///
12175/// This handles cases where we can model a blend exactly as a bitmask due to
12176/// one of the inputs being zeroable.
12177static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12178 SDValue V2, ArrayRef<int> Mask,
12179 const APInt &Zeroable,
12180 const X86Subtarget &Subtarget,
12181 SelectionDAG &DAG) {
12182 MVT MaskVT = VT;
12183 MVT EltVT = VT.getVectorElementType();
12184 SDValue Zero, AllOnes;
12185 // Use f64 if i64 isn't legal.
12186 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12187 EltVT = MVT::f64;
12188 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12189 }
12190
12191 MVT LogicVT = VT;
12192 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12193 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12194 APFloat AllOnesValue = APFloat::getAllOnesValue(
12195 SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
12196 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12197 LogicVT =
12198 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12199 } else {
12200 Zero = DAG.getConstant(0, DL, EltVT);
12201 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12202 }
12203
12204 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12205 SDValue V;
12206 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12207 if (Zeroable[i])
12208 continue;
12209 if (Mask[i] % Size != i)
12210 return SDValue(); // Not a blend.
12211 if (!V)
12212 V = Mask[i] < Size ? V1 : V2;
12213 else if (V != (Mask[i] < Size ? V1 : V2))
12214 return SDValue(); // Can only let one input through the mask.
12215
12216 VMaskOps[i] = AllOnes;
12217 }
12218 if (!V)
12219 return SDValue(); // No non-zeroable elements!
12220
12221 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12222 VMask = DAG.getBitcast(LogicVT, VMask);
12223 V = DAG.getBitcast(LogicVT, V);
12224 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12225 return DAG.getBitcast(VT, And);
12226}
12227
12228/// Try to emit a blend instruction for a shuffle using bit math.
12229///
12230/// This is used as a fallback approach when first class blend instructions are
12231/// unavailable. Currently it is only suitable for integer vectors, but could
12232/// be generalized for floating point vectors if desirable.
12233static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12234 SDValue V2, ArrayRef<int> Mask,
12235 SelectionDAG &DAG) {
12236  assert(VT.isInteger() && "Only supports integer vector types!");
12237 MVT EltVT = VT.getVectorElementType();
12238 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12239 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12240 SmallVector<SDValue, 16> MaskOps;
12241 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12242 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12243 return SDValue(); // Shuffled input!
12244 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12245 }
12246
12247 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12248 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12249 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12250 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12251}
12252
12253static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12254 SDValue PreservedSrc,
12255 const X86Subtarget &Subtarget,
12256 SelectionDAG &DAG);
12257
12258static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12259 MutableArrayRef<int> Mask,
12260 const APInt &Zeroable, bool &ForceV1Zero,
12261 bool &ForceV2Zero, uint64_t &BlendMask) {
12262 bool V1IsZeroOrUndef =
12263 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12264 bool V2IsZeroOrUndef =
12265 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12266
12267 BlendMask = 0;
12268 ForceV1Zero = false, ForceV2Zero = false;
12269  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12270
12271 // Attempt to generate the binary blend mask. If an input is zero then
12272 // we can use any lane.
12273 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12274 int M = Mask[i];
12275 if (M == SM_SentinelUndef)
12276 continue;
12277 if (M == i ||
12278 (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12279 Mask[i] = i;
12280 continue;
12281 }
12282 if (M == (i + Size) ||
12283 (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12284 BlendMask |= 1ull << i;
12285 Mask[i] = i + Size;
12286 continue;
12287 }
12288 if (Zeroable[i]) {
12289 if (V1IsZeroOrUndef) {
12290 ForceV1Zero = true;
12291 Mask[i] = i;
12292 continue;
12293 }
12294 if (V2IsZeroOrUndef) {
12295 ForceV2Zero = true;
12296 BlendMask |= 1ull << i;
12297 Mask[i] = i + Size;
12298 continue;
12299 }
12300 }
12301 return false;
12302 }
12303 return true;
12304}
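
// Illustrative sketch (not part of the original source): the matcher above
// encodes a blend as one bit per lane, set when the lane reads from V2. A
// simplified standalone model that ignores the zeroable and
// element-equivalence cases; blendMaskModel is a hypothetical name.
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t blendMaskModel(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  uint64_t BlendMask = 0;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size) // lane i comes from V2
      BlendMask |= 1ull << i;
  return BlendMask;
}

static void blendMaskExample() {
  // v4 shuffle <0, 5, 2, 7>: lanes 1 and 3 come from V2.
  assert(blendMaskModel({0, 5, 2, 7}) == 0b1010);
}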
12305
12306static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12307 int Scale) {
12308 uint64_t ScaledMask = 0;
12309 for (int i = 0; i != Size; ++i)
12310 if (BlendMask & (1ull << i))
12311 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12312 return ScaledMask;
12313}
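
// Illustrative sketch (not part of the original source): a standalone copy of
// the scaling rule above with two worked checks. Each selected bit i expands
// into Scale consecutive set bits starting at bit i * Scale, which is how a
// narrow blend mask is widened to drive a blend with smaller elements.
#include <cassert>
#include <cstdint>

static uint64_t scaleBlendMaskModel(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

static void scaleBlendMaskExamples() {
  assert(scaleBlendMaskModel(0b0101, /*Size=*/4, /*Scale=*/2) == 0b00110011);
  assert(scaleBlendMaskModel(0b1000, /*Size=*/4, /*Scale=*/4) == 0xF000);
}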
12314
12315/// Try to emit a blend instruction for a shuffle.
12316///
12317/// This doesn't do any checks for the availability of instructions for blending
12318/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12319/// be matched in the backend with the type given. What it does check for is
12320/// that the shuffle mask is a blend, or convertible into a blend with zero.
12321static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12322 SDValue V2, ArrayRef<int> Original,
12323 const APInt &Zeroable,
12324 const X86Subtarget &Subtarget,
12325 SelectionDAG &DAG) {
12326 uint64_t BlendMask = 0;
12327 bool ForceV1Zero = false, ForceV2Zero = false;
12328 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12329 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12330 BlendMask))
12331 return SDValue();
12332
12333 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12334 if (ForceV1Zero)
12335 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12336 if (ForceV2Zero)
12337 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12338
12339 switch (VT.SimpleTy) {
12340 case MVT::v4i64:
12341 case MVT::v8i32:
12342 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12343 LLVM_FALLTHROUGH;
12344 case MVT::v4f64:
12345 case MVT::v8f32:
12346 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12347 LLVM_FALLTHROUGH;
12348 case MVT::v2f64:
12349 case MVT::v2i64:
12350 case MVT::v4f32:
12351 case MVT::v4i32:
12352 case MVT::v8i16:
12353 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12354 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12355 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12356 case MVT::v16i16: {
12357 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12358 SmallVector<int, 8> RepeatedMask;
12359 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12360 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12361 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12362 BlendMask = 0;
12363 for (int i = 0; i < 8; ++i)
12364 if (RepeatedMask[i] >= 8)
12365 BlendMask |= 1ull << i;
12366 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12367 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12368 }
12369 // Use PBLENDW for lower/upper lanes and then blend lanes.
12370 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12371 // merge to VSELECT where useful.
12372 uint64_t LoMask = BlendMask & 0xFF;
12373 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12374 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12375 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12376 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12377 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12378 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12379 return DAG.getVectorShuffle(
12380 MVT::v16i16, DL, Lo, Hi,
12381 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12382 }
12383 LLVM_FALLTHROUGH;
12384 }
12385 case MVT::v32i8:
12386 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12387 LLVM_FALLTHROUGH;
12388 case MVT::v16i8: {
12389 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12390
12391 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12392 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12393 Subtarget, DAG))
12394 return Masked;
12395
12396 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12397 MVT IntegerType =
12398 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12399 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12400 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12401 }
12402
12403 // If we have VPTERNLOG, we can use that as a bit blend.
12404 if (Subtarget.hasVLX())
12405 if (SDValue BitBlend =
12406 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12407 return BitBlend;
12408
12409 // Scale the blend by the number of bytes per element.
12410 int Scale = VT.getScalarSizeInBits() / 8;
12411
12412 // This form of blend is always done on bytes. Compute the byte vector
12413 // type.
12414 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12415
12416 // x86 allows load folding with blendvb from the 2nd source operand. But
12417 // we are still using LLVM select here (see comment below), so that's V1.
12418 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12419 // allow that load-folding possibility.
12420 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12421 ShuffleVectorSDNode::commuteMask(Mask);
12422 std::swap(V1, V2);
12423 }
12424
12425 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12426 // mix of LLVM's code generator and the x86 backend. We tell the code
12427 // generator that boolean values in the elements of an x86 vector register
12428 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12429 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12430 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12431 // of the element (the remaining are ignored) and 0 in that high bit would
12432 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12433 // the LLVM model for boolean values in vector elements gets the relevant
12434 // bit set, it is set backwards and over constrained relative to x86's
12435 // actual model.
12436 SmallVector<SDValue, 32> VSELECTMask;
12437 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12438 for (int j = 0; j < Scale; ++j)
12439 VSELECTMask.push_back(
12440 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12441 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12442 MVT::i8));
12443
12444 V1 = DAG.getBitcast(BlendVT, V1);
12445 V2 = DAG.getBitcast(BlendVT, V2);
12446 return DAG.getBitcast(
12447 VT,
12448 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12449 V1, V2));
12450 }
12451 case MVT::v16f32:
12452 case MVT::v8f64:
12453 case MVT::v8i64:
12454 case MVT::v16i32:
12455 case MVT::v32i16:
12456 case MVT::v64i8: {
12457 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12458 bool OptForSize = DAG.shouldOptForSize();
12459 if (!OptForSize) {
12460 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12461 Subtarget, DAG))
12462 return Masked;
12463 }
12464
12465 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12466 // masked move.
12467 MVT IntegerType =
12468 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12469 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12470 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12471 }
12472 default:
12473 llvm_unreachable("Not a supported integer vector type!");
12474 }
12475}
12476
12477/// Try to lower as a blend of elements from two inputs followed by
12478/// a single-input permutation.
12479///
12480/// This matches the pattern where we can blend elements from two inputs and
12481/// then reduce the shuffle to a single-input permutation.
12482static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12483 SDValue V1, SDValue V2,
12484 ArrayRef<int> Mask,
12485 SelectionDAG &DAG,
12486 bool ImmBlends = false) {
12487 // We build up the blend mask while checking whether a blend is a viable way
12488 // to reduce the shuffle.
12489 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12490 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12491
12492 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12493 if (Mask[i] < 0)
12494 continue;
12495
12496 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12497
12498 if (BlendMask[Mask[i] % Size] < 0)
12499 BlendMask[Mask[i] % Size] = Mask[i];
12500 else if (BlendMask[Mask[i] % Size] != Mask[i])
12501 return SDValue(); // Can't blend in the needed input!
12502
12503 PermuteMask[i] = Mask[i] % Size;
12504 }
12505
12506 // If only immediate blends, then bail if the blend mask can't be widened to
12507 // i16.
12508 unsigned EltSize = VT.getScalarSizeInBits();
12509 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12510 return SDValue();
12511
12512 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12513 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12514}
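
// Illustrative sketch (not part of the original source): the decomposition
// above worked through on plain indices. For the two-input v4 shuffle
// <2, 7, 0, 5>, each requested element is first brought into its home lane by
// a lane-aligned blend and then moved to its final position by a
// single-input permute.
#include <array>
#include <cassert>

static void blendAndPermuteExample() {
  const std::array<int, 4> Mask = {2, 7, 0, 5};
  std::array<int, 4> BlendMask = {-1, -1, -1, -1};
  std::array<int, 4> PermuteMask = {-1, -1, -1, -1};
  for (int i = 0; i < 4; ++i) {
    BlendMask[Mask[i] % 4] = Mask[i]; // element kept at its home lane
    PermuteMask[i] = Mask[i] % 4;     // where the final shuffle reads it
  }
  assert((BlendMask == std::array<int, 4>{0, 5, 2, 7}));   // per-lane blend
  assert((PermuteMask == std::array<int, 4>{2, 3, 0, 1})); // in-place permute
}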
12515
12516/// Try to lower as an unpack of elements from two inputs followed by
12517/// a single-input permutation.
12518///
12519/// This matches the pattern where we can unpack elements from two inputs and
12520/// then reduce the shuffle to a single-input (wider) permutation.
12521static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12522 SDValue V1, SDValue V2,
12523 ArrayRef<int> Mask,
12524 SelectionDAG &DAG) {
12525 int NumElts = Mask.size();
12526 int NumLanes = VT.getSizeInBits() / 128;
12527 int NumLaneElts = NumElts / NumLanes;
12528 int NumHalfLaneElts = NumLaneElts / 2;
12529
12530 bool MatchLo = true, MatchHi = true;
12531 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12532
12533 // Determine UNPCKL/UNPCKH type and operand order.
12534 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12535 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12536 int M = Mask[Lane + Elt];
12537 if (M < 0)
12538 continue;
12539
12540 SDValue &Op = Ops[Elt & 1];
12541 if (M < NumElts && (Op.isUndef() || Op == V1))
12542 Op = V1;
12543 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12544 Op = V2;
12545 else
12546 return SDValue();
12547
12548 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12549 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12550 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12551 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12552 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12553 if (!MatchLo && !MatchHi)
12554 return SDValue();
12555 }
12556 }
12557 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12558
12559 // Now check that each pair of elts comes from the same unpack pair
12560 // and set the permute mask based on each pair.
12561 // TODO - Investigate cases where we permute individual elements.
12562 SmallVector<int, 32> PermuteMask(NumElts, -1);
12563 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12564 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12565 int M0 = Mask[Lane + Elt + 0];
12566 int M1 = Mask[Lane + Elt + 1];
12567 if (0 <= M0 && 0 <= M1 &&
12568 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12569 return SDValue();
12570 if (0 <= M0)
12571 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12572 if (0 <= M1)
12573 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12574 }
12575 }
12576
12577 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12578 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12579 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12580}
12581
12582/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12583/// permuting the elements of the result in place.
12584static SDValue lowerShuffleAsByteRotateAndPermute(
12585 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12586 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12587 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12588 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12589 (VT.is512BitVector() && !Subtarget.hasBWI()))
12590 return SDValue();
12591
12592 // We don't currently support lane crossing permutes.
12593 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12594 return SDValue();
12595
12596 int Scale = VT.getScalarSizeInBits() / 8;
12597 int NumLanes = VT.getSizeInBits() / 128;
12598 int NumElts = VT.getVectorNumElements();
12599 int NumEltsPerLane = NumElts / NumLanes;
12600
12601 // Determine range of mask elts.
12602 bool Blend1 = true;
12603 bool Blend2 = true;
12604 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12605 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12606 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12607 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12608 int M = Mask[Lane + Elt];
12609 if (M < 0)
12610 continue;
12611 if (M < NumElts) {
12612 Blend1 &= (M == (Lane + Elt));
12613 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12614 M = M % NumEltsPerLane;
12615 Range1.first = std::min(Range1.first, M);
12616 Range1.second = std::max(Range1.second, M);
12617 } else {
12618 M -= NumElts;
12619 Blend2 &= (M == (Lane + Elt));
12620 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12621 M = M % NumEltsPerLane;
12622 Range2.first = std::min(Range2.first, M);
12623 Range2.second = std::max(Range2.second, M);
12624 }
12625 }
12626 }
12627
12628 // Bail if we don't need both elements.
12629 // TODO - it might be worth doing this for unary shuffles if the permute
12630 // can be widened.
12631 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12632 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12633 return SDValue();
12634
12635 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12636 return SDValue();
12637
12638 // Rotate the 2 ops so we can access both ranges, then permute the result.
12639 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12640 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12641 SDValue Rotate = DAG.getBitcast(
12642 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12643 DAG.getBitcast(ByteVT, Lo),
12644 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12645 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12646 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12647 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12648 int M = Mask[Lane + Elt];
12649 if (M < 0)
12650 continue;
12651 if (M < NumElts)
12652 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12653 else
12654 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12655 }
12656 }
12657 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12658 };
12659
12660 // Check if the ranges are small enough to rotate from either direction.
12661 if (Range2.second < Range1.first)
12662 return RotateAndPermute(V1, V2, Range1.first, 0);
12663 if (Range1.second < Range2.first)
12664 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12665 return SDValue();
12666}
12667
12668/// Generic routine to decompose a shuffle and blend into independent
12669/// blends and permutes.
12670///
12671/// This matches the extremely common pattern for handling combined
12672/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12673/// operations. It will try to pick the best arrangement of shuffles and
12674/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12675static SDValue lowerShuffleAsDecomposedShuffleMerge(
12676 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12677 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12678 int NumElts = Mask.size();
12679 int NumLanes = VT.getSizeInBits() / 128;
12680 int NumEltsPerLane = NumElts / NumLanes;
12681
12682 // Shuffle the input elements into the desired positions in V1 and V2 and
12683 // unpack/blend them together.
12684 bool IsAlternating = true;
12685 SmallVector<int, 32> V1Mask(NumElts, -1);
12686 SmallVector<int, 32> V2Mask(NumElts, -1);
12687 SmallVector<int, 32> FinalMask(NumElts, -1);
12688 for (int i = 0; i < NumElts; ++i) {
12689 int M = Mask[i];
12690 if (M >= 0 && M < NumElts) {
12691 V1Mask[i] = M;
12692 FinalMask[i] = i;
12693 IsAlternating &= (i & 1) == 0;
12694 } else if (M >= NumElts) {
12695 V2Mask[i] = M - NumElts;
12696 FinalMask[i] = i + NumElts;
12697 IsAlternating &= (i & 1) == 1;
12698 }
12699 }
12700
12701 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12702 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12703 // the shuffle may be able to fold with a load or other benefit. However, when
12704 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12705 // pre-shuffle first is a better strategy.
12706 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12707 // Only prefer immediate blends to unpack/rotate.
12708 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12709 DAG, true))
12710 return BlendPerm;
12711 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12712 DAG))
12713 return UnpackPerm;
12714 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12715 DL, VT, V1, V2, Mask, Subtarget, DAG))
12716 return RotatePerm;
12717 // Unpack/rotate failed - try again with variable blends.
12718 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12719 DAG))
12720 return BlendPerm;
12721 }
12722
12723 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12724 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12725 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12726 // than half the elements coming from each source.
12727 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12728 V1Mask.assign(NumElts, -1);
12729 V2Mask.assign(NumElts, -1);
12730 FinalMask.assign(NumElts, -1);
12731 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12732 for (int j = 0; j != NumEltsPerLane; ++j) {
12733 int M = Mask[i + j];
12734 if (M >= 0 && M < NumElts) {
12735 V1Mask[i + (j / 2)] = M;
12736 FinalMask[i + j] = i + (j / 2);
12737 } else if (M >= NumElts) {
12738 V2Mask[i + (j / 2)] = M - NumElts;
12739 FinalMask[i + j] = i + (j / 2) + NumElts;
12740 }
12741 }
12742 }
12743
12744 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12745 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12746 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12747}
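
// Illustrative sketch (not part of the original source): the decomposition
// above on plain indices. For the two-input v4 shuffle <5, 0, 7, 2>, each
// input is first permuted so its elements land in their final positions, and
// the results are then combined with a lane-aligned blend.
#include <array>
#include <cassert>

static void decomposedShuffleMergeExample() {
  const std::array<int, 4> Mask = {5, 0, 7, 2};
  std::array<int, 4> V1Mask = {-1, -1, -1, -1};
  std::array<int, 4> V2Mask = {-1, -1, -1, -1};
  std::array<int, 4> FinalMask = {-1, -1, -1, -1};
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i];
    if (M < 4) {
      V1Mask[i] = M;        // single-input permute of V1
      FinalMask[i] = i;     // blend keeps the V1 lane
    } else {
      V2Mask[i] = M - 4;    // single-input permute of V2
      FinalMask[i] = i + 4; // blend keeps the V2 lane
    }
  }
  assert((V1Mask == std::array<int, 4>{-1, 0, -1, 2}));
  assert((V2Mask == std::array<int, 4>{1, -1, 3, -1}));
  assert((FinalMask == std::array<int, 4>{4, 1, 6, 3})); // pure blend
}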
12748
12749/// Try to lower a vector shuffle as a bit rotation.
12750///
12751/// Look for a repeated rotation pattern in each sub group.
12752 /// Returns an ISD::ROTL element rotation amount, or -1 on failure.
12753static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12754 int NumElts = Mask.size();
12755 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12756
12757 int RotateAmt = -1;
12758 for (int i = 0; i != NumElts; i += NumSubElts) {
12759 for (int j = 0; j != NumSubElts; ++j) {
12760 int M = Mask[i + j];
12761 if (M < 0)
12762 continue;
12763 if (!isInRange(M, i, i + NumSubElts))
12764 return -1;
12765 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12766 if (0 <= RotateAmt && Offset != RotateAmt)
12767 return -1;
12768 RotateAmt = Offset;
12769 }
12770 }
12771 return RotateAmt;
12772}
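
// Illustrative sketch (not part of the original source): a standalone model
// of the matcher above, fixed to 8 mask elements for brevity. Every sub-group
// must apply the same element rotation; the return value is the ISD::ROTL
// amount in elements, or -1 when the mask is not a rotation.
#include <array>
#include <cassert>

static int bitRotateAmountModel(const std::array<int, 8> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0; i != 8; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1; // crosses the sub-group boundary
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // groups disagree on the rotation
      RotateAmt = Offset;
    }
  return RotateAmt;
}

static void bitRotateExamples() {
  // Each group of 4 elements rotated left by one element.
  assert(bitRotateAmountModel({{3, 0, 1, 2, 7, 4, 5, 6}}, 4) == 1);
  // Not a rotation: the second group is an identity.
  assert(bitRotateAmountModel({{3, 0, 1, 2, 4, 5, 6, 7}}, 4) == -1);
}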
12773
12774static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12775 const X86Subtarget &Subtarget,
12776 ArrayRef<int> Mask) {
12777 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12778 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12779
12780 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12781 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12782 int MaxSubElts = 64 / EltSizeInBits;
12783 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12784 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12785 if (RotateAmt < 0)
12786 continue;
12787
12788 int NumElts = Mask.size();
12789 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12790 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12791 return RotateAmt * EltSizeInBits;
12792 }
12793
12794 return -1;
12795}
12796
12797/// Lower shuffle using X86ISD::VROTLI rotations.
12798static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12799 ArrayRef<int> Mask,
12800 const X86Subtarget &Subtarget,
12801 SelectionDAG &DAG) {
12802 // Only XOP + AVX512 targets have bit rotation instructions.
12803 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12804 bool IsLegal =
12805 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12806 if (!IsLegal && Subtarget.hasSSE3())
12807 return SDValue();
12808
12809 MVT RotateVT;
12810 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12811 Subtarget, Mask);
12812 if (RotateAmt < 0)
12813 return SDValue();
12814
12815 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12816 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12817 // widen to vXi16 or more then the existing lowering will be better.
12818 if (!IsLegal) {
12819 if ((RotateAmt % 16) == 0)
12820 return SDValue();
12821 // TODO: Use getTargetVShiftByConstNode.
12822 unsigned ShlAmt = RotateAmt;
12823 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12824 V1 = DAG.getBitcast(RotateVT, V1);
12825 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12826 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12827 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12828 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12829 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12830 return DAG.getBitcast(VT, Rot);
12831 }
12832
12833 SDValue Rot =
12834 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12835 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12836 return DAG.getBitcast(VT, Rot);
12837}
12838
12839/// Try to match a vector shuffle as an element rotation.
12840///
12841 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12842static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12843 ArrayRef<int> Mask) {
12844 int NumElts = Mask.size();
12845
12846 // We need to detect various ways of spelling a rotation:
12847 // [11, 12, 13, 14, 15, 0, 1, 2]
12848 // [-1, 12, 13, 14, -1, -1, 1, -1]
12849 // [-1, -1, -1, -1, -1, -1, 1, 2]
12850 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12851 // [-1, 4, 5, 6, -1, -1, 9, -1]
12852 // [-1, 4, 5, 6, -1, -1, -1, -1]
12853 int Rotation = 0;
12854 SDValue Lo, Hi;
12855 for (int i = 0; i < NumElts; ++i) {
12856 int M = Mask[i];
12857 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12858 "Unexpected mask index.");
12859 if (M < 0)
12860 continue;
12861
12862 // Determine where a rotated vector would have started.
12863 int StartIdx = i - (M % NumElts);
12864 if (StartIdx == 0)
12865 // The identity rotation isn't interesting, stop.
12866 return -1;
12867
12868 // If we found the tail of a vector the rotation must be the missing
12869 // front. If we found the head of a vector, it must be how much of the
12870 // head.
12871 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12872
12873 if (Rotation == 0)
12874 Rotation = CandidateRotation;
12875 else if (Rotation != CandidateRotation)
12876 // The rotations don't match, so we can't match this mask.
12877 return -1;
12878
12879 // Compute which value this mask is pointing at.
12880 SDValue MaskV = M < NumElts ? V1 : V2;
12881
12882 // Compute which of the two target values this index should be assigned
12883 // to. This reflects whether the high elements are remaining or the low
12884 // elements are remaining.
12885 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12886
12887 // Either set up this value if we've not encountered it before, or check
12888 // that it remains consistent.
12889 if (!TargetV)
12890 TargetV = MaskV;
12891 else if (TargetV != MaskV)
12892 // This may be a rotation, but it pulls from the inputs in some
12893 // unsupported interleaving.
12894 return -1;
12895 }
12896
12897 // Check that we successfully analyzed the mask, and normalize the results.
12898 assert(Rotation != 0 && "Failed to locate a viable rotation!");
12899 assert((Lo || Hi) && "Failed to find a rotated input vector!");
12900 if (!Lo)
12901 Lo = Hi;
12902 else if (!Hi)
12903 Hi = Lo;
12904
12905 V1 = Lo;
12906 V2 = Hi;
12907
12908 return Rotation;
12909}
12910
12911/// Try to lower a vector shuffle as a byte rotation.
12912///
12913/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12914/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12915/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12916 /// try to generically lower a vector shuffle through such a pattern. It
12917/// does not check for the profitability of lowering either as PALIGNR or
12918/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12919/// This matches shuffle vectors that look like:
12920///
12921/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12922///
12923/// Essentially it concatenates V1 and V2, shifts right by some number of
12924/// elements, and takes the low elements as the result. Note that while this is
12925/// specified as a *right shift* because x86 is little-endian, it is a *left
12926/// rotate* of the vector lanes.
12927static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12928 ArrayRef<int> Mask) {
12929 // Don't accept any shuffles with zero elements.
12930 if (isAnyZero(Mask))
12931 return -1;
12932
12933 // PALIGNR works on 128-bit lanes.
12934 SmallVector<int, 16> RepeatedMask;
12935 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12936 return -1;
12937
12938 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12939 if (Rotation <= 0)
12940 return -1;
12941
12942 // PALIGNR rotates bytes, so we need to scale the
12943 // rotation based on how many bytes are in the vector lane.
12944 int NumElts = RepeatedMask.size();
12945 int Scale = 16 / NumElts;
12946 return Rotation * Scale;
12947}
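
// Illustrative sketch (not part of the original source): the rotation match
// and byte scaling above worked through for the documented v8i16 mask
// [11, 12, 13, 14, 15, 0, 1, 2]. The element rotation is 3, and PALIGNR takes
// a byte count, so the immediate is 3 * (16 / 8) = 6.
#include <array>
#include <cassert>

static void byteRotateExample() {
  const std::array<int, 8> Mask = {11, 12, 13, 14, 15, 0, 1, 2};
  const int NumElts = 8;
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    // Elements pulled from the tail of one input give a negative start index,
    // elements pulled from the head of the other give a positive one; both
    // must imply the same rotation amount.
    int StartIdx = i - (Mask[i] % NumElts);
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    assert(Rotation == 0 || Rotation == Candidate);
    Rotation = Candidate;
  }
  assert(Rotation == 3);
  assert(Rotation * (16 / NumElts) == 6); // PALIGNR byte immediate
}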
12948
12949static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12950 SDValue V2, ArrayRef<int> Mask,
12951 const X86Subtarget &Subtarget,
12952 SelectionDAG &DAG) {
12953 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12954
12955 SDValue Lo = V1, Hi = V2;
12956 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12957 if (ByteRotation <= 0)
12958 return SDValue();
12959
12960 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12961 // PSLLDQ/PSRLDQ.
12962 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12963 Lo = DAG.getBitcast(ByteVT, Lo);
12964 Hi = DAG.getBitcast(ByteVT, Hi);
12965
12966 // SSSE3 targets can use the palignr instruction.
12967 if (Subtarget.hasSSSE3()) {
12968 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12969 "512-bit PALIGNR requires BWI instructions");
12970 return DAG.getBitcast(
12971 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12972 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12973 }
12974
12975 assert(VT.is128BitVector() &&
12976 "Rotate-based lowering only supports 128-bit lowering!");
12977 assert(Mask.size() <= 16 &&
12978 "Can shuffle at most 16 bytes in a 128-bit vector!");
12979 assert(ByteVT == MVT::v16i8 &&
12980 "SSE2 rotate lowering only needed for v16i8!");
12981
12982 // Default SSE2 implementation
12983 int LoByteShift = 16 - ByteRotation;
12984 int HiByteShift = ByteRotation;
12985
12986 SDValue LoShift =
12987 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12988 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12989 SDValue HiShift =
12990 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12991 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12992 return DAG.getBitcast(VT,
12993 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12994}
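
// Illustrative sketch (not part of the original source): a byte-level model of
// the SSE2 fallback above. With a byte rotation R, the result is
// PSLLDQ(Lo, 16 - R) OR'd with PSRLDQ(Hi, R); sse2RotateModel and the sample
// values are assumptions for the example.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint8_t, 16> sse2RotateModel(const std::array<uint8_t, 16> &Lo,
                                               const std::array<uint8_t, 16> &Hi,
                                               int R) {
  std::array<uint8_t, 16> Out{};
  for (int i = 0; i < 16; ++i) {
    // PSLLDQ moves Lo byte i up to byte i + (16 - R); PSRLDQ moves Hi byte
    // i + R down to byte i. Exactly one of the two contributes to each lane.
    uint8_t FromLo = (i >= 16 - R) ? Lo[i - (16 - R)] : 0;
    uint8_t FromHi = (i + R < 16) ? Hi[i + R] : 0;
    Out[i] = (uint8_t)(FromLo | FromHi);
  }
  return Out;
}

static void sse2RotateExample() {
  std::array<uint8_t, 16> Lo{}, Hi{};
  for (int i = 0; i < 16; ++i) {
    Lo[i] = (uint8_t)i;        // bytes 0..15
    Hi[i] = (uint8_t)(16 + i); // bytes 16..31
  }
  std::array<uint8_t, 16> Out = sse2RotateModel(Lo, Hi, /*R=*/5);
  assert(Out[0] == 21 && Out[10] == 31 && Out[11] == 0 && Out[15] == 4);
}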
12995
12996/// Try to lower a vector shuffle as a dword/qword rotation.
12997///
12998 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12999 /// rotation of the concatenation of two vectors; this routine will
13000 /// try to generically lower a vector shuffle through such a pattern.
13001///
13002/// Essentially it concatenates V1 and V2, shifts right by some number of
13003/// elements, and takes the low elements as the result. Note that while this is
13004/// specified as a *right shift* because x86 is little-endian, it is a *left
13005/// rotate* of the vector lanes.
13006static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13007 SDValue V2, ArrayRef<int> Mask,
13008 const X86Subtarget &Subtarget,
13009 SelectionDAG &DAG) {
13010 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13011 "Only 32-bit and 64-bit elements are supported!");
13012
13013 // 128/256-bit vectors are only supported with VLX.
13014 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13015 && "VLX required for 128/256-bit vectors");
13016
13017 SDValue Lo = V1, Hi = V2;
13018 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13019 if (Rotation <= 0)
13020 return SDValue();
13021
13022 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13023 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13024}
13025
13026/// Try to lower a vector shuffle as a byte shift sequence.
13027static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13028 SDValue V2, ArrayRef<int> Mask,
13029 const APInt &Zeroable,
13030 const X86Subtarget &Subtarget,
13031 SelectionDAG &DAG) {
13032 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13033 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13034
13035 // We need a shuffle that has zeros at one/both ends and a sequential
13036 // shuffle from one source within.
13037 unsigned ZeroLo = Zeroable.countTrailingOnes();
13038 unsigned ZeroHi = Zeroable.countLeadingOnes();
13039 if (!ZeroLo && !ZeroHi)
13040 return SDValue();
13041
13042 unsigned NumElts = Mask.size();
13043 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13044 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13045 return SDValue();
13046
13047 unsigned Scale = VT.getScalarSizeInBits() / 8;
13048 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13049 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13050 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13051 return SDValue();
13052
13053 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13054 Res = DAG.getBitcast(MVT::v16i8, Res);
13055
13056 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13057 // inner sequential set of elements, possibly offset:
13058 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13059 // 01234567 --> 4567zzzz --> zzzzz456
13060 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13061 if (ZeroLo == 0) {
13062 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13063 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13064 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13065 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13066 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13067 } else if (ZeroHi == 0) {
13068 unsigned Shift = Mask[ZeroLo] % NumElts;
13069 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13070 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13071 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13072 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13073 } else if (!Subtarget.hasSSSE3()) {
13074 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13075 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13076 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13077 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13078 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13079 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13080 Shift += Mask[ZeroLo] % NumElts;
13081 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13082 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13083 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13084 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13085 } else
13086 return SDValue();
13087
13088 return DAG.getBitcast(VT, Res);
13089}
13090
13091/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13092///
13093/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13094/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13095/// matches elements from one of the input vectors shuffled to the left or
13096/// right with zeroable elements 'shifted in'. It handles both the strictly
13097/// bit-wise element shifts and the byte shift across an entire 128-bit double
13098/// quad word lane.
13099///
13100/// PSHL : (little-endian) left bit shift.
13101/// [ zz, 0, zz, 2 ]
13102/// [ -1, 4, zz, -1 ]
13103/// PSRL : (little-endian) right bit shift.
13104/// [ 1, zz, 3, zz]
13105/// [ -1, -1, 7, zz]
13106/// PSLLDQ : (little-endian) left byte shift
13107/// [ zz, 0, 1, 2, 3, 4, 5, 6]
13108/// [ zz, zz, -1, -1, 2, 3, 4, -1]
13109/// [ zz, zz, zz, zz, zz, zz, -1, 1]
13110/// PSRLDQ : (little-endian) right byte shift
13111/// [ 5, 6, 7, zz, zz, zz, zz, zz]
13112/// [ -1, 5, 6, 7, zz, zz, zz, zz]
13113/// [ 1, 2, -1, -1, -1, -1, zz, zz]
13114static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13115 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13116 int MaskOffset, const APInt &Zeroable,
13117 const X86Subtarget &Subtarget) {
13118 int Size = Mask.size();
13119 unsigned SizeInBits = Size * ScalarSizeInBits;
13120
13121 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13122 for (int i = 0; i < Size; i += Scale)
13123 for (int j = 0; j < Shift; ++j)
13124 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13125 return false;
13126
13127 return true;
13128 };
13129
13130 auto MatchShift = [&](int Shift, int Scale, bool Left) {
13131 for (int i = 0; i != Size; i += Scale) {
13132 unsigned Pos = Left ? i + Shift : i;
13133 unsigned Low = Left ? i : i + Shift;
13134 unsigned Len = Scale - Shift;
13135 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13136 return -1;
13137 }
13138
13139 int ShiftEltBits = ScalarSizeInBits * Scale;
13140 bool ByteShift = ShiftEltBits > 64;
13141 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13142 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13143 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
13144
13145 // Normalize the scale for byte shifts to still produce an i64 element
13146 // type.
13147 Scale = ByteShift ? Scale / 2 : Scale;
13148
13149 // We need to round trip through the appropriate type for the shift.
13150 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
13151 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
13152 : MVT::getVectorVT(ShiftSVT, Size / Scale);
13153 return (int)ShiftAmt;
13154 };
13155
13156 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
13157 // keep doubling the size of the integer elements up to that. We can
13158 // then shift the elements of the integer vector by whole multiples of
13159 // their width within the elements of the larger integer vector. Test each
13160 // multiple to see if we can find a match with the moved element indices
13161 // and that the shifted in elements are all zeroable.
13162 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
13163 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
13164 for (int Shift = 1; Shift != Scale; ++Shift)
13165 for (bool Left : {true, false})
13166 if (CheckZeros(Shift, Scale, Left)) {
13167 int ShiftAmt = MatchShift(Shift, Scale, Left);
13168 if (0 < ShiftAmt)
13169 return ShiftAmt;
13170 }
13171
13172 // no match
13173 return -1;
13174}
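
// Illustrative sketch (not part of the original source): one concrete case of
// the shift matcher above, checked by hand. The v4i32 mask [zz, 0, zz, 2]
// (zz = zeroable) groups pairs of i32 lanes into i64 elements whose low halves
// are zeroed, i.e. a 32-bit X86ISD::VSHLI of the value bitcast to v2i64.
#include <array>
#include <cassert>

static void shiftMatchExample() {
  const int ScalarSizeInBits = 32;
  const int Scale = 2;  // two i32 lanes per i64 shift element
  const int Shift = 1;  // shift by one i32 lane within each element
  const bool Left = true;
  const std::array<int, 4> Mask = {-1, 0, -1, 2};
  const std::array<bool, 4> Zeroable = {true, false, true, false};

  // CheckZeros: the lanes that get shifted in must be zeroable.
  for (int i = 0; i < 4; i += Scale)
    for (int j = 0; j < Shift; ++j)
      assert(Zeroable[i + j + (Left ? 0 : (Scale - Shift))]);

  // MatchShift (Len == 1 here): surviving lanes read sequentially from V1.
  for (int i = 0; i < 4; i += Scale) {
    int Pos = Left ? i + Shift : i;
    int Low = Left ? i : i + Shift;
    assert(Mask[Pos] < 0 || Mask[Pos] == Low);
  }

  // The reported amount for an element (non-byte) shift is in bits.
  assert(Shift * ScalarSizeInBits == 32);
}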
13175
13176static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
13177 SDValue V2, ArrayRef<int> Mask,
13178 const APInt &Zeroable,
13179 const X86Subtarget &Subtarget,
13180 SelectionDAG &DAG) {
13181 int Size = Mask.size();
13182 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13183
13184 MVT ShiftVT;
13185 SDValue V = V1;
13186 unsigned Opcode;
13187
13188 // Try to match shuffle against V1 shift.
13189 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13190 Mask, 0, Zeroable, Subtarget);
13191
13192 // If V1 failed, try to match shuffle against V2 shift.
13193 if (ShiftAmt < 0) {
13194 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13195 Mask, Size, Zeroable, Subtarget);
13196 V = V2;
13197 }
13198
13199 if (ShiftAmt < 0)
13200 return SDValue();
13201
13202 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
13203 "Illegal integer vector type");
13204 V = DAG.getBitcast(ShiftVT, V);
13205 V = DAG.getNode(Opcode, DL, ShiftVT, V,
13206 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
13207 return DAG.getBitcast(VT, V);
13208}
13209
13210// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
13211// Remainder of lower half result is zero and upper half is all undef.
13212static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
13213 ArrayRef<int> Mask, uint64_t &BitLen,
13214 uint64_t &BitIdx, const APInt &Zeroable) {
13215 int Size = Mask.size();
13216 int HalfSize = Size / 2;
13217 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13218 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
13219
13220 // Upper half must be undefined.
13221 if (!isUndefUpperHalf(Mask))
13222 return false;
13223
13224 // Determine the extraction length from the part of the
13225 // lower half that isn't zeroable.
13226 int Len = HalfSize;
13227 for (; Len > 0; --Len)
13228 if (!Zeroable[Len - 1])
13229 break;
13230 assert(Len > 0 && "Zeroable shuffle mask");
13231
13232 // Attempt to match first Len sequential elements from the lower half.
13233 SDValue Src;
13234 int Idx = -1;
13235 for (int i = 0; i != Len; ++i) {
13236 int M = Mask[i];
13237 if (M == SM_SentinelUndef)
13238 continue;
13239 SDValue &V = (M < Size ? V1 : V2);
13240 M = M % Size;
13241
13242 // The extracted elements must start at a valid index and all mask
13243 // elements must be in the lower half.
13244 if (i > M || M >= HalfSize)
13245 return false;
13246
13247 if (Idx < 0 || (Src == V && Idx == (M - i))) {
13248 Src = V;
13249 Idx = M - i;
13250 continue;
13251 }
13252 return false;
13253 }
13254
13255 if (!Src || Idx < 0)
13256 return false;
13257
13258 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13259 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13260 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13261 V1 = Src;
13262 return true;
13263}
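
// Illustrative sketch (not part of the original source): the EXTRQ immediate
// encoding above for a v8i16 shuffle whose upper half is undef, whose lanes
// 2-3 are zeroable, and whose lanes 0-1 read source elements 1-2. The matcher
// reports Len = 2 and Idx = 1, giving a 32-bit field starting at bit 16.
#include <cassert>
#include <cstdint>

static void extrqImmediateExample() {
  const int Len = 2;               // sequential non-zeroable elements matched
  const int Idx = 1;               // first source element of that run
  const int ScalarSizeInBits = 16;
  uint64_t BitLen = (uint64_t)(Len * ScalarSizeInBits) & 0x3f;
  uint64_t BitIdx = (uint64_t)(Idx * ScalarSizeInBits) & 0x3f;
  assert(BitLen == 32 && BitIdx == 16);
}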
13264
13265// INSERTQ: Extract lowest Len elements from lower half of second source and
13266// insert over first source, starting at Idx.
13267// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13268static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13269 ArrayRef<int> Mask, uint64_t &BitLen,
13270 uint64_t &BitIdx) {
13271 int Size = Mask.size();
13272 int HalfSize = Size / 2;
13273 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13274
13275 // Upper half must be undefined.
13276 if (!isUndefUpperHalf(Mask))
13277 return false;
13278
13279 for (int Idx = 0; Idx != HalfSize; ++Idx) {
13280 SDValue Base;
13281
13282 // Attempt to match first source from mask before insertion point.
13283 if (isUndefInRange(Mask, 0, Idx)) {
13284 /* EMPTY */
13285 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13286 Base = V1;
13287 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13288 Base = V2;
13289 } else {
13290 continue;
13291 }
13292
13293 // Extend the extraction length looking to match both the insertion of
13294 // the second source and the remaining elements of the first.
13295 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13296 SDValue Insert;
13297 int Len = Hi - Idx;
13298
13299 // Match insertion.
13300 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13301 Insert = V1;
13302 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13303 Insert = V2;
13304 } else {
13305 continue;
13306 }
13307
13308 // Match the remaining elements of the lower half.
13309 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13310 /* EMPTY */
13311 } else if ((!Base || (Base == V1)) &&
13312 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13313 Base = V1;
13314 } else if ((!Base || (Base == V2)) &&
13315 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13316 Size + Hi)) {
13317 Base = V2;
13318 } else {
13319 continue;
13320 }
13321
13322 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13323 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13324 V1 = Base;
13325 V2 = Insert;
13326 return true;
13327 }
13328 }
13329
13330 return false;
13331}
13332
13333/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13334static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13335 SDValue V2, ArrayRef<int> Mask,
13336 const APInt &Zeroable, SelectionDAG &DAG) {
13337 uint64_t BitLen, BitIdx;
13338 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13339 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13340 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13341 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13342
13343 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13344 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13345 V2 ? V2 : DAG.getUNDEF(VT),
13346 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13347 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13348
13349 return SDValue();
13350}
13351
13352/// Lower a vector shuffle as a zero or any extension.
13353///
13354/// Given a specific number of elements, element bit width, and extension
13355/// stride, produce either a zero or any extension based on the available
13356 /// features of the subtarget. The extended elements are consecutive and
13357 /// can begin at an offset element index in the input; to
13358 /// avoid excess shuffling the offset must either be in the bottom lane
13359 /// or at the start of a higher lane. All extended elements must be from
13360/// the same lane.
13361static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13362 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13363 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13364 assert(Scale > 1 && "Need a scale to extend.");
13365 int EltBits = VT.getScalarSizeInBits();
13366 int NumElements = VT.getVectorNumElements();
13367 int NumEltsPerLane = 128 / EltBits;
13368 int OffsetLane = Offset / NumEltsPerLane;
13369 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13370 "Only 8, 16, and 32 bit elements can be extended.");
13371 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13372 assert(0 <= Offset && "Extension offset must be positive.");
13373 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13374 "Extension offset must be in the first lane or start an upper lane.");
13375
13376 // Check that an index is in same lane as the base offset.
13377 auto SafeOffset = [&](int Idx) {
13378 return OffsetLane == (Idx / NumEltsPerLane);
13379 };
13380
13381 // Shift along an input so that the offset base moves to the first element.
13382 auto ShuffleOffset = [&](SDValue V) {
13383 if (!Offset)
13384 return V;
13385
13386 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13387 for (int i = 0; i * Scale < NumElements; ++i) {
13388 int SrcIdx = i + Offset;
13389 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13390 }
13391 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13392 };
13393
13394 // Found a valid a/zext mask! Try various lowering strategies based on the
13395 // input type and available ISA extensions.
13396 if (Subtarget.hasSSE41()) {
13397 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13398 // PUNPCK will catch this in a later shuffle match.
13399 if (Offset && Scale == 2 && VT.is128BitVector())
13400 return SDValue();
13401 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13402 NumElements / Scale);
13403 InputV = ShuffleOffset(InputV);
13404 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13405 DL, ExtVT, InputV, DAG);
13406 return DAG.getBitcast(VT, InputV);
13407 }
13408
13409 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13410
13411 // For any extends we can cheat for larger element sizes and use shuffle
13412 // instructions that can fold with a load and/or copy.
13413 if (AnyExt && EltBits == 32) {
13414 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13415 -1};
13416 return DAG.getBitcast(
13417 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13418 DAG.getBitcast(MVT::v4i32, InputV),
13419 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13420 }
13421 if (AnyExt && EltBits == 16 && Scale > 2) {
13422 int PSHUFDMask[4] = {Offset / 2, -1,
13423 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13424 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13425 DAG.getBitcast(MVT::v4i32, InputV),
13426 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13427 int PSHUFWMask[4] = {1, -1, -1, -1};
13428 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13429 return DAG.getBitcast(
13430 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13431 DAG.getBitcast(MVT::v8i16, InputV),
13432 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13433 }
13434
13435 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13436 // to 64-bits.
13437 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13438 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13439 assert(VT.is128BitVector() && "Unexpected vector width!");
13440
13441 int LoIdx = Offset * EltBits;
13442 SDValue Lo = DAG.getBitcast(
13443 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13444 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13445 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13446
13447 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13448 return DAG.getBitcast(VT, Lo);
13449
13450 int HiIdx = (Offset + 1) * EltBits;
13451 SDValue Hi = DAG.getBitcast(
13452 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13453 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13454 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13455 return DAG.getBitcast(VT,
13456 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13457 }
13458
13459 // If this would require more than 2 unpack instructions to expand, use
13460 // pshufb when available. We can only use more than 2 unpack instructions
13461 // when zero extending i8 elements which also makes it easier to use pshufb.
13462 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13463 assert(NumElements == 16 && "Unexpected byte vector width!");
13464 SDValue PSHUFBMask[16];
13465 for (int i = 0; i < 16; ++i) {
13466 int Idx = Offset + (i / Scale);
13467 if ((i % Scale == 0 && SafeOffset(Idx))) {
13468 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13469 continue;
13470 }
13471 PSHUFBMask[i] =
13472 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13473 }
13474 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13475 return DAG.getBitcast(
13476 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13477 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13478 }
13479
13480 // If we are extending from an offset, ensure we start on a boundary that
13481 // we can unpack from.
13482 int AlignToUnpack = Offset % (NumElements / Scale);
13483 if (AlignToUnpack) {
13484 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13485 for (int i = AlignToUnpack; i < NumElements; ++i)
13486 ShMask[i - AlignToUnpack] = i;
13487 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13488 Offset -= AlignToUnpack;
13489 }
13490
13491 // Otherwise emit a sequence of unpacks.
13492 do {
13493 unsigned UnpackLoHi = X86ISD::UNPCKL;
13494 if (Offset >= (NumElements / 2)) {
13495 UnpackLoHi = X86ISD::UNPCKH;
13496 Offset -= (NumElements / 2);
13497 }
13498
13499 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13500 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13501 : getZeroVector(InputVT, Subtarget, DAG, DL);
13502 InputV = DAG.getBitcast(InputVT, InputV);
13503 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13504 Scale /= 2;
13505 EltBits *= 2;
13506 NumElements /= 2;
13507 } while (Scale > 1);
13508 return DAG.getBitcast(VT, InputV);
13509}
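
// Illustrative sketch (not part of the original source): the final unpack loop
// above in its simplest form. Zero-extending the low 8 bytes of a v16i8 to a
// v8i16 is a single UNPCKL of the source with an all-zero vector, i.e. each
// source byte is interleaved with a zero byte; zextViaUnpackModel is a
// hypothetical standalone model.
#include <array>
#include <cstdint>

static std::array<uint16_t, 8> zextViaUnpackModel(const std::array<uint8_t, 16> &V) {
  std::array<uint8_t, 16> Interleaved{};
  for (int i = 0; i < 8; ++i) {
    Interleaved[2 * i] = V[i];  // low byte comes from the source
    Interleaved[2 * i + 1] = 0; // high byte is the shifted-in zero
  }
  std::array<uint16_t, 8> Out{};
  for (int i = 0; i < 8; ++i) // reinterpret byte pairs as little-endian i16
    Out[i] = (uint16_t)(Interleaved[2 * i] | (Interleaved[2 * i + 1] << 8));
  return Out;
}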
13510
13511/// Try to lower a vector shuffle as a zero extension on any microarch.
13512///
13513/// This routine will try to do everything in its power to cleverly lower
13514/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13515/// check for the profitability of this lowering, it tries to aggressively
13516/// match this pattern. It will use all of the micro-architectural details it
13517/// can to emit an efficient lowering. It handles both blends with all-zero
13518/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13519/// masking out later).
13520///
13521/// The reason we have dedicated lowering for zext-style shuffles is that they
13522/// are both incredibly common and often quite performance sensitive.
13523static SDValue lowerShuffleAsZeroOrAnyExtend(
13524 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13525 const APInt &Zeroable, const X86Subtarget &Subtarget,
13526 SelectionDAG &DAG) {
13527 int Bits = VT.getSizeInBits();
13528 int NumLanes = Bits / 128;
13529 int NumElements = VT.getVectorNumElements();
13530 int NumEltsPerLane = NumElements / NumLanes;
13531 assert(VT.getScalarSizeInBits() <= 32 &&
13532 "Exceeds 32-bit integer zero extension limit");
13533 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13534
13535 // Define a helper function to check a particular ext-scale and lower to it if
13536 // valid.
13537 auto Lower = [&](int Scale) -> SDValue {
13538 SDValue InputV;
13539 bool AnyExt = true;
13540 int Offset = 0;
13541 int Matches = 0;
13542 for (int i = 0; i < NumElements; ++i) {
13543 int M = Mask[i];
13544 if (M < 0)
13545 continue; // Valid anywhere but doesn't tell us anything.
13546 if (i % Scale != 0) {
13547 // Each of the extended elements needs to be zeroable.
13548 if (!Zeroable[i])
13549 return SDValue();
13550
13551 // We no longer are in the anyext case.
13552 AnyExt = false;
13553 continue;
13554 }
13555
13556 // The base elements need to be consecutive indices into the
13557 // same input vector.
13558 SDValue V = M < NumElements ? V1 : V2;
13559 M = M % NumElements;
13560 if (!InputV) {
13561 InputV = V;
13562 Offset = M - (i / Scale);
13563 } else if (InputV != V)
13564 return SDValue(); // Flip-flopping inputs.
13565
13566 // Offset must start in the lowest 128-bit lane or at the start of an
13567 // upper lane.
13568 // FIXME: Is it ever worth allowing a negative base offset?
13569 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13570 (Offset % NumEltsPerLane) == 0))
13571 return SDValue();
13572
13573 // If we are offsetting, all referenced entries must come from the same
13574 // lane.
13575 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13576 return SDValue();
13577
13578 if ((M % NumElements) != (Offset + (i / Scale)))
13579 return SDValue(); // Non-consecutive strided elements.
13580 Matches++;
13581 }
13582
13583 // If we fail to find an input, we have a zero-shuffle which should always
13584 // have already been handled.
13585 // FIXME: Maybe handle this here in case during blending we end up with one?
13586 if (!InputV)
13587 return SDValue();
13588
13589 // If we are offsetting, don't extend if we only match a single input; we
13590 // can always do better by using a basic PSHUF or PUNPCK.
13591 if (Offset != 0 && Matches < 2)
13592 return SDValue();
13593
13594 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13595 InputV, Mask, Subtarget, DAG);
13596 };
13597
13598 // The widest scale possible for extending is to a 64-bit integer.
13599 assert(Bits % 64 == 0 &&
13600 "The number of bits in a vector must be divisible by 64 on x86!");
13601 int NumExtElements = Bits / 64;
13602
13603 // Each iteration, try extending the elements half as much, but into twice as
13604 // many elements.
13605 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13606 assert(NumElements % NumExtElements == 0 &&
13607 "The input vector size must be divisible by the extended size.");
13608 if (SDValue V = Lower(NumElements / NumExtElements))
13609 return V;
13610 }
13611
13612 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13613 if (Bits != 128)
13614 return SDValue();
13615
13616 // Returns one of the source operands if the shuffle can be reduced to a
13617 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13618 auto CanZExtLowHalf = [&]() {
13619 for (int i = NumElements / 2; i != NumElements; ++i)
13620 if (!Zeroable[i])
13621 return SDValue();
13622 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13623 return V1;
13624 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13625 return V2;
13626 return SDValue();
13627 };
13628
13629 if (SDValue V = CanZExtLowHalf()) {
13630 V = DAG.getBitcast(MVT::v2i64, V);
13631 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13632 return DAG.getBitcast(VT, V);
13633 }
13634
13635 // No viable ext lowering found.
13636 return SDValue();
13637}
13638
13639/// Try to get a scalar value for a specific element of a vector.
13640///
13641/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13642static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13643 SelectionDAG &DAG) {
13644 MVT VT = V.getSimpleValueType();
13645 MVT EltVT = VT.getVectorElementType();
13646 V = peekThroughBitcasts(V);
13647
13648 // If the bitcasts shift the element size, we can't extract an equivalent
13649 // element from it.
13650 MVT NewVT = V.getSimpleValueType();
13651 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13652 return SDValue();
13653
13654 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13655 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13656 // Ensure the scalar operand is the same size as the destination.
13657 // FIXME: Add support for scalar truncation where possible.
13658 SDValue S = V.getOperand(Idx);
13659 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13660 return DAG.getBitcast(EltVT, S);
13661 }
13662
13663 return SDValue();
13664}
13665
13666/// Helper to test for a load that can be folded with x86 shuffles.
13667///
13668/// This is particularly important because the set of instructions varies
13669/// significantly based on whether the operand is a load or not.
13670static bool isShuffleFoldableLoad(SDValue V) {
13671 V = peekThroughBitcasts(V);
13672 return ISD::isNON_EXTLoad(V.getNode());
13673}
13674
13675/// Try to lower insertion of a single element into a zero vector.
13676///
13677/// This is a common pattern for which we have especially efficient lowering
13678/// patterns across all subtarget feature sets.
13679static SDValue lowerShuffleAsElementInsertion(
13680 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13681 const APInt &Zeroable, const X86Subtarget &Subtarget,
13682 SelectionDAG &DAG) {
13683 MVT ExtVT = VT;
13684 MVT EltVT = VT.getVectorElementType();
13685
13686 int V2Index =
13687 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13688 Mask.begin();
13689 bool IsV1Zeroable = true;
13690 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13691 if (i != V2Index && !Zeroable[i]) {
13692 IsV1Zeroable = false;
13693 break;
13694 }
13695
13696 // Check for a single input from a SCALAR_TO_VECTOR node.
13697 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13698 // all the smarts here sunk into that routine. However, the current
13699 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13700 // vector shuffle lowering is dead.
13701 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13702 DAG);
13703 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13704 // We need to zext the scalar if it is smaller than an i32.
13705 V2S = DAG.getBitcast(EltVT, V2S);
13706 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13707 // Using zext to expand a narrow element won't work for non-zero
13708 // insertions.
13709 if (!IsV1Zeroable)
13710 return SDValue();
13711
13712 // Zero-extend directly to i32.
13713 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13714 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13715 }
13716 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13717 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13718 EltVT == MVT::i16) {
13719 // Either not inserting from the low element of the input or the input
13720 // element size is too small to use VZEXT_MOVL to clear the high bits.
13721 return SDValue();
13722 }
13723
13724 if (!IsV1Zeroable) {
13725 // If V1 can't be treated as a zero vector we have fewer options to lower
13726 // this. We can't support integer vectors or non-zero targets cheaply, and
13727 // the V1 elements can't be permuted in any way.
13728     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13729 if (!VT.isFloatingPoint() || V2Index != 0)
13730 return SDValue();
13731 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13732 V1Mask[V2Index] = -1;
13733 if (!isNoopShuffleMask(V1Mask))
13734 return SDValue();
13735 if (!VT.is128BitVector())
13736 return SDValue();
13737
13738 // Otherwise, use MOVSD, MOVSS or MOVSH.
13739 unsigned MovOpc = 0;
13740 if (EltVT == MVT::f16)
13741 MovOpc = X86ISD::MOVSH;
13742 else if (EltVT == MVT::f32)
13743 MovOpc = X86ISD::MOVSS;
13744 else if (EltVT == MVT::f64)
13745 MovOpc = X86ISD::MOVSD;
13746 else
13747       llvm_unreachable("Unsupported floating point element type to handle!");
13748 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
13749 }
13750
13751 // This lowering only works for the low element with floating point vectors.
13752 if (VT.isFloatingPoint() && V2Index != 0)
13753 return SDValue();
13754
13755 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13756 if (ExtVT != VT)
13757 V2 = DAG.getBitcast(VT, V2);
13758
13759 if (V2Index != 0) {
13760 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13761 // the desired position. Otherwise it is more efficient to do a vector
13762 // shift left. We know that we can do a vector shift left because all
13763 // the inputs are zero.
13764 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13765 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13766 V2Shuffle[V2Index] = 0;
13767 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13768 } else {
13769 V2 = DAG.getBitcast(MVT::v16i8, V2);
13770 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13771 DAG.getTargetConstant(
13772 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13773 V2 = DAG.getBitcast(VT, V2);
13774 }
13775 }
13776 return V2;
13777}
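
// [Editor's illustration -- not part of X86ISelLowering.cpp] A sketch of the
// byte-shift count used in the VSHLDQ path above, for a hypothetical v8i16
// insertion whose scalar must land in element 5 (V2Index = 5, 16-bit elements).
#include <cassert>
int main() {
  const unsigned V2Index = 5, EltSizeInBits = 16;
  const unsigned ByteShift = V2Index * EltSizeInBits / 8;
  assert(ByteShift == 10); // shift the zero-extended scalar up by 10 bytes
  return 0;
}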
13778
13779/// Try to lower broadcast of a single - truncated - integer element,
13780/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13781///
13782/// This assumes we have AVX2.
13783static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13784 int BroadcastIdx,
13785 const X86Subtarget &Subtarget,
13786 SelectionDAG &DAG) {
13787   assert(Subtarget.hasAVX2() &&
13788          "We can only lower integer broadcasts with AVX2!");
13789
13790 MVT EltVT = VT.getVectorElementType();
13791 MVT V0VT = V0.getSimpleValueType();
13792
13793   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13794   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13795
13796 MVT V0EltVT = V0VT.getVectorElementType();
13797 if (!V0EltVT.isInteger())
13798 return SDValue();
13799
13800 const unsigned EltSize = EltVT.getSizeInBits();
13801 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13802
13803 // This is only a truncation if the original element type is larger.
13804 if (V0EltSize <= EltSize)
13805 return SDValue();
13806
13807   assert(((V0EltSize % EltSize) == 0) &&
13808          "Scalar type sizes must all be powers of 2 on x86!");
13809
13810 const unsigned V0Opc = V0.getOpcode();
13811 const unsigned Scale = V0EltSize / EltSize;
13812 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13813
13814 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13815 V0Opc != ISD::BUILD_VECTOR)
13816 return SDValue();
13817
13818 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13819
13820 // If we're extracting non-least-significant bits, shift so we can truncate.
13821 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13822 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13823 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13824 if (const int OffsetIdx = BroadcastIdx % Scale)
13825 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13826 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13827
13828 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13829 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13830}
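
// [Editor's illustration -- not part of X86ISelLowering.cpp] The index and
// shift arithmetic above, evaluated for a hypothetical broadcast of i16
// element 5 from a build_vector of i64 scalars (EltSize = 16, V0EltSize = 64).
#include <cassert>
int main() {
  const unsigned EltSize = 16, V0EltSize = 64, BroadcastIdx = 5;
  const unsigned Scale = V0EltSize / EltSize;           // 4 i16 lanes per i64
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale; // build_vector operand 1
  const unsigned OffsetIdx = BroadcastIdx % Scale;      // second i16 within it
  assert(Scale == 4 && V0BroadcastIdx == 1 && OffsetIdx == 1);
  assert(OffsetIdx * EltSize == 16); // SRL the scalar by 16 before truncating
  return 0;
}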
13831
13832/// Test whether this can be lowered with a single SHUFPS instruction.
13833///
13834/// This is used to disable more specialized lowerings when the shufps lowering
13835/// will happen to be efficient.
13836static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13837 // This routine only handles 128-bit shufps.
13838   assert(Mask.size() == 4 && "Unsupported mask size!");
13839   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13840   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13841   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13842   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13843
13844 // To lower with a single SHUFPS we need to have the low half and high half
13845 // each requiring a single input.
13846 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13847 return false;
13848 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13849 return false;
13850
13851 return true;
13852}
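
// [Editor's illustration -- not part of X86ISelLowering.cpp] A standalone copy
// of the "one input per half" rule above, checked on two sample masks.
#include <array>
#include <cassert>
static bool singleShufps(const std::array<int, 4> &M) {
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}
int main() {
  assert(singleShufps({0, 1, 4, 5}));  // each half reads from a single source
  assert(!singleShufps({0, 4, 1, 5})); // low half mixes V1 and V2 elements
  return 0;
}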
13853
13854/// If we are extracting two 128-bit halves of a vector and shuffling the
13855/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13856/// multi-shuffle lowering.
13857static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13858 SDValue N1, ArrayRef<int> Mask,
13859 SelectionDAG &DAG) {
13860 MVT VT = N0.getSimpleValueType();
13861   assert((VT.is128BitVector() &&
13862           (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13863          "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13864
13865 // Check that both sources are extracts of the same source vector.
13866 if (!N0.hasOneUse() || !N1.hasOneUse() ||
13867 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13868 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13869 N0.getOperand(0) != N1.getOperand(0))
13870 return SDValue();
13871
13872 SDValue WideVec = N0.getOperand(0);
13873 MVT WideVT = WideVec.getSimpleValueType();
13874 if (!WideVT.is256BitVector())
13875 return SDValue();
13876
13877 // Match extracts of each half of the wide source vector. Commute the shuffle
13878 // if the extract of the low half is N1.
13879 unsigned NumElts = VT.getVectorNumElements();
13880 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13881 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13882 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13883 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13884 ShuffleVectorSDNode::commuteMask(NewMask);
13885 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13886 return SDValue();
13887
13888 // Final bailout: if the mask is simple, we are better off using an extract
13889 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13890 // because that avoids a constant load from memory.
13891 if (NumElts == 4 &&
13892 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13893 return SDValue();
13894
13895 // Extend the shuffle mask with undef elements.
13896 NewMask.append(NumElts, -1);
13897
13898 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13899 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13900 NewMask);
13901 // This is free: ymm -> xmm.
13902 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13903 DAG.getIntPtrConstant(0, DL));
13904}
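
// [Editor's illustration -- not part of X86ISelLowering.cpp] How the narrow
// two-extract mask is rewritten into one wide mask, for a hypothetical v4f32
// shuffle of extract(X, 0) and extract(X, 4) with mask {0, 5, 2, 7} (neither a
// single-SHUFPS nor an unpack mask, so the bailout above does not trigger).
#include <cassert>
#include <vector>
int main() {
  const unsigned NumElts = 4;
  std::vector<int> NewMask = {0, 5, 2, 7};    // indices into concat(N0, N1)
  NewMask.insert(NewMask.end(), NumElts, -1); // pad with undef for the wide VT
  // N0 = X[0..3] and N1 = X[4..7], so the same indices now address X directly:
  // shuffle X by {0,5,2,7,-1,-1,-1,-1} and take the low 128 bits for free.
  assert(NewMask.size() == 8 && NewMask[1] == 5 && NewMask[4] == -1);
  return 0;
}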
13905
13906/// Try to lower broadcast of a single element.
13907///
13908/// For convenience, this code also bundles all of the subtarget feature set
13909/// filtering. While a little annoying to re-dispatch on type here, there isn't
13910/// a convenient way to factor it out.
13911static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13912 SDValue V2, ArrayRef<int> Mask,
13913 const X86Subtarget &Subtarget,
13914 SelectionDAG &DAG) {
13915 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13916 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13917 (Subtarget.hasAVX2() && VT.isInteger())))
13918 return SDValue();
13919
13920 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13921 // we can only broadcast from a register with AVX2.
13922 unsigned NumEltBits = VT.getScalarSizeInBits();
13923 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13924 ? X86ISD::MOVDDUP
13925 : X86ISD::VBROADCAST;
13926 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13927
13928 // Check that the mask is a broadcast.
13929 int BroadcastIdx = getSplatIndex(Mask);
13930 if (BroadcastIdx < 0)
13931 return SDValue();
13932   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13933                                             "a sorted mask where the broadcast "
13934                                             "comes from V1.");
13935
13936 // Go up the chain of (vector) values to find a scalar load that we can
13937 // combine with the broadcast.
13938 // TODO: Combine this logic with findEltLoadSrc() used by
13939 // EltsFromConsecutiveLoads().
13940 int BitOffset = BroadcastIdx * NumEltBits;
13941 SDValue V = V1;
13942 for (;;) {
13943 switch (V.getOpcode()) {
13944 case ISD::BITCAST: {
13945 V = V.getOperand(0);
13946 continue;
13947 }
13948 case ISD::CONCAT_VECTORS: {
13949 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13950 int OpIdx = BitOffset / OpBitWidth;
13951 V = V.getOperand(OpIdx);
13952 BitOffset %= OpBitWidth;
13953 continue;
13954 }
13955 case ISD::EXTRACT_SUBVECTOR: {
13956 // The extraction index adds to the existing offset.
13957 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13958 unsigned Idx = V.getConstantOperandVal(1);
13959 unsigned BeginOffset = Idx * EltBitWidth;
13960 BitOffset += BeginOffset;
13961 V = V.getOperand(0);
13962 continue;
13963 }
13964 case ISD::INSERT_SUBVECTOR: {
13965 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13966 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13967 int Idx = (int)V.getConstantOperandVal(2);
13968 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13969 int BeginOffset = Idx * EltBitWidth;
13970 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13971 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13972 BitOffset -= BeginOffset;
13973 V = VInner;
13974 } else {
13975 V = VOuter;
13976 }
13977 continue;
13978 }
13979 }
13980 break;
13981 }
13982   assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13983 BroadcastIdx = BitOffset / NumEltBits;
13984
13985 // Do we need to bitcast the source to retrieve the original broadcast index?
13986 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13987
13988 // Check if this is a broadcast of a scalar. We special case lowering
13989 // for scalars so that we can more effectively fold with loads.
13990 // If the original value has a larger element type than the shuffle, the
13991 // broadcast element is in essence truncated. Make that explicit to ease
13992 // folding.
13993 if (BitCastSrc && VT.isInteger())
13994 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13995 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13996 return TruncBroadcast;
13997
13998 // Also check the simpler case, where we can directly reuse the scalar.
13999 if (!BitCastSrc &&
14000 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14001 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14002 V = V.getOperand(BroadcastIdx);
14003
14004 // If we can't broadcast from a register, check that the input is a load.
14005 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14006 return SDValue();
14007 } else if (ISD::isNormalLoad(V.getNode()) &&
14008 cast<LoadSDNode>(V)->isSimple()) {
14009 // We do not check for one-use of the vector load because a broadcast load
14010 // is expected to be a win for code size, register pressure, and possibly
14011 // uops even if the original vector load is not eliminated.
14012
14013 // Reduce the vector load and shuffle to a broadcasted scalar load.
14014 LoadSDNode *Ld = cast<LoadSDNode>(V);
14015 SDValue BaseAddr = Ld->getOperand(1);
14016 MVT SVT = VT.getScalarType();
14017 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14018     assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14019 SDValue NewAddr =
14020 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14021
14022 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14023 // than MOVDDUP.
14024 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14025 if (Opcode == X86ISD::VBROADCAST) {
14026 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14027 SDValue Ops[] = {Ld->getChain(), NewAddr};
14028 V = DAG.getMemIntrinsicNode(
14029 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14030 DAG.getMachineFunction().getMachineMemOperand(
14031 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14032 DAG.makeEquivalentMemoryOrdering(Ld, V);
14033 return DAG.getBitcast(VT, V);
14034 }
14035     assert(SVT == MVT::f64 && "Unexpected VT!");
14036 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14037 DAG.getMachineFunction().getMachineMemOperand(
14038 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14039 DAG.makeEquivalentMemoryOrdering(Ld, V);
14040 } else if (!BroadcastFromReg) {
14041 // We can't broadcast from a vector register.
14042 return SDValue();
14043 } else if (BitOffset != 0) {
14044 // We can only broadcast from the zero-element of a vector register,
14045 // but it can be advantageous to broadcast from the zero-element of a
14046 // subvector.
14047 if (!VT.is256BitVector() && !VT.is512BitVector())
14048 return SDValue();
14049
14050 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14051 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14052 return SDValue();
14053
14054 // Only broadcast the zero-element of a 128-bit subvector.
14055 if ((BitOffset % 128) != 0)
14056 return SDValue();
14057
14058     assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14059            "Unexpected bit-offset");
14060     assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14061            "Unexpected vector size");
14062 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14063 V = extract128BitVector(V, ExtractIdx, DAG, DL);
14064 }
14065
14066 // On AVX we can use VBROADCAST directly for scalar sources.
14067 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14068 V = DAG.getBitcast(MVT::f64, V);
14069 if (Subtarget.hasAVX()) {
14070 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14071 return DAG.getBitcast(VT, V);
14072 }
14073 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14074 }
14075
14076 // If this is a scalar, do the broadcast on this type and bitcast.
14077 if (!V.getValueType().isVector()) {
14078     assert(V.getScalarValueSizeInBits() == NumEltBits &&
14079            "Unexpected scalar size");
14080 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14081 VT.getVectorNumElements());
14082 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14083 }
14084
14085 // We only support broadcasting from 128-bit vectors to minimize the
14086 // number of patterns we need to deal with in isel. So extract down to
14087 // 128-bits, removing as many bitcasts as possible.
14088 if (V.getValueSizeInBits() > 128)
14089 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14090
14091 // Otherwise cast V to a vector with the same element type as VT, but
14092 // possibly narrower than VT. Then perform the broadcast.
14093 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14094 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14095 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14096}
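
// [Editor's illustration -- not part of X86ISelLowering.cpp] The bit-offset
// walk above, evaluated for a hypothetical broadcast of element 5 of a v8i32
// built as concat_vectors(A, B) of two v4i32 halves.
#include <cassert>
int main() {
  const int NumEltBits = 32;
  int BitOffset = 5 * NumEltBits;           // 160 bits into the concatenated value
  const int OpBitWidth = 128;               // each concat operand is 128 bits wide
  const int OpIdx = BitOffset / OpBitWidth; // lands in operand 1, i.e. B
  BitOffset %= OpBitWidth;                  // 32 bits into B
  assert(OpIdx == 1 && BitOffset / NumEltBits == 1); // broadcast B's element 1
  return 0;
}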
14097
14098// Check for whether we can use INSERTPS to perform the shuffle. We only use
14099// INSERTPS when the V1 elements are already in the correct locations
14100// because otherwise we can just always use two SHUFPS instructions which
14101// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
14102// perform INSERTPS if a single V1 element is out of place and all V2
14103// elements are zeroable.
14104static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14105 unsigned &InsertPSMask,
14106 const APInt &Zeroable,
14107 ArrayRef<int> Mask, SelectionDAG &DAG) {
14108   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14109   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14110   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14111
14112 // Attempt to match INSERTPS with one element from VA or VB being
14113 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14114 // are updated.
14115 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
14116 ArrayRef<int> CandidateMask) {
14117 unsigned ZMask = 0;
14118 int VADstIndex = -1;
14119 int VBDstIndex = -1;
14120 bool VAUsedInPlace = false;
14121
14122 for (int i = 0; i < 4; ++i) {
14123 // Synthesize a zero mask from the zeroable elements (includes undefs).
14124 if (Zeroable[i]) {
14125 ZMask |= 1 << i;
14126 continue;
14127 }
14128
14129 // Flag if we use any VA inputs in place.
14130 if (i == CandidateMask[i]) {
14131 VAUsedInPlace = true;
14132 continue;
14133 }
14134
14135 // We can only insert a single non-zeroable element.
14136 if (VADstIndex >= 0 || VBDstIndex >= 0)
14137 return false;
14138
14139 if (CandidateMask[i] < 4) {
14140 // VA input out of place for insertion.
14141 VADstIndex = i;
14142 } else {
14143 // VB input for insertion.
14144 VBDstIndex = i;
14145 }
14146 }
14147
14148 // Don't bother if we have no (non-zeroable) element for insertion.
14149 if (VADstIndex < 0 && VBDstIndex < 0)
14150 return false;
14151
14152 // Determine element insertion src/dst indices. The src index is from the
14153 // start of the inserted vector, not the start of the concatenated vector.
14154 unsigned VBSrcIndex = 0;
14155 if (VADstIndex >= 0) {
14156 // If we have a VA input out of place, we use VA as the V2 element
14157 // insertion and don't use the original V2 at all.
14158 VBSrcIndex = CandidateMask[VADstIndex];
14159 VBDstIndex = VADstIndex;
14160 VB = VA;
14161 } else {
14162 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
14163 }
14164
14165 // If no V1 inputs are used in place, then the result is created only from
14166 // the zero mask and the V2 insertion - so remove V1 dependency.
14167 if (!VAUsedInPlace)
14168 VA = DAG.getUNDEF(MVT::v4f32);
14169
14170 // Update V1, V2 and InsertPSMask accordingly.
14171 V1 = VA;
14172 V2 = VB;
14173
14174 // Insert the V2 element into the desired position.
14175 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
14176     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
14177 return true;
14178 };
14179
14180 if (matchAsInsertPS(V1, V2, Mask))
14181 return true;
14182
14183 // Commute and try again.
14184 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
14185 ShuffleVectorSDNode::commuteMask(CommutedMask);
14186 if (matchAsInsertPS(V2, V1, CommutedMask))
14187 return true;
14188
14189 return false;
14190}
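
// [Editor's illustration -- not part of X86ISelLowering.cpp] The INSERTPS
// immediate layout built above (bits 7:6 source lane, bits 5:4 destination
// lane, bits 3:0 zero mask), for a hypothetical match that inserts V2 element
// 2 into destination element 1 and zeroes lanes 0 and 3.
#include <cassert>
int main() {
  const unsigned VBSrcIndex = 2, VBDstIndex = 1;
  const unsigned ZMask = 0x9; // zero destination lanes 0 and 3
  const unsigned InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
  assert(InsertPSMask == 0x99 && (InsertPSMask & ~0xFFu) == 0);
  return 0;
}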
14191
14192static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
14193 ArrayRef<int> Mask, const APInt &Zeroable,
14194 SelectionDAG &DAG) {
14195   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14196   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14197
14198 // Attempt to match the insertps pattern.
14199 unsigned InsertPSMask = 0;
14200 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
14201 return SDValue();
14202
14203 // Insert the V2 element into the desired position.
14204 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
14205 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
14206}
14207
14208/// Try to lower a shuffle as a permute of the inputs followed by an
14209/// UNPCK instruction.
14210///
14211/// This specifically targets cases where we end up with alternating between
14212/// the two inputs, and so can permute them into something that feeds a single
14213/// UNPCK instruction. Note that this routine only targets integer vectors
14214/// because for floating point vectors we have a generalized SHUFPS lowering
14215/// strategy that handles everything that doesn't *exactly* match an unpack,
14216/// making this clever lowering unnecessary.
14217static SDValue lowerShuffleAsPermuteAndUnpack(
14218 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14219 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14220   assert(!VT.isFloatingPoint() &&
14221          "This routine only supports integer vectors.");
14222   assert(VT.is128BitVector() &&
14223          "This routine only works on 128-bit vectors.");
14224   assert(!V2.isUndef() &&
14225          "This routine should only be used when blending two inputs.");
14226   assert(Mask.size() >= 2 && "Single element masks are invalid.");
14227
14228 int Size = Mask.size();
14229
14230 int NumLoInputs =
14231 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
14232 int NumHiInputs =
14233 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
14234
14235 bool UnpackLo = NumLoInputs >= NumHiInputs;
14236
14237 auto TryUnpack = [&](int ScalarSize, int Scale) {
14238 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
14239 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
14240
14241 for (int i = 0; i < Size; ++i) {
14242 if (Mask[i] < 0)
14243 continue;
14244
14245 // Each element of the unpack contains Scale elements from this mask.
14246 int UnpackIdx = i / Scale;
14247
14248 // We only handle the case where V1 feeds the first slots of the unpack.
14249 // We rely on canonicalization to ensure this is the case.
14250 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14251 return SDValue();
14252
14253 // Setup the mask for this input. The indexing is tricky as we have to
14254 // handle the unpack stride.
14255 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14256 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14257 Mask[i] % Size;
14258 }
14259
14260 // If we will have to shuffle both inputs to use the unpack, check whether
14261 // we can just unpack first and shuffle the result. If so, skip this unpack.
14262 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14263 !isNoopShuffleMask(V2Mask))
14264 return SDValue();
14265
14266 // Shuffle the inputs into place.
14267 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14268 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14269
14270 // Cast the inputs to the type we will use to unpack them.
14271 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14272 V1 = DAG.getBitcast(UnpackVT, V1);
14273 V2 = DAG.getBitcast(UnpackVT, V2);
14274
14275 // Unpack the inputs and cast the result back to the desired type.
14276 return DAG.getBitcast(
14277 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14278 UnpackVT, V1, V2));
14279 };
14280
14281 // We try each unpack from the largest to the smallest to try and find one
14282 // that fits this mask.
14283 int OrigScalarSize = VT.getScalarSizeInBits();
14284 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14285 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14286 return Unpack;
14287
14288 // If we're shuffling with a zero vector then we're better off not doing
14289 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14290 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14291 ISD::isBuildVectorAllZeros(V2.getNode()))
14292 return SDValue();
14293
14294 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14295 // initial unpack.
14296 if (NumLoInputs == 0 || NumHiInputs == 0) {
14297     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14298            "We have to have *some* inputs!");
14299 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14300
14301 // FIXME: We could consider the total complexity of the permute of each
14302 // possible unpacking. Or at the least we should consider how many
14303 // half-crossings are created.
14304 // FIXME: We could consider commuting the unpacks.
14305
14306 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14307 for (int i = 0; i < Size; ++i) {
14308 if (Mask[i] < 0)
14309 continue;
14310
14311       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14312
14313 PermMask[i] =
14314 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14315 }
14316 return DAG.getVectorShuffle(
14317 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14318 DL, VT, V1, V2),
14319 DAG.getUNDEF(VT), PermMask);
14320 }
14321
14322 return SDValue();
14323}
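
// [Editor's illustration -- not part of X86ISelLowering.cpp] The unpack widths
// tried above, from widest to narrowest, assuming a hypothetical v8i16 shuffle
// (OrigScalarSize = 16).
#include <cstdio>
int main() {
  const int OrigScalarSize = 16;
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    std::printf("try unpack with i%d lanes (Scale = %d)\n", ScalarSize,
                ScalarSize / OrigScalarSize);
  // Prints: i64 (Scale 4), i32 (Scale 2), i16 (Scale 1).
  return 0;
}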
14324
14325/// Handle lowering of 2-lane 64-bit floating point shuffles.
14326///
14327/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14328/// support for floating point shuffles but not integer shuffles. These
14329/// instructions will incur a domain crossing penalty on some chips though so
14330/// it is better to avoid lowering through this for integer vectors where
14331/// possible.
14332static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14333 const APInt &Zeroable, SDValue V1, SDValue V2,
14334 const X86Subtarget &Subtarget,
14335 SelectionDAG &DAG) {
14336   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14337   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14338   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14339
14340 if (V2.isUndef()) {
14341 // Check for being able to broadcast a single element.
14342 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14343 Mask, Subtarget, DAG))
14344 return Broadcast;
14345
14346 // Straight shuffle of a single input vector. Simulate this by using the
14347 // single input as both of the "inputs" to this instruction..
14348 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14349
14350 if (Subtarget.hasAVX()) {
14351 // If we have AVX, we can use VPERMILPS which will allow folding a load
14352 // into the shuffle.
14353 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14354 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14355 }
14356
14357 return DAG.getNode(
14358 X86ISD::SHUFP, DL, MVT::v2f64,
14359 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14360 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14361 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14362 }
14363   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14364   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14365   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14366   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14367
14368 if (Subtarget.hasAVX2())
14369 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14370 return Extract;
14371
14372 // When loading a scalar and then shuffling it into a vector we can often do
14373 // the insertion cheaply.
14374 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14375 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14376 return Insertion;
14377 // Try inverting the insertion since for v2 masks it is easy to do and we
14378 // can't reliably sort the mask one way or the other.
14379 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14380 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14381 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14382 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14383 return Insertion;
14384
14385 // Try to use one of the special instruction patterns to handle two common
14386 // blend patterns if a zero-blend above didn't work.
14387 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14388 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14389 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14390 // We can either use a special instruction to load over the low double or
14391 // to move just the low double.
14392 return DAG.getNode(
14393 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14394 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14395
14396 if (Subtarget.hasSSE41())
14397 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14398 Zeroable, Subtarget, DAG))
14399 return Blend;
14400
14401 // Use dedicated unpack instructions for masks that match their pattern.
14402 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14403 return V;
14404
14405 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14406 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14407 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14408}
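
// [Editor's illustration -- not part of X86ISelLowering.cpp] The two-input
// SHUFPD immediate computed above, where bit 0 picks V1's lane and bit 1 picks
// V2's lane, evaluated for the masks {1, 2} and {0, 3}.
#include <cassert>
static unsigned shufpdImm(int M0, int M1) {
  return (M0 == 1) | (((M1 - 2) == 1) << 1);
}
int main() {
  assert(shufpdImm(1, 2) == 1); // result is { V1[1], V2[0] }
  assert(shufpdImm(0, 3) == 2); // result is { V1[0], V2[1] }
  return 0;
}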
14409
14410/// Handle lowering of 2-lane 64-bit integer shuffles.
14411///
14412/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14413/// the integer unit to minimize domain crossing penalties. However, for blends
14414/// it falls back to the floating point shuffle operation with appropriate bit
14415/// casting.
14416static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14417 const APInt &Zeroable, SDValue V1, SDValue V2,
14418 const X86Subtarget &Subtarget,
14419 SelectionDAG &DAG) {
14420   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14421   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14422   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14423
14424 if (V2.isUndef()) {
14425 // Check for being able to broadcast a single element.
14426 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14427 Mask, Subtarget, DAG))
14428 return Broadcast;
14429
14430 // Straight shuffle of a single input vector. For everything from SSE2
14431 // onward this has a single fast instruction with no scary immediates.
14432 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14433 V1 = DAG.getBitcast(MVT::v4i32, V1);
14434 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14435 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14436 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14437 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14438 return DAG.getBitcast(
14439 MVT::v2i64,
14440 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14441 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14442 }
14443   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14444   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14445   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14446   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14447
14448 if (Subtarget.hasAVX2())
14449 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14450 return Extract;
14451
14452 // Try to use shift instructions.
14453 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14454 Zeroable, Subtarget, DAG))
14455 return Shift;
14456
14457 // When loading a scalar and then shuffling it into a vector we can often do
14458 // the insertion cheaply.
14459 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14460 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14461 return Insertion;
14462 // Try inverting the insertion since for v2 masks it is easy to do and we
14463 // can't reliably sort the mask one way or the other.
14464 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14465 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14466 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14467 return Insertion;
14468
14469 // We have different paths for blend lowering, but they all must use the
14470 // *exact* same predicate.
14471 bool IsBlendSupported = Subtarget.hasSSE41();
14472 if (IsBlendSupported)
14473 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14474 Zeroable, Subtarget, DAG))
14475 return Blend;
14476
14477 // Use dedicated unpack instructions for masks that match their pattern.
14478 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14479 return V;
14480
14481 // Try to use byte rotation instructions.
14482   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14483 if (Subtarget.hasSSSE3()) {
14484 if (Subtarget.hasVLX())
14485 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14486 Subtarget, DAG))
14487 return Rotate;
14488
14489 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14490 Subtarget, DAG))
14491 return Rotate;
14492 }
14493
14494 // If we have direct support for blends, we should lower by decomposing into
14495 // a permute. That will be faster than the domain cross.
14496 if (IsBlendSupported)
14497 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14498 Subtarget, DAG);
14499
14500 // We implement this with SHUFPD which is pretty lame because it will likely
14501 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14502 // However, all the alternatives are still more cycles and newer chips don't
14503 // have this problem. It would be really nice if x86 had better shuffles here.
14504 V1 = DAG.getBitcast(MVT::v2f64, V1);
14505 V2 = DAG.getBitcast(MVT::v2f64, V2);
14506 return DAG.getBitcast(MVT::v2i64,
14507 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14508}
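
// [Editor's illustration -- not part of X86ISelLowering.cpp] The v2i64 to
// v4i32 mask widening used in the single-input PSHUFD path above, shown for
// the element-swap mask {1, 0}.
#include <array>
#include <cassert>
int main() {
  const std::array<int, 2> Mask = {1, 0};
  std::array<int, 4> Widened;
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
  assert((Widened == std::array<int, 4>{2, 3, 0, 1}));
  return 0;
}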
14509
14510/// Lower a vector shuffle using the SHUFPS instruction.
14511///
14512/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14513/// It makes no assumptions about whether this is the *best* lowering, it simply
14514/// uses it.
14515static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14516 ArrayRef<int> Mask, SDValue V1,
14517 SDValue V2, SelectionDAG &DAG) {
14518 SDValue LowV = V1, HighV = V2;
14519 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14520 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14521
14522 if (NumV2Elements == 1) {
14523 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14524
14525 // Compute the index adjacent to V2Index and in the same half by toggling
14526 // the low bit.
14527 int V2AdjIndex = V2Index ^ 1;
14528
14529 if (Mask[V2AdjIndex] < 0) {
14530 // Handles all the cases where we have a single V2 element and an undef.
14531 // This will only ever happen in the high lanes because we commute the
14532 // vector otherwise.
14533 if (V2Index < 2)
14534 std::swap(LowV, HighV);
14535 NewMask[V2Index] -= 4;
14536 } else {
14537 // Handle the case where the V2 element ends up adjacent to a V1 element.
14538 // To make this work, blend them together as the first step.
14539 int V1Index = V2AdjIndex;
14540 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14541 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14542 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14543
14544 // Now proceed to reconstruct the final blend as we have the necessary
14545 // high or low half formed.
14546 if (V2Index < 2) {
14547 LowV = V2;
14548 HighV = V1;
14549 } else {
14550 HighV = V2;
14551 }
14552 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14553 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14554 }
14555 } else if (NumV2Elements == 2) {
14556 if (Mask[0] < 4 && Mask[1] < 4) {
14557 // Handle the easy case where we have V1 in the low lanes and V2 in the
14558 // high lanes.
14559 NewMask[2] -= 4;
14560 NewMask[3] -= 4;
14561 } else if (Mask[2] < 4 && Mask[3] < 4) {
14562 // We also handle the reversed case because this utility may get called
14563 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14564 // arrange things in the right direction.
14565 NewMask[0] -= 4;
14566 NewMask[1] -= 4;
14567 HighV = V1;
14568 LowV = V2;
14569 } else {
14570 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14571 // trying to place elements directly, just blend them and set up the final
14572 // shuffle to place them.
14573
14574 // The first two blend mask elements are for V1, the second two are for
14575 // V2.
14576 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14577 Mask[2] < 4 ? Mask[2] : Mask[3],
14578 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14579 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14580 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14581 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14582
14583 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14584 // a blend.
14585 LowV = HighV = V1;
14586 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14587 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14588 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14589 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14590 }
14591 } else if (NumV2Elements == 3) {
14592 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14593     // we can get here via other paths (e.g. repeated mask matching) where we
14594     // don't want to do another round of lowerVECTOR_SHUFFLE.
14595 ShuffleVectorSDNode::commuteMask(NewMask);
14596 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14597 }
14598 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14599 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14600}
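
// [Editor's illustration -- not part of X86ISelLowering.cpp] The easy
// NumV2Elements == 2 case above for mask {0, 2, 5, 7}: rebase the V2 indices
// and pack the standard two-bits-per-lane SHUFPS immediate (lanes 0-1 select
// from the first operand, lanes 2-3 from the second).
#include <cassert>
int main() {
  const int NewMask[4] = {0, 2, 5 - 4, 7 - 4};
  const unsigned Imm =
      NewMask[0] | NewMask[1] << 2 | NewMask[2] << 4 | NewMask[3] << 6;
  assert(Imm == 0xD8); // selects { V1[0], V1[2], V2[1], V2[3] }
  return 0;
}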
14601
14602/// Lower 4-lane 32-bit floating point shuffles.
14603///
14604/// Uses instructions exclusively from the floating point unit to minimize
14605/// domain crossing penalties, as these are sufficient to implement all v4f32
14606/// shuffles.
14607static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14608 const APInt &Zeroable, SDValue V1, SDValue V2,
14609 const X86Subtarget &Subtarget,
14610 SelectionDAG &DAG) {
14611   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14612   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14613   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14614
14615 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14616
14617 if (NumV2Elements == 0) {
14618 // Check for being able to broadcast a single element.
14619 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14620 Mask, Subtarget, DAG))
14621 return Broadcast;
14622
14623 // Use even/odd duplicate instructions for masks that match their pattern.
14624 if (Subtarget.hasSSE3()) {
14625 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14626 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14627 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14628 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14629 }
14630
14631 if (Subtarget.hasAVX()) {
14632 // If we have AVX, we can use VPERMILPS which will allow folding a load
14633 // into the shuffle.
14634 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14635 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14636 }
14637
14638 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14639 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14640 if (!Subtarget.hasSSE2()) {
14641 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14642 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14643 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14644 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14645 }
14646
14647 // Otherwise, use a straight shuffle of a single input vector. We pass the
14648 // input vector to both operands to simulate this with a SHUFPS.
14649 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14650 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14651 }
14652
14653 if (Subtarget.hasAVX2())
14654 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14655 return Extract;
14656
14657 // There are special ways we can lower some single-element blends. However, we
14658 // have custom ways we can lower more complex single-element blends below that
14659 // we defer to if both this and BLENDPS fail to match, so restrict this to
14660 // when the V2 input is targeting element 0 of the mask -- that is the fast
14661 // case here.
14662 if (NumV2Elements == 1 && Mask[0] >= 4)
14663 if (SDValue V = lowerShuffleAsElementInsertion(
14664 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14665 return V;
14666
14667 if (Subtarget.hasSSE41()) {
14668 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14669 Zeroable, Subtarget, DAG))
14670 return Blend;
14671
14672 // Use INSERTPS if we can complete the shuffle efficiently.
14673 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14674 return V;
14675
14676 if (!isSingleSHUFPSMask(Mask))
14677 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14678 V2, Mask, DAG))
14679 return BlendPerm;
14680 }
14681
14682 // Use low/high mov instructions. These are only valid in SSE1 because
14683 // otherwise they are widened to v2f64 and never get here.
14684 if (!Subtarget.hasSSE2()) {
14685 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14686 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14687 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14688 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14689 }
14690
14691 // Use dedicated unpack instructions for masks that match their pattern.
14692 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14693 return V;
14694
14695 // Otherwise fall back to a SHUFPS lowering strategy.
14696 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14697}
14698
14699/// Lower 4-lane i32 vector shuffles.
14700///
14701/// We try to handle these with integer-domain shuffles where we can, but for
14702/// blends we use the floating point domain blend instructions.
14703static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14704 const APInt &Zeroable, SDValue V1, SDValue V2,
14705 const X86Subtarget &Subtarget,
14706 SelectionDAG &DAG) {
14707   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14708   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14709   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14710
14711 // Whenever we can lower this as a zext, that instruction is strictly faster
14712 // than any alternative. It also allows us to fold memory operands into the
14713 // shuffle in many cases.
14714 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14715 Zeroable, Subtarget, DAG))
14716 return ZExt;
14717
14718 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14719
14720 if (NumV2Elements == 0) {
14721 // Try to use broadcast unless the mask only has one non-undef element.
14722 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14723 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14724 Mask, Subtarget, DAG))
14725 return Broadcast;
14726 }
14727
14728 // Straight shuffle of a single input vector. For everything from SSE2
14729 // onward this has a single fast instruction with no scary immediates.
14730 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14731 // but we aren't actually going to use the UNPCK instruction because doing
14732 // so prevents folding a load into this instruction or making a copy.
14733 const int UnpackLoMask[] = {0, 0, 1, 1};
14734 const int UnpackHiMask[] = {2, 2, 3, 3};
14735 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14736 Mask = UnpackLoMask;
14737 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14738 Mask = UnpackHiMask;
14739
14740 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14741 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14742 }
14743
14744 if (Subtarget.hasAVX2())
14745 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14746 return Extract;
14747
14748 // Try to use shift instructions.
14749 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14750 Zeroable, Subtarget, DAG))
14751 return Shift;
14752
14753 // There are special ways we can lower some single-element blends.
14754 if (NumV2Elements == 1)
14755 if (SDValue V = lowerShuffleAsElementInsertion(
14756 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14757 return V;
14758
14759 // We have different paths for blend lowering, but they all must use the
14760 // *exact* same predicate.
14761 bool IsBlendSupported = Subtarget.hasSSE41();
14762 if (IsBlendSupported)
14763 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14764 Zeroable, Subtarget, DAG))
14765 return Blend;
14766
14767 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14768 Zeroable, Subtarget, DAG))
14769 return Masked;
14770
14771 // Use dedicated unpack instructions for masks that match their pattern.
14772 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14773 return V;
14774
14775 // Try to use byte rotation instructions.
14776   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14777 if (Subtarget.hasSSSE3()) {
14778 if (Subtarget.hasVLX())
14779 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14780 Subtarget, DAG))
14781 return Rotate;
14782
14783 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14784 Subtarget, DAG))
14785 return Rotate;
14786 }
14787
14788 // Assume that a single SHUFPS is faster than an alternative sequence of
14789 // multiple instructions (even if the CPU has a domain penalty).
14790 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14791 if (!isSingleSHUFPSMask(Mask)) {
14792 // If we have direct support for blends, we should lower by decomposing into
14793 // a permute. That will be faster than the domain cross.
14794 if (IsBlendSupported)
14795 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14796 Subtarget, DAG);
14797
14798 // Try to lower by permuting the inputs into an unpack instruction.
14799 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14800 Mask, Subtarget, DAG))
14801 return Unpack;
14802 }
14803
14804 // We implement this with SHUFPS because it can blend from two vectors.
14805 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14806 // up the inputs, bypassing domain shift penalties that we would incur if we
14807 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14808 // relevant.
14809 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14810 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14811 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14812 return DAG.getBitcast(MVT::v4i32, ShufPS);
14813}
14814
14815/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14816/// shuffle lowering, and the most complex part.
14817///
14818/// The lowering strategy is to try to form pairs of input lanes which are
14819/// targeted at the same half of the final vector, and then use a dword shuffle
14820/// to place them onto the right half, and finally unpack the paired lanes into
14821/// their final position.
14822///
14823/// The exact breakdown of how to form these dword pairs and align them on the
14824/// correct sides is really tricky. See the comments within the function for
14825/// more of the details.
14826///
14827/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14828/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14829/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14830/// vector, form the analogous 128-bit 8-element Mask.
14831static SDValue lowerV8I16GeneralSingleInputShuffle(
14832 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14833 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14834   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14835 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14836
14837   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14838 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14839 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14840
14841 // Attempt to directly match PSHUFLW or PSHUFHW.
14842 if (isUndefOrInRange(LoMask, 0, 4) &&
14843 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14844 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14845 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14846 }
14847 if (isUndefOrInRange(HiMask, 4, 8) &&
14848 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14849 for (int i = 0; i != 4; ++i)
14850 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14851 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14852 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14853 }
14854
14855 SmallVector<int, 4> LoInputs;
14856 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14857 array_pod_sort(LoInputs.begin(), LoInputs.end());
14858 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14859 SmallVector<int, 4> HiInputs;
14860 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14861 array_pod_sort(HiInputs.begin(), HiInputs.end());
14862 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14863 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14864 int NumHToL = LoInputs.size() - NumLToL;
14865 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14866 int NumHToH = HiInputs.size() - NumLToH;
14867 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14868 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14869 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14870 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14871
14872 // If we are shuffling values from one half - check how many different DWORD
14873 // pairs we need to create. If only 1 or 2 then we can perform this as a
14874 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14875 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14876 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14877 V = DAG.getNode(ShufWOp, DL, VT, V,
14878 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14879 V = DAG.getBitcast(PSHUFDVT, V);
14880 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14881 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14882 return DAG.getBitcast(VT, V);
14883 };
14884
14885 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14886 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14887 SmallVector<std::pair<int, int>, 4> DWordPairs;
14888 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14889
14890 // Collect the different DWORD pairs.
14891 for (int DWord = 0; DWord != 4; ++DWord) {
14892 int M0 = Mask[2 * DWord + 0];
14893 int M1 = Mask[2 * DWord + 1];
14894 M0 = (M0 >= 0 ? M0 % 4 : M0);
14895 M1 = (M1 >= 0 ? M1 % 4 : M1);
14896 if (M0 < 0 && M1 < 0)
14897 continue;
14898
14899 bool Match = false;
14900 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14901 auto &DWordPair = DWordPairs[j];
14902 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14903 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14904 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14905 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14906 PSHUFDMask[DWord] = DOffset + j;
14907 Match = true;
14908 break;
14909 }
14910 }
14911 if (!Match) {
14912 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14913 DWordPairs.push_back(std::make_pair(M0, M1));
14914 }
14915 }
14916
14917 if (DWordPairs.size() <= 2) {
14918 DWordPairs.resize(2, std::make_pair(-1, -1));
14919 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14920 DWordPairs[1].first, DWordPairs[1].second};
14921 if ((NumHToL + NumHToH) == 0)
14922 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14923 if ((NumLToL + NumLToH) == 0)
14924 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14925 }
14926 }
14927
14928 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14929 // such inputs we can swap two of the dwords across the half mark and end up
14930 // with <=2 inputs to each half in each half. Once there, we can fall through
14931 // to the generic code below. For example:
14932 //
14933 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14934 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14935 //
14936 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14937 // and an existing 2-into-2 on the other half. In this case we may have to
14938 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14939 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14940 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14941 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14942 // half than the one we target for fixing) will be fixed when we re-enter this
14943 // path. We will also combine away any sequence of PSHUFD instructions that
14944 // result into a single instruction. Here is an example of the tricky case:
14945 //
14946 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14947 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14948 //
14949 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14950 //
14951 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14952 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14953 //
14954 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14955 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14956 //
14957 // The result is fine to be handled by the generic logic.
14958 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14959 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14960 int AOffset, int BOffset) {
14961 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&(static_cast<void> (0))
14962 "Must call this with A having 3 or 1 inputs from the A half.")(static_cast<void> (0));
14963 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&(static_cast<void> (0))
14964 "Must call this with B having 1 or 3 inputs from the B half.")(static_cast<void> (0));
14965 assert(AToAInputs.size() + BToAInputs.size() == 4 &&(static_cast<void> (0))
14966 "Must call this with either 3:1 or 1:3 inputs (summing to 4).")(static_cast<void> (0));
14967
14968 bool ThreeAInputs = AToAInputs.size() == 3;
14969
14970 // Compute the index of dword with only one word among the three inputs in
14971 // a half by taking the sum of the half with three inputs and subtracting
14972 // the sum of the actual three inputs. The difference is the remaining
14973 // slot.
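// Illustrative arithmetic: with TripleInputOffset == 4 and inputs {4, 5, 7},
// TripleInputSum = 0+1+2+3 + 16 = 22, so the missing slot is
// 22 - (4+5+7) = 6 and TripleDWord = 6 / 2 = 3.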
14974 int ADWord = 0, BDWord = 0;
14975 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14976 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14977 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14978 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14979 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14980 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14981 int TripleNonInputIdx =
14982 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14983 TripleDWord = TripleNonInputIdx / 2;
14984
14985 // We use xor with one to compute the adjacent DWord to whichever one the
14986 // OneInput is in.
14987 OneInputDWord = (OneInput / 2) ^ 1;
14988
14989 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14990 // and BToA inputs. If there is also such a problem with the BToB and AToB
14991 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14992 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14993 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14994 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14995 // Compute how many inputs will be flipped by swapping these DWords. We
14996 // need to balance this to ensure we don't form a 3-1 shuffle in the
14997 // other half.
14998
14999 int NumFlippedAToBInputs =
15000 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
15001 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
15002 int NumFlippedBToBInputs =
15003 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
15004 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
15005 if ((NumFlippedAToBInputs == 1 &&
15006 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15007 (NumFlippedBToBInputs == 1 &&
15008 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15009 // We choose whether to fix the A half or B half based on whether that
15010 // half has zero flipped inputs. At zero, we may not be able to fix it
15011 // with that half. We also bias towards fixing the B half because that
15012 // will more commonly be the high half, and we have to bias one way.
15013 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15014 ArrayRef<int> Inputs) {
15015 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15016 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15017 // Determine whether the free index is in the flipped dword or the
15018 // unflipped dword based on where the pinned index is. We use this bit
15019 // in an xor to conditionally select the adjacent dword.
15020 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15021 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15022 if (IsFixIdxInput == IsFixFreeIdxInput)
15023 FixFreeIdx += 1;
15024 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15025 assert(IsFixIdxInput != IsFixFreeIdxInput &&(static_cast<void> (0))
15026 "We need to be changing the number of flipped inputs!")(static_cast<void> (0));
15027 int PSHUFHalfMask[] = {0, 1, 2, 3};
15028 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15029 V = DAG.getNode(
15030 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15031 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15032 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15033
15034 for (int &M : Mask)
15035 if (M >= 0 && M == FixIdx)
15036 M = FixFreeIdx;
15037 else if (M >= 0 && M == FixFreeIdx)
15038 M = FixIdx;
15039 };
15040 if (NumFlippedBToBInputs != 0) {
15041 int BPinnedIdx =
15042 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15043 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15044 } else {
15045 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!")(static_cast<void> (0));
15046 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15047 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15048 }
15049 }
15050 }
15051
15052 int PSHUFDMask[] = {0, 1, 2, 3};
15053 PSHUFDMask[ADWord] = BDWord;
15054 PSHUFDMask[BDWord] = ADWord;
15055 V = DAG.getBitcast(
15056 VT,
15057 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15058 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15059
15060 // Adjust the mask to match the new locations of A and B.
15061 for (int &M : Mask)
15062 if (M >= 0 && M/2 == ADWord)
15063 M = 2 * BDWord + M % 2;
15064 else if (M >= 0 && M/2 == BDWord)
15065 M = 2 * ADWord + M % 2;
15066
15067 // Recurse back into this routine to re-compute state now that this isn't
15068 // a 3 and 1 problem.
15069 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15070 };
15071 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15072 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15073 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15074 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15075
15076 // At this point there are at most two inputs to the low and high halves from
15077 // each half. That means the inputs can always be grouped into dwords and
15078 // those dwords can then be moved to the correct half with a dword shuffle.
15079 // We use at most one low and one high word shuffle to collect these paired
15080 // inputs into dwords, and finally a dword shuffle to place them.
15081 int PSHUFLMask[4] = {-1, -1, -1, -1};
15082 int PSHUFHMask[4] = {-1, -1, -1, -1};
15083 int PSHUFDMask[4] = {-1, -1, -1, -1};
15084
15085 // First fix the masks for all the inputs that are staying in their
15086 // original halves. This will then dictate the targets of the cross-half
15087 // shuffles.
15088 auto fixInPlaceInputs =
15089 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15090 MutableArrayRef<int> SourceHalfMask,
15091 MutableArrayRef<int> HalfMask, int HalfOffset) {
15092 if (InPlaceInputs.empty())
15093 return;
15094 if (InPlaceInputs.size() == 1) {
15095 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15096 InPlaceInputs[0] - HalfOffset;
15097 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15098 return;
15099 }
15100 if (IncomingInputs.empty()) {
15101 // Just fix all of the in place inputs.
15102 for (int Input : InPlaceInputs) {
15103 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15104 PSHUFDMask[Input / 2] = Input / 2;
15105 }
15106 return;
15107 }
15108
15109 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!")(static_cast<void> (0));
15110 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15111 InPlaceInputs[0] - HalfOffset;
15112 // Put the second input next to the first so that they are packed into
15113 // a dword. We find the adjacent index by toggling the low bit.
15114 int AdjIndex = InPlaceInputs[0] ^ 1;
15115 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15116 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15117 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15118 };
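// Example of the two-input case above (illustrative): InPlaceInputs == {1, 2}
// with HalfOffset 0 keeps word 1 in slot 1, picks AdjIndex == 0, moves word 2
// into slot 0, and records in PSHUFDMask that dword 0 stays put.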
15119 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15120 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15121
15122 // Now gather the cross-half inputs and place them into a free dword of
15123 // their target half.
15124 // FIXME: This operation could almost certainly be simplified dramatically to
15125 // look more like the 3-1 fixing operation.
15126 auto moveInputsToRightHalf = [&PSHUFDMask](
15127 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15128 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15129 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15130 int DestOffset) {
15131 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15132 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15133 };
15134 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15135 int Word) {
15136 int LowWord = Word & ~1;
15137 int HighWord = Word | 1;
15138 return isWordClobbered(SourceHalfMask, LowWord) ||
15139 isWordClobbered(SourceHalfMask, HighWord);
15140 };
15141
15142 if (IncomingInputs.empty())
15143 return;
15144
15145 if (ExistingInputs.empty()) {
15146 // Map any dwords with inputs from them into the right half.
15147 for (int Input : IncomingInputs) {
15148 // If the source half mask maps over the inputs, turn those into
15149 // swaps and use the swapped lane.
15150 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15151 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15152 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15153 Input - SourceOffset;
15154 // We have to swap the uses in our half mask in one sweep.
15155 for (int &M : HalfMask)
15156 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15157 M = Input;
15158 else if (M == Input)
15159 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15160 } else {
15161 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==(static_cast<void> (0))
15162 Input - SourceOffset &&(static_cast<void> (0))
15163 "Previous placement doesn't match!")(static_cast<void> (0));
15164 }
15165 // Note that this correctly re-maps both when we do a swap and when
15166 // we observe the other side of the swap above. We rely on that to
15167 // avoid swapping the members of the input list directly.
15168 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15169 }
15170
15171 // Map the input's dword into the correct half.
15172 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15173 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15174 else
15175 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==(static_cast<void> (0))
15176 Input / 2 &&(static_cast<void> (0))
15177 "Previous placement doesn't match!")(static_cast<void> (0));
15178 }
15179
15180 // And just directly shift any other-half mask elements to be same-half
15181 // as we will have mirrored the dword containing the element into the
15182 // same position within that half.
15183 for (int &M : HalfMask)
15184 if (M >= SourceOffset && M < SourceOffset + 4) {
15185 M = M - SourceOffset + DestOffset;
15186 assert(M >= 0 && "This should never wrap below zero!")(static_cast<void> (0));
15187 }
15188 return;
15189 }
15190
15191 // Ensure we have the input in a viable dword of its current half. This
15192 // is particularly tricky because the original position may be clobbered
15193 // by inputs being moved and *staying* in that half.
15194 if (IncomingInputs.size() == 1) {
15195 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15196 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15197 SourceOffset;
15198 SourceHalfMask[InputFixed - SourceOffset] =
15199 IncomingInputs[0] - SourceOffset;
15200 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15201 InputFixed);
15202 IncomingInputs[0] = InputFixed;
15203 }
15204 } else if (IncomingInputs.size() == 2) {
15205 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15206 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15207 // We have two non-adjacent or clobbered inputs we need to extract from
15208 // the source half. To do this, we need to map them into some adjacent
15209 // dword slot in the source mask.
15210 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15211 IncomingInputs[1] - SourceOffset};
15212
15213 // If there is a free slot in the source half mask adjacent to one of
15214 // the inputs, place the other input in it. We use (Index XOR 1) to
15215 // compute an adjacent index.
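// For illustration: if InputsFixed == {0, 3} and slot 1 (0 ^ 1) is still
// free, word 3 is copied into slot 1 and InputsFixed becomes {0, 1}, i.e.
// the two inputs now occupy one adjacent dword of the source half.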
15216 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15217 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15218 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15219 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15220 InputsFixed[1] = InputsFixed[0] ^ 1;
15221 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15222 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15223 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15224 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15225 InputsFixed[0] = InputsFixed[1] ^ 1;
15226 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15227 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15228 // The two inputs are in the same DWord but it is clobbered and the
15229 // adjacent DWord isn't used at all. Move both inputs to the free
15230 // slot.
15231 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15232 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15233 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15234 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
15235 } else {
15236 // The only way we hit this point is if there is no clobbering
15237 // (because there are no off-half inputs to this half) and there is no
15238 // free slot adjacent to one of the inputs. In this case, we have to
15239 // swap an input with a non-input.
15240 for (int i = 0; i < 4; ++i)
15241 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&(static_cast<void> (0))
15242 "We can't handle any clobbers here!")(static_cast<void> (0));
15243 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&(static_cast<void> (0))
15244 "Cannot have adjacent inputs here!")(static_cast<void> (0));
15245
15246 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15247 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15248
15249 // We also have to update the final source mask in this case because
15250 // it may need to undo the above swap.
15251 for (int &M : FinalSourceHalfMask)
15252 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15253 M = InputsFixed[1] + SourceOffset;
15254 else if (M == InputsFixed[1] + SourceOffset)
15255 M = (InputsFixed[0] ^ 1) + SourceOffset;
15256
15257 InputsFixed[1] = InputsFixed[0] ^ 1;
15258 }
15259
15260 // Point everything at the fixed inputs.
15261 for (int &M : HalfMask)
15262 if (M == IncomingInputs[0])
15263 M = InputsFixed[0] + SourceOffset;
15264 else if (M == IncomingInputs[1])
15265 M = InputsFixed[1] + SourceOffset;
15266
15267 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15268 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15269 }
15270 } else {
15271 llvm_unreachable("Unhandled input size!")__builtin_unreachable();
15272 }
15273
15274 // Now hoist the DWord down to the right half.
15275 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15276 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free")(static_cast<void> (0));
15277 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15278 for (int &M : HalfMask)
15279 for (int Input : IncomingInputs)
15280 if (M == Input)
15281 M = FreeDWord * 2 + Input % 2;
15282 };
15283 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15284 /*SourceOffset*/ 4, /*DestOffset*/ 0);
15285 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15286 /*SourceOffset*/ 0, /*DestOffset*/ 4);
15287
15288 // Now enact all the shuffles we've computed to move the inputs into their
15289 // target half.
15290 if (!isNoopShuffleMask(PSHUFLMask))
15291 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15292 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15293 if (!isNoopShuffleMask(PSHUFHMask))
15294 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15295 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15296 if (!isNoopShuffleMask(PSHUFDMask))
15297 V = DAG.getBitcast(
15298 VT,
15299 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15300 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15301
15302 // At this point, each half should contain all its inputs, and we can then
15303 // just shuffle them into their final position.
15304 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&(static_cast<void> (0))
15305 "Failed to lift all the high half inputs to the low mask!")(static_cast<void> (0));
15306 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&(static_cast<void> (0))
15307 "Failed to lift all the low half inputs to the high mask!")(static_cast<void> (0));
15308
15309 // Do a half shuffle for the low mask.
15310 if (!isNoopShuffleMask(LoMask))
15311 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15312 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15313
15314 // Do a half shuffle with the high mask after shifting its values down.
15315 for (int &M : HiMask)
15316 if (M >= 0)
15317 M -= 4;
15318 if (!isNoopShuffleMask(HiMask))
15319 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15320 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15321
15322 return V;
15323}
15324
15325/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15326/// blend if only one input is used.
15327static SDValue lowerShuffleAsBlendOfPSHUFBs(
15328 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15329 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15330 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&(static_cast<void> (0))
15331 "Lane crossing shuffle masks not supported")(static_cast<void> (0));
15332
15333 int NumBytes = VT.getSizeInBits() / 8;
15334 int Size = Mask.size();
15335 int Scale = NumBytes / Size;
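// e.g. for a v8i16 blend NumBytes == 16, Size == 8 and Scale == 2, so each
// i16 mask element expands to two consecutive byte selectors; for v16i8
// Scale is 1.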
15336
15337 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15338 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15339 V1InUse = false;
15340 V2InUse = false;
15341
15342 for (int i = 0; i < NumBytes; ++i) {
15343 int M = Mask[i / Scale];
15344 if (M < 0)
15345 continue;
15346
15347 const int ZeroMask = 0x80;
15348 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15349 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15350 if (Zeroable[i / Scale])
15351 V1Idx = V2Idx = ZeroMask;
15352
15353 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15354 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15355 V1InUse |= (ZeroMask != V1Idx);
15356 V2InUse |= (ZeroMask != V2Idx);
15357 }
15358
15359 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15360 if (V1InUse)
15361 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15362 DAG.getBuildVector(ShufVT, DL, V1Mask));
15363 if (V2InUse)
15364 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15365 DAG.getBuildVector(ShufVT, DL, V2Mask));
15366
15367 // If we need shuffled inputs from both, blend the two.
15368 SDValue V;
15369 if (V1InUse && V2InUse)
15370 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15371 else
15372 V = V1InUse ? V1 : V2;
15373
15374 // Cast the result back to the correct type.
15375 return DAG.getBitcast(VT, V);
15376}
15377
15378/// Generic lowering of 8-lane i16 shuffles.
15379///
15380/// This handles both single-input shuffles and combined shuffle/blends with
15381/// two inputs. The single input shuffles are immediately delegated to
15382/// a dedicated lowering routine.
15383///
15384/// The blends are lowered in one of three fundamental ways. If there are few
15385/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15386/// of the input is significantly cheaper when lowered as an interleaving of
15387/// the two inputs, try to interleave them. Otherwise, blend the low and high
15388/// halves of the inputs separately (making them have relatively few inputs)
15389/// and then concatenate them.
15390static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15391 const APInt &Zeroable, SDValue V1, SDValue V2,
15392 const X86Subtarget &Subtarget,
15393 SelectionDAG &DAG) {
15394 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")(static_cast<void> (0));
15395 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")(static_cast<void> (0));
15396 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast<void> (0));
15397
15398 // Whenever we can lower this as a zext, that instruction is strictly faster
15399 // than any alternative.
15400 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15401 Zeroable, Subtarget, DAG))
15402 return ZExt;
15403
15404 // Try to lower using a truncation.
15405 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15406 Subtarget, DAG))
15407 return V;
15408
15409 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15410
15411 if (NumV2Inputs == 0) {
15412 // Try to use shift instructions.
15413 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15414 Zeroable, Subtarget, DAG))
15415 return Shift;
15416
15417 // Check for being able to broadcast a single element.
15418 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15419 Mask, Subtarget, DAG))
15420 return Broadcast;
15421
15422 // Try to use bit rotation instructions.
15423 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15424 Subtarget, DAG))
15425 return Rotate;
15426
15427 // Use dedicated unpack instructions for masks that match their pattern.
15428 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15429 return V;
15430
15431 // Use dedicated pack instructions for masks that match their pattern.
15432 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15433 Subtarget))
15434 return V;
15435
15436 // Try to use byte rotation instructions.
15437 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15438 Subtarget, DAG))
15439 return Rotate;
15440
15441 // Make a copy of the mask so it can be modified.
15442 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15443 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15444 Subtarget, DAG);
15445 }
15446
15447 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&(static_cast<void> (0))
15448 "All single-input shuffles should be canonicalized to be V1-input "(static_cast<void> (0))
15449 "shuffles.")(static_cast<void> (0));
15450
15451 // Try to use shift instructions.
15452 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15453 Zeroable, Subtarget, DAG))
15454 return Shift;
15455
15456 // See if we can use SSE4A Extraction / Insertion.
15457 if (Subtarget.hasSSE4A())
15458 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15459 Zeroable, DAG))
15460 return V;
15461
15462 // There are special ways we can lower some single-element blends.
15463 if (NumV2Inputs == 1)
15464 if (SDValue V = lowerShuffleAsElementInsertion(
15465 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15466 return V;
15467
15468 // We have different paths for blend lowering, but they all must use the
15469 // *exact* same predicate.
15470 bool IsBlendSupported = Subtarget.hasSSE41();
15471 if (IsBlendSupported)
15472 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15473 Zeroable, Subtarget, DAG))
15474 return Blend;
15475
15476 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15477 Zeroable, Subtarget, DAG))
15478 return Masked;
15479
15480 // Use dedicated unpack instructions for masks that match their pattern.
15481 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15482 return V;
15483
15484 // Use dedicated pack instructions for masks that match their pattern.
15485 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15486 Subtarget))
15487 return V;
15488
15489 // Try to lower using a truncation.
15490 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15491 Subtarget, DAG))
15492 return V;
15493
15494 // Try to use byte rotation instructions.
15495 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15496 Subtarget, DAG))
15497 return Rotate;
15498
15499 if (SDValue BitBlend =
15500 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15501 return BitBlend;
15502
15503 // Try to use byte shift instructions to mask.
15504 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15505 Zeroable, Subtarget, DAG))
15506 return V;
15507
15508 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15509 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15510 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
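// Illustrative case: Mask == [0,2,4,6,8,10,12,14] gives NumEvenDrops == 1;
// clearing the high 16 bits of every dword in V1 and V2 and a single
// PACKUSDW then produces exactly the even words of both inputs.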
15511 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15512 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15513 !Subtarget.hasVLX()) {
15514 SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15515 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15516 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15517 SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15518 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15519 DWordClearMask);
15520 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15521 DWordClearMask);
15522 // Now pack things back together.
15523 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15524 if (NumEvenDrops == 2) {
15525 Result = DAG.getBitcast(MVT::v4i32, Result);
15526 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15527 }
15528 return Result;
15529 }
15530
15531 // Try to lower by permuting the inputs into an unpack instruction.
15532 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15533 Mask, Subtarget, DAG))
15534 return Unpack;
15535
15536 // If we can't directly blend but can use PSHUFB, that will be better as it
15537 // can both shuffle and set up the inefficient blend.
15538 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15539 bool V1InUse, V2InUse;
15540 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15541 Zeroable, DAG, V1InUse, V2InUse);
15542 }
15543
15544 // We can always bit-blend if we have to so the fallback strategy is to
15545 // decompose into single-input permutes and blends/unpacks.
15546 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15547 Mask, Subtarget, DAG);
15548}
15549
15550/// Lower 8-lane 16-bit floating point shuffles.
15551static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15552 const APInt &Zeroable, SDValue V1, SDValue V2,
15553 const X86Subtarget &Subtarget,
15554 SelectionDAG &DAG) {
15555 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!")(static_cast<void> (0));
15556 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!")(static_cast<void> (0));
15557 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast<void> (0));
15558 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15559
15560 if (NumV2Elements == 0) {
15561 // Check for being able to broadcast a single element.
15562 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15563 Mask, Subtarget, DAG))
15564 return Broadcast;
15565 }
15566 if (NumV2Elements == 1 && Mask[0] >= 8)
15567 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask,
15568 Zeroable, Subtarget, DAG))
15569 return V;
15570
15571 V1 = DAG.getBitcast(MVT::v8i16, V1);
15572 V2 = DAG.getBitcast(MVT::v8i16, V2);
15573 return DAG.getBitcast(MVT::v8f16,
15574 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15575}
15576
15577// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
15578// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15579// the active subvector is extracted.
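// e.g. (illustrative) a v16i8 shuffle on AVX512VBMI without VLX: V1 and V2
// are widened to v64i8, mask entries referring to V2 are rebased by
// (Scale - 1) * NumElts == 48 so that element j of V2 becomes index 64 + j
// in the widened VPERMV3, and the low 128 bits are extracted at the end.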
15580static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15581 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15582 const X86Subtarget &Subtarget,
15583 SelectionDAG &DAG) {
15584 MVT MaskVT = VT.changeTypeToInteger();
15585 SDValue MaskNode;
15586 MVT ShuffleVT = VT;
15587 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15588 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15589 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15590 ShuffleVT = V1.getSimpleValueType();
15591
15592 // Adjust mask to correct indices for the second input.
15593 int NumElts = VT.getVectorNumElements();
15594 unsigned Scale = 512 / VT.getSizeInBits();
15595 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15596 for (int &M : AdjustedMask)
15597 if (NumElts <= M)
15598 M += (Scale - 1) * NumElts;
15599 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15600 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15601 } else {
15602 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15603 }
15604
15605 SDValue Result;
15606 if (V2.isUndef())
15607 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15608 else
15609 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15610
15611 if (VT != ShuffleVT)
15612 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15613
15614 return Result;
15615}
15616
15617/// Generic lowering of v16i8 shuffles.
15618///
15619/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15620/// detect any complexity reducing interleaving. If that doesn't help, it uses
15621/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15622/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15623/// back together.
15624static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15625 const APInt &Zeroable, SDValue V1, SDValue V2,
15626 const X86Subtarget &Subtarget,
15627 SelectionDAG &DAG) {
15628 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")(static_cast<void> (0));
15629 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")(static_cast<void> (0));
15630 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")(static_cast<void> (0));
15631
15632 // Try to use shift instructions.
15633 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15634 Zeroable, Subtarget, DAG))
15635 return Shift;
15636
15637 // Try to use byte rotation instructions.
15638 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15639 Subtarget, DAG))
15640 return Rotate;
15641
15642 // Use dedicated pack instructions for masks that match their pattern.
15643 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15644 Subtarget))
15645 return V;
15646
15647 // Try to use a zext lowering.
15648 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15649 Zeroable, Subtarget, DAG))
15650 return ZExt;
15651
15652 // Try to lower using a truncation.
15653 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15654 Subtarget, DAG))
15655 return V;
15656
15657 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15658 Subtarget, DAG))
15659 return V;
15660
15661 // See if we can use SSE4A Extraction / Insertion.
15662 if (Subtarget.hasSSE4A())
15663 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15664 Zeroable, DAG))
15665 return V;
15666
15667 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15668
15669 // For single-input shuffles, there are some nicer lowering tricks we can use.
15670 if (NumV2Elements == 0) {
15671 // Check for being able to broadcast a single element.
15672 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15673 Mask, Subtarget, DAG))
15674 return Broadcast;
15675
15676 // Try to use bit rotation instructions.
15677 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15678 Subtarget, DAG))
15679 return Rotate;
15680
15681 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15682 return V;
15683
15684 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15685 // Notably, this handles splat and partial-splat shuffles more efficiently.
15686 // However, it only makes sense if the pre-duplication shuffle simplifies
15687 // things significantly. Currently, this means we need to be able to
15688 // express the pre-duplication shuffle as an i16 shuffle.
15689 //
15690 // FIXME: We should check for other patterns which can be widened into an
15691 // i16 shuffle as well.
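// For instance, a mask such as [4,4,6,6,0,0,2,2,4,4,6,6,0,0,2,2] passes the
// check below because every byte pair reads a single source byte, so the
// whole shuffle can be expressed on i16 elements once the bytes are
// duplicated.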
15692 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15693 for (int i = 0; i < 16; i += 2)
15694 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15695 return false;
15696
15697 return true;
15698 };
15699 auto tryToWidenViaDuplication = [&]() -> SDValue {
15700 if (!canWidenViaDuplication(Mask))
15701 return SDValue();
15702 SmallVector<int, 4> LoInputs;
15703 copy_if(Mask, std::back_inserter(LoInputs),
15704 [](int M) { return M >= 0 && M < 8; });
15705 array_pod_sort(LoInputs.begin(), LoInputs.end());
15706 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15707 LoInputs.end());
15708 SmallVector<int, 4> HiInputs;
15709 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15710 array_pod_sort(HiInputs.begin(), HiInputs.end());
15711 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15712 HiInputs.end());
15713
15714 bool TargetLo = LoInputs.size() >= HiInputs.size();
15715 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15716 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15717
15718 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15719 SmallDenseMap<int, int, 8> LaneMap;
15720 for (int I : InPlaceInputs) {
15721 PreDupI16Shuffle[I/2] = I/2;
15722 LaneMap[I] = I;
15723 }
15724 int j = TargetLo ? 0 : 4, je = j + 4;
15725 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15726 // Check if j is already a shuffle of this input. This happens when
15727 // there are two adjacent bytes after we move the low one.
15728 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15729 // If we haven't yet mapped the input, search for a slot into which
15730 // we can map it.
15731 while (j < je && PreDupI16Shuffle[j] >= 0)
15732 ++j;
15733
15734 if (j == je)
15735 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15736 return SDValue();
15737
15738 // Map this input with the i16 shuffle.
15739 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15740 }
15741
15742 // Update the lane map based on the mapping we ended up with.
15743 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15744 }
15745 V1 = DAG.getBitcast(
15746 MVT::v16i8,
15747 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15748 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15749
15750 // Unpack the bytes to form the i16s that will be shuffled into place.
15751 bool EvenInUse = false, OddInUse = false;
15752 for (int i = 0; i < 16; i += 2) {
15753 EvenInUse |= (Mask[i + 0] >= 0);
15754 OddInUse |= (Mask[i + 1] >= 0);
15755 if (EvenInUse && OddInUse)
15756 break;
15757 }
15758 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15759 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15760 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15761
15762 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15763 for (int i = 0; i < 16; ++i)
15764 if (Mask[i] >= 0) {
15765 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15766 assert(MappedMask < 8 && "Invalid v8 shuffle mask!")(static_cast<void> (0));
15767 if (PostDupI16Shuffle[i / 2] < 0)
15768 PostDupI16Shuffle[i / 2] = MappedMask;
15769 else
15770 assert(PostDupI16Shuffle[i / 2] == MappedMask &&(static_cast<void> (0))
15771 "Conflicting entries in the original shuffle!")(static_cast<void> (0));
15772 }
15773 return DAG.getBitcast(
15774 MVT::v16i8,
15775 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15776 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15777 };
15778 if (SDValue V = tryToWidenViaDuplication())
15779 return V;
15780 }
15781
15782 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15783 Zeroable, Subtarget, DAG))
15784 return Masked;
15785
15786 // Use dedicated unpack instructions for masks that match their pattern.
15787 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15788 return V;
15789
15790 // Try to use byte shift instructions to mask.
15791 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15792 Zeroable, Subtarget, DAG))
15793 return V;
15794
15795 // Check for compaction patterns.
15796 bool IsSingleInput = V2.isUndef();
15797 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15798
15799 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15800 // with PSHUFB. It is important to do this before we attempt to generate any
15801 // blends but after all of the single-input lowerings. If the single input
15802 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15803 // want to preserve that and we can DAG combine any longer sequences into
15804 // a PSHUFB in the end. But once we start blending from multiple inputs,
15805 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15806 // and there are *very* few patterns that would actually be faster than the
15807 // PSHUFB approach because of its ability to zero lanes.
15808 //
15809 // If the mask is a binary compaction, we can more efficiently perform this
15810 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15811 //
15812 // FIXME: The only exceptions to the above are blends which are exact
15813 // interleavings with direct instructions supporting them. We currently don't
15814 // handle those well here.
15815 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15816 bool V1InUse = false;
15817 bool V2InUse = false;
15818
15819 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15820 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15821
15822 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15823 // do so. This avoids using them to handle blends-with-zero which is
15824 // important as a single pshufb is significantly faster for that.
15825 if (V1InUse && V2InUse) {
15826 if (Subtarget.hasSSE41())
15827 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15828 Zeroable, Subtarget, DAG))
15829 return Blend;
15830
15831 // We can use an unpack to do the blending rather than an or in some
15832 // cases. Even though the or may be (very slightly) more efficient, we
15833 // prefer this lowering because there are common cases where part of
15834 // the complexity of the shuffles goes away when we do the final blend as
15835 // an unpack.
15836 // FIXME: It might be worth trying to detect if the unpack-feeding
15837 // shuffles will both be pshufb, in which case we shouldn't bother with
15838 // this.
15839 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15840 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15841 return Unpack;
15842
15843 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15844 if (Subtarget.hasVBMI())
15845 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15846 DAG);
15847
15848 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15849 if (Subtarget.hasXOP()) {
15850 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15851 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15852 }
15853
15854 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15855 // PALIGNR will be cheaper than the second PSHUFB+OR.
15856 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15857 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15858 return V;
15859 }
15860
15861 return PSHUFB;
15862 }
15863
15864 // There are special ways we can lower some single-element blends.
15865 if (NumV2Elements == 1)
15866 if (SDValue V = lowerShuffleAsElementInsertion(
15867 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15868 return V;
15869
15870 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15871 return Blend;
15872
15873 // Check whether a compaction lowering can be done. This handles shuffles
15874 // which take every Nth element for some even N. See the helper function for
15875 // details.
15876 //
15877 // We special case these as they can be particularly efficiently handled with
15878 // the PACKUSWB instruction on x86 and they show up in common patterns of
15879 // rearranging bytes to truncate wide elements.
15880 if (NumEvenDrops) {
15881 // NumEvenDrops is the power of two stride of the elements. Another way of
15882 // thinking about it is that we need to drop the even elements this many
15883 // times to get the original input.
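// Illustrative case: a mask taking every fourth byte (stride 4) has
// NumEvenDrops == 2, so we clear all but byte 0 of each dword and apply
// PACKUS twice to compact the result.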
15884
15885 // First we need to zero all the dropped bytes.
15886 assert(NumEvenDrops <= 3 &&(static_cast<void> (0))
15887 "No support for dropping even elements more than 3 times.")(static_cast<void> (0));
15888 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15889 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15890 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15891 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15892 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15893 WordClearMask);
15894 if (!IsSingleInput)
15895 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15896 WordClearMask);
15897
15898 // Now pack things back together.
15899 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15900 IsSingleInput ? V1 : V2);
15901 for (int i = 1; i < NumEvenDrops; ++i) {
15902 Result = DAG.getBitcast(MVT::v8i16, Result);
15903 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15904 }
15905 return Result;
15906 }
15907
15908 // Handle multi-input cases by blending/unpacking single-input shuffles.
15909 if (NumV2Elements > 0)
15910 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15911 Subtarget, DAG);
15912
15913 // The fallback path for single-input shuffles widens this into two v8i16
15914 // vectors with unpacks, shuffles those, and then pulls them back together
15915 // with a pack.
15916 SDValue V = V1;
15917
15918 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15919 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15920 for (int i = 0; i < 16; ++i)
15921 if (Mask[i] >= 0)
15922 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15923
15924 SDValue VLoHalf, VHiHalf;
15925 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15926 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15927 // i16s.
15928 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15929 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15930 // Use a mask to drop the high bytes.
15931 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15932 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15933 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15934
15935 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15936 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15937
15938 // Squash the masks to point directly into VLoHalf.
15939 for (int &M : LoBlendMask)
15940 if (M >= 0)
15941 M /= 2;
15942 for (int &M : HiBlendMask)
15943 if (M >= 0)
15944 M /= 2;
15945 } else {
15946 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15947 // VHiHalf so that we can blend them as i16s.
15948 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15949
15950 VLoHalf = DAG.getBitcast(
15951 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15952 VHiHalf = DAG.getBitcast(
15953 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15954 }
15955
15956 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15957 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15958
15959 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15960}
15961
15962/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15963///
15964/// This routine breaks down the specific type of 128-bit shuffle and
15965/// dispatches to the lowering routines accordingly.
15966static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15967 MVT VT, SDValue V1, SDValue V2,
15968 const APInt &Zeroable,
15969 const X86Subtarget &Subtarget,
15970 SelectionDAG &DAG) {
15971 switch (VT.SimpleTy) {
15972 case MVT::v2i64:
15973 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15974 case MVT::v2f64:
15975 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15976 case MVT::v4i32:
15977 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15978 case MVT::v4f32:
15979 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15980 case MVT::v8i16:
15981 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15982 case MVT::v8f16:
15983 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15984 case MVT::v16i8:
15985 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15986
15987 default:
15988 llvm_unreachable("Unimplemented!")__builtin_unreachable();
15989 }
15990}
15991
15992/// Generic routine to split vector shuffle into half-sized shuffles.
15993///
15994/// This routine just extracts two subvectors, shuffles them independently, and
15995/// then concatenates them back together. This should work effectively with all
15996/// AVX vector shuffle types.
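/// For example, each 128-bit half of a v8f32 mask is lowered as its own
/// v4f32 shuffle/blend and the two results are rejoined with CONCAT_VECTORS.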
15997static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15998 SDValue V2, ArrayRef<int> Mask,
15999 SelectionDAG &DAG) {
16000 assert(VT.getSizeInBits() >= 256 &&(static_cast<void> (0))
16001 "Only for 256-bit or wider vector shuffles!")(static_cast<void> (0));
16002 assert(V1.getSimpleValueType() == VT && "Bad operand type!")(static_cast<void> (0));
16003 assert(V2.getSimpleValueType() == VT && "Bad operand type!")(static_cast<void> (0));
16004
16005 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16006 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16007
16008 int NumElements = VT.getVectorNumElements();
16009 int SplitNumElements = NumElements / 2;
16010 MVT ScalarVT = VT.getVectorElementType();
16011 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16012
16013 // Use splitVector/extractSubVector so that split build-vectors just build two
16014 // narrower build vectors. This helps shuffling with splats and zeros.
16015 auto SplitVector = [&](SDValue V) {
16016 SDValue LoV, HiV;
16017 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16018 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16019 DAG.getBitcast(SplitVT, HiV));
16020 };
16021
16022 SDValue LoV1, HiV1, LoV2, HiV2;
16023 std::tie(LoV1, HiV1) = SplitVector(V1);
16024 std::tie(LoV2, HiV2) = SplitVector(V2);
16025
16026 // Now create two 4-way blends of these half-width vectors.
16027 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16028 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16029 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16030 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16031 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16032 for (int i = 0; i < SplitNumElements; ++i) {
16033 int M = HalfMask[i];
16034 if (M >= NumElements) {
16035 if (M >= NumElements + SplitNumElements)
16036 UseHiV2 = true;
16037 else
16038 UseLoV2 = true;
16039 V2BlendMask[i] = M - NumElements;
16040 BlendMask[i] = SplitNumElements + i;
16041 } else if (M >= 0) {
16042 if (M >= SplitNumElements)
16043 UseHiV1 = true;
16044 else
16045 UseLoV1 = true;
16046 V1BlendMask[i] = M;
16047 BlendMask[i] = i;
16048 }
16049 }
16050
16051 // Because the lowering happens after all combining takes place, we need to
16052 // manually combine these blend masks as much as possible so that we create
16053 // a minimal number of high-level vector shuffle nodes.
16054
16055 // First try just blending the halves of V1 or V2.
16056 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16057 return DAG.getUNDEF(SplitVT);
16058 if (!UseLoV2 && !UseHiV2)
16059 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16060 if (!UseLoV1 && !UseHiV1)
16061 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16062
16063 SDValue V1Blend, V2Blend;
16064 if (UseLoV1 && UseHiV1) {
16065 V1Blend =
16066 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16067 } else {
16068 // We only use half of V1 so map the usage down into the final blend mask.
16069 V1Blend = UseLoV1 ? LoV1 : HiV1;
16070 for (int i = 0; i < SplitNumElements; ++i)
16071 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16072 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16073 }
16074 if (UseLoV2 && UseHiV2) {
16075 V2Blend =
16076 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16077 } else {
16078 // We only use half of V2 so map the usage down into the final blend mask.
16079 V2Blend = UseLoV2 ? LoV2 : HiV2;
16080 for (int i = 0; i < SplitNumElements; ++i)
16081 if (BlendMask[i] >= SplitNumElements)
16082 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16083 }
16084 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16085 };
16086 SDValue Lo = HalfBlend(LoMask);
16087 SDValue Hi = HalfBlend(HiMask);
16088 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16089}
16090
16091/// Either split a vector in halves or decompose the shuffles and the
16092/// blend/unpack.
16093///
16094/// This is provided as a good fallback for many lowerings of non-single-input
16095/// shuffles with more than one 128-bit lane. In those cases, we want to select
16096/// between splitting the shuffle into 128-bit components and stitching those
16097/// back together vs. extracting the single-input shuffles and blending those
16098/// results.
16099static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16100 SDValue V2, ArrayRef<int> Mask,
16101 const X86Subtarget &Subtarget,
16102 SelectionDAG &DAG) {
16103 assert(!V2.isUndef() && "This routine must not be used to lower single-input "(static_cast<void> (0))
16104 "shuffles as it could then recurse on itself.")(static_cast<void> (0));
16105 int Size = Mask.size();
16106
16107 // If this can be modeled as a broadcast of two elements followed by a blend,
16108 // prefer that lowering. This is especially important because broadcasts can
16109 // often fold with memory operands.
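// e.g. (illustrative, Size == 8): a mask [3,3,11,11,3,11,3,11] reads only
// element 3 of V1 and element 3 of V2, so the check below succeeds and we
// decompose into two single-element (broadcast) shuffles plus a blend.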
16110 auto DoBothBroadcast = [&] {
16111 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16112 for (int M : Mask)
16113 if (M >= Size) {
16114 if (V2BroadcastIdx < 0)
16115 V2BroadcastIdx = M - Size;
16116 else if (M - Size != V2BroadcastIdx)
16117 return false;
16118 } else if (M >= 0) {
16119 if (V1BroadcastIdx < 0)
16120 V1BroadcastIdx = M;
16121 else if (M != V1BroadcastIdx)
16122 return false;
16123 }
16124 return true;
16125 };
16126 if (DoBothBroadcast())
12. Taking false branch
16127 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16128 DAG);
16129
16130 // If the inputs all stem from a single 128-bit lane of each input, then we
16131 // split them rather than blending because the split will decompose to
16132 // unusually few instructions.
16133 int LaneCount = VT.getSizeInBits() / 128;
16134 int LaneSize = Size / LaneCount;
16135 SmallBitVector LaneInputs[2];
16136 LaneInputs[0].resize(LaneCount, false);
16137 LaneInputs[1].resize(LaneCount, false);
13. Calling 'SmallBitVector::resize'
20. Returned allocated memory
16138 for (int i = 0; i < Size; ++i)
21. Assuming 'i' is < 'Size'
22. Loop condition is true. Entering loop body
16139 if (Mask[i] >= 0)
23. Assuming the condition is true
24. Taking true branch
16140 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
25. Calling 'reference::operator='
16141 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16142 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16143
16144 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16145 // requires that the decomposed single-input shuffles don't end up here.
16146 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16147 DAG);
16148}
16149
16150// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16151// TODO: Extend to support v8f32 (+ 512-bit shuffles).
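// Illustrative mask construction for Mask == [2,4,1,7]: LHSMask becomes
// [2,-1,-1,1], RHSMask becomes [4,-1,-1,7] and SHUFPMask == 0b1100, so the
// final SHUFPD reassembles [2,4,1,7] from the two lane-shuffled operands.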
16152static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16153 SDValue V1, SDValue V2,
16154 ArrayRef<int> Mask,
16155 SelectionDAG &DAG) {
16156 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles")(static_cast<void> (0));
16157
16158 int LHSMask[4] = {-1, -1, -1, -1};
16159 int RHSMask[4] = {-1, -1, -1, -1};
16160 unsigned SHUFPMask = 0;
16161
16162 // As SHUFPD uses a single LHS/RHS element per lane, we can always
16163 // perform the shuffle once the lanes have been shuffled in place.
16164 for (int i = 0; i != 4; ++i) {
16165 int M = Mask[i];
16166 if (M < 0)
16167 continue;
16168 int LaneBase = i & ~1;
16169 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16170 LaneMask[LaneBase + (M & 1)] = M;
16171 SHUFPMask |= (M & 1) << i;
16172 }
16173
16174 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16175 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16176 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16177 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16178}
16179
16180/// Lower a vector shuffle crossing multiple 128-bit lanes as
16181/// a lane permutation followed by a per-lane permutation.
16182///
16183/// This is mainly for cases where we can have non-repeating permutes
16184/// in each lane.
16185///
16186 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask;
16187/// we should investigate merging them.
16188static SDValue lowerShuffleAsLanePermuteAndPermute(
16189 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16190 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16191 int NumElts = VT.getVectorNumElements();
16192 int NumLanes = VT.getSizeInBits() / 128;
16193 int NumEltsPerLane = NumElts / NumLanes;
16194 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
16195
16196 /// Attempts to find a sublane permute with the given size
16197 /// that gets all elements into their target lanes.
16198 ///
16199 /// If successful, builds the cross-lane and in-lane shuffles and returns the
16200 /// lowered value; if unsuccessful, returns an empty SDValue.
16201 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
16202 int NumSublanesPerLane = NumSublanes / NumLanes;
16203 int NumEltsPerSublane = NumElts / NumSublanes;
16204
16205 SmallVector<int, 16> CrossLaneMask;
16206 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
16207 // CrossLaneMask but one entry == one sublane.
16208 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
16209
16210 for (int i = 0; i != NumElts; ++i) {
16211 int M = Mask[i];
16212 if (M < 0)
16213 continue;
16214
16215 int SrcSublane = M / NumEltsPerSublane;
16216 int DstLane = i / NumEltsPerLane;
16217
16218 // We only need to get the elements into the right lane, not sublane.
16219 // So search all sublanes that make up the destination lane.
16220 bool Found = false;
16221 int DstSubStart = DstLane * NumSublanesPerLane;
16222 int DstSubEnd = DstSubStart + NumSublanesPerLane;
16223 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
16224 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16225 continue;
16226
16227 Found = true;
16228 CrossLaneMaskLarge[DstSublane] = SrcSublane;
16229 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16230 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16231 break;
16232 }
16233 if (!Found)
16234 return SDValue();
16235 }
16236
16237 // Fill CrossLaneMask using CrossLaneMaskLarge.
16238 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
16239
16240 if (!CanUseSublanes) {
16241 // If we're only shuffling a single lowest lane and the rest are identity
16242 // then don't bother.
16243 // TODO - isShuffleMaskInputInPlace could be extended to something like
16244 // this.
16245 int NumIdentityLanes = 0;
16246 bool OnlyShuffleLowestLane = true;
16247 for (int i = 0; i != NumLanes; ++i) {
16248 int LaneOffset = i * NumEltsPerLane;
16249 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
16250 i * NumEltsPerLane))
16251 NumIdentityLanes++;
16252 else if (CrossLaneMask[LaneOffset] != 0)
16253 OnlyShuffleLowestLane = false;
16254 }
16255 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16256 return SDValue();
16257 }
16258
16259 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16260 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16261 InLaneMask);
16262 };
16263
16264 // First attempt a solution with full lanes.
16265 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16266 return V;
16267
16268 // The rest of the solutions use sublanes.
16269 if (!CanUseSublanes)
16270 return SDValue();
16271
16272 // Then attempt a solution with 64-bit sublanes (vpermq).
16273 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16274 return V;
16275
16276 // If that doesn't work and we have fast variable cross-lane shuffle,
16277 // attempt 32-bit sublanes (vpermd).
16278 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16279 return SDValue();
16280
16281 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16282}
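Aside (not part of this file): the sketch below illustrates, as I read it, the expansion that narrowShuffleMaskElts is used for in getSublanePermute above - each per-sublane entry of CrossLaneMaskLarge becomes NumEltsPerSublane consecutive per-element entries. The sizes and mask values are assumptions chosen for the example.

#include <cstdio>
#include <vector>

int main() {
  const int NumEltsPerSublane = 2;                // assumed sublane width
  std::vector<int> CrossLaneMaskLarge = {1, 0};   // assumed: swap two sublanes
  std::vector<int> CrossLaneMask;
  for (int Sub : CrossLaneMaskLarge)
    for (int e = 0; e != NumEltsPerSublane; ++e)
      CrossLaneMask.push_back(Sub < 0 ? -1 : Sub * NumEltsPerSublane + e);
  for (int M : CrossLaneMask)
    std::printf("%d ", M);                        // prints: 2 3 0 1
  std::printf("\n");
  return 0;
}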
16283
16284/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16285/// source with a lane permutation.
16286///
16287/// This lowering strategy results in four instructions in the worst case for a
16288 /// single-input cross-lane shuffle, which is fewer than any other fully general
16289/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16290/// shuffle pattern should be handled prior to trying this lowering.
16291static SDValue lowerShuffleAsLanePermuteAndShuffle(
16292 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16293 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16294 // FIXME: This should probably be generalized for 512-bit vectors as well.
16295 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!")(static_cast<void> (0));
16296 int Size = Mask.size();
16297 int LaneSize = Size / 2;
16298
16299 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16300 // Only do this if the elements aren't all from the lower lane,
16301 // otherwise we're (probably) better off doing a split.
16302 if (VT == MVT::v4f64 &&
16303 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16304 if (SDValue V =
16305 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16306 return V;
16307
16308 // If there are only inputs from one 128-bit lane, splitting will in fact be
16309 // less expensive. The flags track whether the given lane contains an element
16310 // that crosses to another lane.
16311 if (!Subtarget.hasAVX2()) {
16312 bool LaneCrossing[2] = {false, false};
16313 for (int i = 0; i < Size; ++i)
16314 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16315 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16316 if (!LaneCrossing[0] || !LaneCrossing[1])
16317 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16318 } else {
16319 bool LaneUsed[2] = {false, false};
16320 for (int i = 0; i < Size; ++i)
16321 if (Mask[i] >= 0)
16322 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16323 if (!LaneUsed[0] || !LaneUsed[1])
16324 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16325 }
16326
16327 // TODO - we could support shuffling V2 in the Flipped input.
16328 assert(V2.isUndef() &&(static_cast<void> (0))
16329 "This last part of this routine only works on single input shuffles")(static_cast<void> (0));
16330
16331 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16332 for (int i = 0; i < Size; ++i) {
16333 int &M = InLaneMask[i];
16334 if (M < 0)
16335 continue;
16336 if (((M % Size) / LaneSize) != (i / LaneSize))
16337 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16338 }
16339 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&(static_cast<void> (0))
16340 "In-lane shuffle mask expected")(static_cast<void> (0));
16341
16342 // Flip the lanes, and shuffle the results which should now be in-lane.
16343 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16344 SDValue Flipped = DAG.getBitcast(PVT, V1);
16345 Flipped =
16346 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16347 Flipped = DAG.getBitcast(VT, Flipped);
16348 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16349}
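Aside (not part of this file): the standalone sketch below walks an assumed single-input v4f64 mask through the same in-lane remapping formula used above, redirecting lane-crossing elements (with the +Size offset) to the lane-swapped copy produced by the {2, 3, 0, 1} shuffle.

#include <cstdio>

int main() {
  const int Size = 4, LaneSize = 2;
  int Mask[Size] = {2, 1, 0, 3}; // assumed mask; elements 0 and 2 cross lanes
  int InLaneMask[Size];
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    InLaneMask[i] = M;
    if (M >= 0 && ((M % Size) / LaneSize) != (i / LaneSize))
      // Redirect lane-crossing elements to the flipped operand (+Size) so the
      // final shuffle stays within 128-bit lanes.
      InLaneMask[i] = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
  for (int i = 0; i < Size; ++i)
    std::printf("%d ", InLaneMask[i]); // prints: 4 1 6 3
  std::printf("\n");
  return 0;
}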
16350
16351/// Handle lowering 2-lane 128-bit shuffles.
16352static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16353 SDValue V2, ArrayRef<int> Mask,
16354 const APInt &Zeroable,
16355 const X86Subtarget &Subtarget,
16356 SelectionDAG &DAG) {
16357 if (V2.isUndef()) {
16358 // Attempt to match VBROADCAST*128 subvector broadcast load.
16359 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16360 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16361 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16362 MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
16363 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16364 if (!Ld->isNonTemporal()) {
16365 MVT MemVT = VT.getHalfNumVectorElementsVT();
16366 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16367 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
16368 SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
16369 TypeSize::Fixed(Ofs), DL);
16370 SDValue Ops[] = {Ld->getChain(), Ptr};
16371 SDValue BcastLd = DAG.getMemIntrinsicNode(
16372 X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
16373 DAG.getMachineFunction().getMachineMemOperand(
16374 Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
16375 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
16376 return BcastLd;
16377 }
16378 }
16379
16380 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16381 if (Subtarget.hasAVX2())
16382 return SDValue();
16383 }
16384
16385 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16386
16387 SmallVector<int, 4> WidenedMask;
16388 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16389 return SDValue();
16390
16391 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16392 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16393
16394 // Try to use an insert into a zero vector.
16395 if (WidenedMask[0] == 0 && IsHighZero) {
16396 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16397 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16398 DAG.getIntPtrConstant(0, DL));
16399 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16400 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16401 DAG.getIntPtrConstant(0, DL));
16402 }
16403
16404 // TODO: If minimizing size and one of the inputs is a zero vector and the
16405 // zero vector has only one use, we could use a VPERM2X128 to save the
16406 // instruction bytes needed to explicitly generate the zero vector.
16407
16408 // Blends are faster and handle all the non-lane-crossing cases.
16409 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16410 Subtarget, DAG))
16411 return Blend;
16412
16413 // If either input operand is a zero vector, use VPERM2X128 because its mask
16414 // allows us to replace the zero input with an implicit zero.
16415 if (!IsLowZero && !IsHighZero) {
16416 // Check for patterns which can be matched with a single insert of a 128-bit
16417 // subvector.
16418 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16419 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16420
16421 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16422 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16423 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16424 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16425 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16426 OnlyUsesV1 ? V1 : V2,
16427 DAG.getIntPtrConstant(0, DL));
16428 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16429 DAG.getIntPtrConstant(2, DL));
16430 }
16431 }
16432
16433 // Try to use SHUF128 if possible.
16434 if (Subtarget.hasVLX()) {
16435 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16436 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16437 ((WidenedMask[1] % 2) << 1);
16438 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16439 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16440 }
16441 }
16442 }
16443
16444 // Otherwise form a 128-bit permutation. After accounting for undefs,
16445 // convert the 64-bit shuffle mask selection values into 128-bit
16446 // selection bits by dividing the indexes by 2 and shifting into positions
16447 // defined by a vperm2*128 instruction's immediate control byte.
16448
16449 // The immediate permute control byte looks like this:
16450 // [1:0] - select 128 bits from sources for low half of destination
16451 // [2] - ignore
16452 // [3] - zero low half of destination
16453 // [5:4] - select 128 bits from sources for high half of destination
16454 // [6] - ignore
16455 // [7] - zero high half of destination
16456
16457 assert((WidenedMask[0] >= 0 || IsLowZero) &&(static_cast<void> (0))
16458 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?")(static_cast<void> (0));
16459
16460 unsigned PermMask = 0;
16461 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16462 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16463
16464 // Check the immediate mask and replace unused sources with undef.
16465 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16466 V1 = DAG.getUNDEF(VT);
16467 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16468 V2 = DAG.getUNDEF(VT);
16469
16470 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16471 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16472}
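Aside (not part of this file): the sketch below reproduces only the immediate construction documented above, for an assumed widened mask that selects V2's low half into the result's low half and zeroes the high half.

#include <cstdio>

int main() {
  // Assumed 128-bit halves: 0/1 = low/high of V1, 2/3 = low/high of V2.
  int WidenedMask[2] = {2, -1};
  bool IsLowZero = false, IsHighZero = true;

  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);  // bits [1:0] or [3]
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // bits [5:4] or [7]

  std::printf("vperm2*128 immediate: 0x%02x\n", PermMask); // prints 0x82
  return 0;
}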
16473
16474/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16475/// shuffling each lane.
16476///
16477/// This attempts to create a repeated lane shuffle where each lane uses one
16478/// or two of the lanes of the inputs. The lanes of the input vectors are
16479/// shuffled in one or two independent shuffles to get the lanes into the
16480/// position needed by the final shuffle.
16481static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16482 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16483 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16484 assert(!V2.isUndef() && "This is only useful with multiple inputs.")(static_cast<void> (0));
16485
16486 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16487 return SDValue();
16488
16489 int NumElts = Mask.size();
16490 int NumLanes = VT.getSizeInBits() / 128;
16491 int NumLaneElts = 128 / VT.getScalarSizeInBits();
16492 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16493 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16494
16495 // First pass will try to fill in the RepeatMask from lanes that need two
16496 // sources.
16497 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16498 int Srcs[2] = {-1, -1};
16499 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16500 for (int i = 0; i != NumLaneElts; ++i) {
16501 int M = Mask[(Lane * NumLaneElts) + i];
16502 if (M < 0)
16503 continue;
16504 // Determine which of the possible input lanes (NumLanes from each source)
16505 // this element comes from. Assign that as one of the sources for this
16506 // lane. We can assign up to 2 sources for this lane. If we run out
16507 // of sources, we can't do anything.
16508 int LaneSrc = M / NumLaneElts;
16509 int Src;
16510 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16511 Src = 0;
16512 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16513 Src = 1;
16514 else
16515 return SDValue();
16516
16517 Srcs[Src] = LaneSrc;
16518 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16519 }
16520
16521 // If this lane has two sources, see if it fits with the repeat mask so far.
16522 if (Srcs[1] < 0)
16523 continue;
16524
16525 LaneSrcs[Lane][0] = Srcs[0];
16526 LaneSrcs[Lane][1] = Srcs[1];
16527
16528 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16529 assert(M1.size() == M2.size() && "Unexpected mask size")(static_cast<void> (0));
16530 for (int i = 0, e = M1.size(); i != e; ++i)
16531 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16532 return false;
16533 return true;
16534 };
16535
16536 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16537 assert(Mask.size() == MergedMask.size() && "Unexpected mask size")(static_cast<void> (0));
16538 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16539 int M = Mask[i];
16540 if (M < 0)
16541 continue;
16542 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&(static_cast<void> (0))
16543 "Unexpected mask element")(static_cast<void> (0));
16544 MergedMask[i] = M;
16545 }
16546 };
16547
16548 if (MatchMasks(InLaneMask, RepeatMask)) {
16549 // Merge this lane mask into the final repeat mask.
16550 MergeMasks(InLaneMask, RepeatMask);
16551 continue;
16552 }
16553
16554 // Didn't find a match. Swap the operands and try again.
16555 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16556 ShuffleVectorSDNode::commuteMask(InLaneMask);
16557
16558 if (MatchMasks(InLaneMask, RepeatMask)) {
16559 // Merge this lane mask into the final repeat mask.
16560 MergeMasks(InLaneMask, RepeatMask);
16561 continue;
16562 }
16563
16564 // Couldn't find a match with the operands in either order.
16565 return SDValue();
16566 }
16567
16568 // Now handle any lanes with only one source.
16569 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16570 // If this lane has already been processed, skip it.
16571 if (LaneSrcs[Lane][0] >= 0)
16572 continue;
16573
16574 for (int i = 0; i != NumLaneElts; ++i) {
16575 int M = Mask[(Lane * NumLaneElts) + i];
16576 if (M < 0)
16577 continue;
16578
16579 // If RepeatMask isn't defined yet, we can define it ourselves.
16580 if (RepeatMask[i] < 0)
16581 RepeatMask[i] = M % NumLaneElts;
16582
16583 if (RepeatMask[i] < NumElts) {
16584 if (RepeatMask[i] != M % NumLaneElts)
16585 return SDValue();
16586 LaneSrcs[Lane][0] = M / NumLaneElts;
16587 } else {
16588 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16589 return SDValue();
16590 LaneSrcs[Lane][1] = M / NumLaneElts;
16591 }
16592 }
16593
16594 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16595 return SDValue();
16596 }
16597
16598 SmallVector<int, 16> NewMask(NumElts, -1);
16599 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16600 int Src = LaneSrcs[Lane][0];
16601 for (int i = 0; i != NumLaneElts; ++i) {
16602 int M = -1;
16603 if (Src >= 0)
16604 M = Src * NumLaneElts + i;
16605 NewMask[Lane * NumLaneElts + i] = M;
16606 }
16607 }
16608 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16609 // Ensure we didn't get back the shuffle we started with.
16610 // FIXME: This is a hack to make up for some splat handling code in
16611 // getVectorShuffle.
16612 if (isa<ShuffleVectorSDNode>(NewV1) &&
16613 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16614 return SDValue();
16615
16616 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16617 int Src = LaneSrcs[Lane][1];
16618 for (int i = 0; i != NumLaneElts; ++i) {
16619 int M = -1;
16620 if (Src >= 0)
16621 M = Src * NumLaneElts + i;
16622 NewMask[Lane * NumLaneElts + i] = M;
16623 }
16624 }
16625 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16626 // Ensure we didn't get back the shuffle we started with.
16627 // FIXME: This is a hack to make up for some splat handling code in
16628 // getVectorShuffle.
16629 if (isa<ShuffleVectorSDNode>(NewV2) &&
16630 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16631 return SDValue();
16632
16633 for (int i = 0; i != NumElts; ++i) {
16634 NewMask[i] = RepeatMask[i % NumLaneElts];
16635 if (NewMask[i] < 0)
16636 continue;
16637
16638 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16639 }
16640 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16641}
16642
16643/// If the input shuffle mask results in a vector that is undefined in all upper
16644/// or lower half elements and that mask accesses only 2 halves of the
16645/// shuffle's operands, return true. A mask of half the width with mask indexes
16646/// adjusted to access the extracted halves of the original shuffle operands is
16647/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16648/// lower half of each input operand is accessed.
16649static bool
16650getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16651 int &HalfIdx1, int &HalfIdx2) {
16652 assert((Mask.size() == HalfMask.size() * 2) &&(static_cast<void> (0))
16653 "Expected input mask to be twice as long as output")(static_cast<void> (0));
16654
16655 // Exactly one half of the result must be undef to allow narrowing.
16656 bool UndefLower = isUndefLowerHalf(Mask);
16657 bool UndefUpper = isUndefUpperHalf(Mask);
16658 if (UndefLower == UndefUpper)
16659 return false;
16660
16661 unsigned HalfNumElts = HalfMask.size();
16662 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16663 HalfIdx1 = -1;
16664 HalfIdx2 = -1;
16665 for (unsigned i = 0; i != HalfNumElts; ++i) {
16666 int M = Mask[i + MaskIndexOffset];
16667 if (M < 0) {
16668 HalfMask[i] = M;
16669 continue;
16670 }
16671
16672 // Determine which of the 4 half vectors this element is from.
16673 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16674 int HalfIdx = M / HalfNumElts;
16675
16676 // Determine the element index into its half vector source.
16677 int HalfElt = M % HalfNumElts;
16678
16679 // We can shuffle with up to 2 half vectors; set the new 'half'
16680 // shuffle mask accordingly.
16681 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16682 HalfMask[i] = HalfElt;
16683 HalfIdx1 = HalfIdx;
16684 continue;
16685 }
16686 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16687 HalfMask[i] = HalfElt + HalfNumElts;
16688 HalfIdx2 = HalfIdx;
16689 continue;
16690 }
16691
16692 // Too many half vectors referenced.
16693 return false;
16694 }
16695
16696 return true;
16697}
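Aside (not part of this file): the simplified standalone re-implementation below shows the indexing convention used above (half vectors numbered 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2) on one assumed v8 mask whose lower half is undef.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {-1, -1, -1, -1, 4, 12, 5, 13}; // assumed example
  int HalfNumElts = (int)Mask.size() / 2;
  int Offset = HalfNumElts; // lower half is undef, so read the upper half
  std::vector<int> HalfMask(HalfNumElts, -1);
  int HalfIdx1 = -1, HalfIdx2 = -1;

  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts; // which of the four half vectors
    int HalfElt = M % HalfNumElts; // index within that half
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfIdx1 = HalfIdx;
      HalfMask[i] = HalfElt;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfIdx2 = HalfIdx;
      HalfMask[i] = HalfElt + HalfNumElts;
    } // a third half vector would make the mask unsuitable, as above
  }
  std::printf("HalfIdx1=%d HalfIdx2=%d HalfMask={%d,%d,%d,%d}\n", HalfIdx1,
              HalfIdx2, HalfMask[0], HalfMask[1], HalfMask[2], HalfMask[3]);
  // Prints: HalfIdx1=1 HalfIdx2=3 HalfMask={0,4,1,5}
  return 0;
}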
16698
16699/// Given the output values from getHalfShuffleMask(), create a half width
16700/// shuffle of extracted vectors followed by an insert back to full width.
16701static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16702 ArrayRef<int> HalfMask, int HalfIdx1,
16703 int HalfIdx2, bool UndefLower,
16704 SelectionDAG &DAG, bool UseConcat = false) {
16705 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?")(static_cast<void> (0));
16706 assert(V1.getValueType().isSimple() && "Expecting only simple types")(static_cast<void> (0));
16707
16708 MVT VT = V1.getSimpleValueType();
16709 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16710 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16711
16712 auto getHalfVector = [&](int HalfIdx) {
16713 if (HalfIdx < 0)
16714 return DAG.getUNDEF(HalfVT);
16715 SDValue V = (HalfIdx < 2 ? V1 : V2);
16716 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16717 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16718 DAG.getIntPtrConstant(HalfIdx, DL));
16719 };
16720
16721 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16722 SDValue Half1 = getHalfVector(HalfIdx1);
16723 SDValue Half2 = getHalfVector(HalfIdx2);
16724 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16725 if (UseConcat) {
16726 SDValue Op0 = V;
16727 SDValue Op1 = DAG.getUNDEF(HalfVT);
16728 if (UndefLower)
16729 std::swap(Op0, Op1);
16730 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16731 }
16732
16733 unsigned Offset = UndefLower ? HalfNumElts : 0;
16734 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16735 DAG.getIntPtrConstant(Offset, DL));
16736}
16737
16738/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16739/// This allows for fast cases such as subvector extraction/insertion
16740/// or shuffling smaller vector types which can lower more efficiently.
16741static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16742 SDValue V2, ArrayRef<int> Mask,
16743 const X86Subtarget &Subtarget,
16744 SelectionDAG &DAG) {
16745 assert((VT.is256BitVector() || VT.is512BitVector()) &&(static_cast<void> (0))
16746 "Expected 256-bit or 512-bit vector")(static_cast<void> (0));
16747
16748 bool UndefLower = isUndefLowerHalf(Mask);
16749 if (!UndefLower && !isUndefUpperHalf(Mask))
16750 return SDValue();
16751
16752 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&(static_cast<void> (0))
16753 "Completely undef shuffle mask should have been simplified already")(static_cast<void> (0));
16754
16755 // Upper half is undef and lower half is whole upper subvector.
16756 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16757 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16758 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16759 if (!UndefLower &&
16760 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16761 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16762 DAG.getIntPtrConstant(HalfNumElts, DL));
16763 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16764 DAG.getIntPtrConstant(0, DL));
16765 }
16766
16767 // Lower half is undef and upper half is whole lower subvector.
16768 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16769 if (UndefLower &&
16770 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16771 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16772 DAG.getIntPtrConstant(0, DL));
16773 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16774 DAG.getIntPtrConstant(HalfNumElts, DL));
16775 }
16776
16777 int HalfIdx1, HalfIdx2;
16778 SmallVector<int, 8> HalfMask(HalfNumElts);
16779 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16780 return SDValue();
16781
16782 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length")(static_cast<void> (0));
16783
16784 // Only shuffle the halves of the inputs when useful.
16785 unsigned NumLowerHalves =
16786 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16787 unsigned NumUpperHalves =
16788 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16789 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed")(static_cast<void> (0));
16790
16791 // Determine the larger pattern of undef/halves, then decide if it's worth
16792 // splitting the shuffle based on subtarget capabilities and types.
16793 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16794 if (!UndefLower) {
16795 // XXXXuuuu: no insert is needed.
16796 // Always extract lowers when setting lower - these are all free subreg ops.
16797 if (NumUpperHalves == 0)
16798 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16799 UndefLower, DAG);
16800
16801 if (NumUpperHalves == 1) {
16802 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16803 if (Subtarget.hasAVX2()) {
16804 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16805 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16806 !is128BitUnpackShuffleMask(HalfMask) &&
16807 (!isSingleSHUFPSMask(HalfMask) ||
16808 Subtarget.hasFastVariableCrossLaneShuffle()))
16809 return SDValue();
16810 // If this is a unary shuffle (assume that the 2nd operand is
16811 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16812 // are better off extracting the upper half of 1 operand and using a
16813 // narrow shuffle.
16814 if (EltWidth == 64 && V2.isUndef())
16815 return SDValue();
16816 }
16817 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16818 if (Subtarget.hasAVX512() && VT.is512BitVector())
16819 return SDValue();
16820 // Extract + narrow shuffle is better than the wide alternative.
16821 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16822 UndefLower, DAG);
16823 }
16824
16825 // Don't extract both uppers; instead shuffle and then extract.
16826 assert(NumUpperHalves == 2 && "Half vector count went wrong")(static_cast<void> (0));
16827 return SDValue();
16828 }
16829
16830 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16831 if (NumUpperHalves == 0) {
16832 // AVX2 has efficient 64-bit element cross-lane shuffles.
16833 // TODO: Refine to account for unary shuffle, splat, and other masks?
16834 if (Subtarget.hasAVX2() && EltWidth == 64)
16835 return SDValue();
16836 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16837 if (Subtarget.hasAVX512() && VT.is512BitVector())
16838 return SDValue();
16839 // Narrow shuffle + insert is better than the wide alternative.
16840 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16841 UndefLower, DAG);
16842 }
16843
16844 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16845 return SDValue();
16846}
16847
16848/// Test whether the specified input (0 or 1) is in-place blended by the
16849/// given mask.
16850///
16851 /// This returns true if the elements from a particular input are already in
16852 /// the slots required by the given mask and so require no permutation.
16853static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16854 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.")(static_cast<void> (0));
16855 int Size = Mask.size();
16856 for (int i = 0; i < Size; ++i)
16857 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16858 return false;
16859
16860 return true;
16861}
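Aside (not part of this file): a standalone copy of this check applied to an assumed v4 mask shows the distinction - the referenced V1 elements sit in their original slots, while the V2 elements do not.

#include <cstdio>
#include <vector>

static bool inPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 5, 2, 4}; // assumed example mask
  std::printf("V1 in place: %d, V2 in place: %d\n", (int)inPlace(0, Mask),
              (int)inPlace(1, Mask));
  // Prints: V1 in place: 1, V2 in place: 0
  return 0;
}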
16862
16863 /// Handle the case where the shuffle sources come from the same 128-bit lane and
16864/// every lane can be represented as the same repeating mask - allowing us to
16865/// shuffle the sources with the repeating shuffle and then permute the result
16866/// to the destination lanes.
16867static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16868 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16869 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16870 int NumElts = VT.getVectorNumElements();
16871 int NumLanes = VT.getSizeInBits() / 128;
16872 int NumLaneElts = NumElts / NumLanes;
16873
16874 // On AVX2 we may be able to just shuffle the lowest elements and then
16875 // broadcast the result.
16876 if (Subtarget.hasAVX2()) {
16877 for (unsigned BroadcastSize : {16, 32, 64}) {
16878 if (BroadcastSize <= VT.getScalarSizeInBits())
16879 continue;
16880 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16881
16882 // Attempt to match a repeating pattern every NumBroadcastElts,
16883 // accounting for UNDEFs and referencing only the lowest 128-bit
16884 // lane of the inputs.
16885 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16886 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16887 for (int j = 0; j != NumBroadcastElts; ++j) {
16888 int M = Mask[i + j];
16889 if (M < 0)
16890 continue;
16891 int &R = RepeatMask[j];
16892 if (0 != ((M % NumElts) / NumLaneElts))
16893 return false;
16894 if (0 <= R && R != M)
16895 return false;
16896 R = M;
16897 }
16898 return true;
16899 };
16900
16901 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16902 if (!FindRepeatingBroadcastMask(RepeatMask))
16903 continue;
16904
16905 // Shuffle the (lowest) repeated elements in place for broadcast.
16906 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16907
16908 // Shuffle the actual broadcast.
16909 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16910 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16911 for (int j = 0; j != NumBroadcastElts; ++j)
16912 BroadcastMask[i + j] = j;
16913 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16914 BroadcastMask);
16915 }
16916 }
16917
16918 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16919 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16920 return SDValue();
16921
16922 // Bail if we already have a repeated lane shuffle mask.
16923 SmallVector<int, 8> RepeatedShuffleMask;
16924 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16925 return SDValue();
16926
16927 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16928 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16929 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16930 int NumSubLanes = NumLanes * SubLaneScale;
16931 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16932
16933 // Check that all the sources are coming from the same lane and see if we can
16934 // form a repeating shuffle mask (local to each sub-lane). At the same time,
16935 // determine the source sub-lane for each destination sub-lane.
16936 int TopSrcSubLane = -1;
16937 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16938 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16939 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16940 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16941
16942 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16943 // Extract the sub-lane mask, check that it all comes from the same lane
16944 // and normalize the mask entries to come from the first lane.
16945 int SrcLane = -1;
16946 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16947 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16948 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16949 if (M < 0)
16950 continue;
16951 int Lane = (M % NumElts) / NumLaneElts;
16952 if ((0 <= SrcLane) && (SrcLane != Lane))
16953 return SDValue();
16954 SrcLane = Lane;
16955 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16956 SubLaneMask[Elt] = LocalM;
16957 }
16958
16959 // Whole sub-lane is UNDEF.
16960 if (SrcLane < 0)
16961 continue;
16962
16963 // Attempt to match against the candidate repeated sub-lane masks.
16964 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16965 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16966 for (int i = 0; i != NumSubLaneElts; ++i) {
16967 if (M1[i] < 0 || M2[i] < 0)
16968 continue;
16969 if (M1[i] != M2[i])
16970 return false;
16971 }
16972 return true;
16973 };
16974
16975 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16976 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16977 continue;
16978
16979 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16980 for (int i = 0; i != NumSubLaneElts; ++i) {
16981 int M = SubLaneMask[i];
16982 if (M < 0)
16983 continue;
16984 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&(static_cast<void> (0))
16985 "Unexpected mask element")(static_cast<void> (0));
16986 RepeatedSubLaneMask[i] = M;
16987 }
16988
16989 // Track the topmost source sub-lane - by setting the remaining to UNDEF
16990 // we can greatly simplify shuffle matching.
16991 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16992 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16993 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16994 break;
16995 }
16996
16997 // Bail if we failed to find a matching repeated sub-lane mask.
16998 if (Dst2SrcSubLanes[DstSubLane] < 0)
16999 return SDValue();
17000 }
17001 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&(static_cast<void> (0))
17002 "Unexpected source lane")(static_cast<void> (0));
17003
17004 // Create a repeating shuffle mask for the entire vector.
17005 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17006 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17007 int Lane = SubLane / SubLaneScale;
17008 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17009 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17010 int M = RepeatedSubLaneMask[Elt];
17011 if (M < 0)
17012 continue;
17013 int Idx = (SubLane * NumSubLaneElts) + Elt;
17014 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17015 }
17016 }
17017 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17018
17019 // Shuffle each source sub-lane to its destination.
17020 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17021 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17022 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17023 if (SrcSubLane < 0)
17024 continue;
17025 for (int j = 0; j != NumSubLaneElts; ++j)
17026 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17027 }
17028
17029 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17030 SubLaneMask);
17031}
17032
17033static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17034 bool &ForceV1Zero, bool &ForceV2Zero,
17035 unsigned &ShuffleImm, ArrayRef<int> Mask,
17036 const APInt &Zeroable) {
17037 int NumElts = VT.getVectorNumElements();
17038 assert(VT.getScalarSizeInBits() == 64 &&(static_cast<void> (0))
17039 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&(static_cast<void> (0))
17040 "Unexpected data type for VSHUFPD")(static_cast<void> (0));
17041 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&(static_cast<void> (0))
17042 "Illegal shuffle mask")(static_cast<void> (0));
17043
17044 bool ZeroLane[2] = { true, true };
17045 for (int i = 0; i < NumElts; ++i)
17046 ZeroLane[i & 1] &= Zeroable[i];
17047
17048 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
17049 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
17050 ShuffleImm = 0;
17051 bool ShufpdMask = true;
17052 bool CommutableMask = true;
17053 for (int i = 0; i < NumElts; ++i) {
17054 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17055 continue;
17056 if (Mask[i] < 0)
17057 return false;
17058 int Val = (i & 6) + NumElts * (i & 1);
17059 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17060 if (Mask[i] < Val || Mask[i] > Val + 1)
17061 ShufpdMask = false;
17062 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17063 CommutableMask = false;
17064 ShuffleImm |= (Mask[i] % 2) << i;
17065 }
17066
17067 if (!ShufpdMask && !CommutableMask)
17068 return false;
17069
17070 if (!ShufpdMask && CommutableMask)
17071 std::swap(V1, V2);
17072
17073 ForceV1Zero = ZeroLane[0];
17074 ForceV2Zero = ZeroLane[1];
17075 return true;
17076}
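Aside (not part of this file): the sketch below re-derives the per-element legality bounds and the SHUFPD immediate from an assumed v4f64 mask with no zeroable lanes, matching the 0/1, 4/5, 2/3, 6/7 pattern noted above.

#include <cstdio>

int main() {
  const int NumElts = 4;
  int Mask[NumElts] = {1, 5, 2, 7}; // assumed example mask
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1); // allowed base index for element i
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;
  }
  std::printf("matches SHUFPD: %d, immediate: 0x%x\n", (int)ShufpdMask,
              ShuffleImm);
  // Prints: matches SHUFPD: 1, immediate: 0xb
  return 0;
}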
17077
17078static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17079 SDValue V2, ArrayRef<int> Mask,
17080 const APInt &Zeroable,
17081 const X86Subtarget &Subtarget,
17082 SelectionDAG &DAG) {
17083 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&(static_cast<void> (0))
17084 "Unexpected data type for VSHUFPD")(static_cast<void> (0));
17085
17086 unsigned Immediate = 0;
17087 bool ForceV1Zero = false, ForceV2Zero = false;
17088 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17089 Mask, Zeroable))
17090 return SDValue();
17091
17092 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17093 if (ForceV1Zero)
17094 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17095 if (ForceV2Zero)
17096 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17097
17098 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17099 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17100}
17101
17102 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17103 // by zeroable elements in the remaining 24 elements. Turn this into two
17104// vmovqb instructions shuffled together.
17105static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17106 SDValue V1, SDValue V2,
17107 ArrayRef<int> Mask,
17108 const APInt &Zeroable,
17109 SelectionDAG &DAG) {
17110 assert(VT == MVT::v32i8 && "Unexpected type!")(static_cast<void> (0));
17111
17112 // The first 8 indices should be every 8th element.
17113 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17114 return SDValue();
17115
17116 // Remaining elements need to be zeroable.
17117 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17118 return SDValue();
17119
17120 V1 = DAG.getBitcast(MVT::v4i64, V1);
17121 V2 = DAG.getBitcast(MVT::v4i64, V2);
17122
17123 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17124 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17125
17126 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17127 // the upper bits of the result using an unpckldq.
17128 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17129 { 0, 1, 2, 3, 16, 17, 18, 19,
17130 4, 5, 6, 7, 20, 21, 22, 23 });
17131 // Insert the unpckldq into a zero vector to widen to v32i8.
17132 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17133 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17134 DAG.getIntPtrConstant(0, DL));
17135}
17136
17137
17138/// Handle lowering of 4-lane 64-bit floating point shuffles.
17139///
17140/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17141/// isn't available.
17142static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17143 const APInt &Zeroable, SDValue V1, SDValue V2,
17144 const X86Subtarget &Subtarget,
17145 SelectionDAG &DAG) {
17146 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")(static_cast<void> (0));
17147 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")(static_cast<void> (0));
17148 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")(static_cast<void> (0));
17149
17150 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17151 Subtarget, DAG))
17152 return V;
17153
17154 if (V2.isUndef()) {
17155 // Check for being able to broadcast a single element.
17156 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17157 Mask, Subtarget, DAG))
17158 return Broadcast;
17159
17160 // Use low duplicate instructions for masks that match their pattern.
17161 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17162 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17163
17164 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17165 // Non-half-crossing single input shuffles can be lowered with an
17166 // interleaved permutation.
17167 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17168 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17169 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17170 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17171 }
17172
17173 // With AVX2 we have direct support for this permutation.
17174 if (Subtarget.hasAVX2())
17175 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17176 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17177
17178 // Try to create an in-lane repeating shuffle mask and then shuffle the
17179 // results into the target lanes.
17180 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17181 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17182 return V;
17183
17184 // Try to permute the lanes and then use a per-lane permute.
17185 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17186 Mask, DAG, Subtarget))
17187 return V;
17188
17189 // Otherwise, fall back.
17190 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17191 DAG, Subtarget);
17192 }
17193
17194 // Use dedicated unpack instructions for masks that match their pattern.
17195 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
17196 return V;
17197
17198 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17199 Zeroable, Subtarget, DAG))
17200 return Blend;
17201
17202 // Check if the blend happens to exactly fit that of SHUFPD.
17203 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17204 Zeroable, Subtarget, DAG))
17205 return Op;
17206
17207 // If we have lane crossing shuffles AND they don't all come from the lower
17208 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17209 // TODO: Handle BUILD_VECTOR sources, which getVectorShuffle currently
17210 // canonicalizes to a blend of splats; that isn't necessary for this combine.
17211 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17212 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17213 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17214 (V2.getOpcode() != ISD::BUILD_VECTOR))
17215 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
17216 Mask, DAG))
17217 return Op;
17218
17219 // If we have one input in place, then we can permute the other input and
17220 // blend the result.
17221 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17222 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17223 Subtarget, DAG);
17224
17225 // Try to create an in-lane repeating shuffle mask and then shuffle the
17226 // results into the target lanes.
17227 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17228 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17229 return V;
17230
17231 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17232 // shuffle. However, if we have AVX2 and either input is already in place,
17233 // we will be able to shuffle the other input even across lanes in a single
17234 // instruction, so skip this pattern.
17235 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
17236 isShuffleMaskInputInPlace(1, Mask))))
17237 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
17238 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17239 return V;
17240
17241 // If we have VLX support, we can use VEXPAND.
17242 if (Subtarget.hasVLX())
17243 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
17244 DAG, Subtarget))
17245 return V;
17246
17247 // If we have AVX2 then we always want to lower with a blend because at v4 we
17248 // can fully permute the elements.
17249 if (Subtarget.hasAVX2())
17250 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17251 Subtarget, DAG);
17252
17253 // Otherwise fall back on generic lowering.
17254 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
17255 Subtarget, DAG);
17256}
17257
17258/// Handle lowering of 4-lane 64-bit integer shuffles.
17259///
17260/// This routine is only called when we have AVX2 and thus a reasonable
17261 /// instruction set for v4i64 shuffling.
17262static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17263 const APInt &Zeroable, SDValue V1, SDValue V2,
17264 const X86Subtarget &Subtarget,
17265 SelectionDAG &DAG) {
17266 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")(static_cast<void> (0));
17267 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")(static_cast<void> (0));
17268 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")(static_cast<void> (0));
17269 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!")(static_cast<void> (0));
17270
17271 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17272 Subtarget, DAG))
17273 return V;
17274
17275 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17276 Zeroable, Subtarget, DAG))
17277 return Blend;
17278
17279 // Check for being able to broadcast a single element.
17280 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17281 Subtarget, DAG))
17282 return Broadcast;
17283
17284 if (V2.isUndef()) {
17285 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17286 // can use lower latency instructions that will operate on both lanes.
17287 SmallVector<int, 2> RepeatedMask;
17288 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17289 SmallVector<int, 4> PSHUFDMask;
17290 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17291 return DAG.getBitcast(
17292 MVT::v4i64,
17293 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17294 DAG.getBitcast(MVT::v8i32, V1),
17295 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17296 }
17297
17298 // AVX2 provides a direct instruction for permuting a single input across
17299 // lanes.
17300 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17301 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17302 }
17303
17304 // Try to use shift instructions.
17305 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17306 Zeroable, Subtarget, DAG))
17307 return Shift;
17308
17309 // If we have VLX support, we can use VALIGN or VEXPAND.
17310 if (Subtarget.hasVLX()) {
17311 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17312 Subtarget, DAG))
17313 return Rotate;
17314
17315 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17316 DAG, Subtarget))
17317 return V;
17318 }
17319
17320 // Try to use PALIGNR.
17321 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17322 Subtarget, DAG))
17323 return Rotate;
17324
17325 // Use dedicated unpack instructions for masks that match their pattern.
17326 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17327 return V;
17328
17329 // If we have one input in place, then we can permute the other input and
17330 // blend the result.
17331 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17332 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17333 Subtarget, DAG);
17334
17335 // Try to create an in-lane repeating shuffle mask and then shuffle the
17336 // results into the target lanes.
17337 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17338 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17339 return V;
17340
17341 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17342 // shuffle. However, if we have AVX2 and either input is already in place,
17343 // we will be able to shuffle the other input even across lanes in a single
17344 // instruction, so skip this pattern.
17345 if (!isShuffleMaskInputInPlace(0, Mask) &&
17346 !isShuffleMaskInputInPlace(1, Mask))
17347 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17348 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17349 return Result;
17350
17351 // Otherwise fall back on generic blend lowering.
17352 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17353 Subtarget, DAG);
17354}
17355
17356/// Handle lowering of 8-lane 32-bit floating point shuffles.
17357///
17358/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17359/// isn't available.
17360static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17361 const APInt &Zeroable, SDValue V1, SDValue V2,
17362 const X86Subtarget &Subtarget,
17363 SelectionDAG &DAG) {
17364 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")(static_cast<void> (0));
17365 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")(static_cast<void> (0));
17366 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast<void> (0));
17367
17368 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
1. Taking false branch
17369 Zeroable, Subtarget, DAG))
17370 return Blend;
17371
17372 // Check for being able to broadcast a single element.
17373 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
2. Taking false branch
17374 Subtarget, DAG))
17375 return Broadcast;
17376
17377 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17378 // options to efficiently lower the shuffle.
17379 SmallVector<int, 4> RepeatedMask;
17380 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
3. Assuming the condition is false
4. Taking false branch
17381 assert(RepeatedMask.size() == 4 &&(static_cast<void> (0))
17382 "Repeated masks must be half the mask width!")(static_cast<void> (0));
17383
17384 // Use even/odd duplicate instructions for masks that match their pattern.
17385 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17386 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17387 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17388 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17389
17390 if (V2.isUndef())
17391 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17392 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17393
17394 // Use dedicated unpack instructions for masks that match their pattern.
17395 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17396 return V;
17397
17398 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17399 // have already handled any direct blends.
17400 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17401 }
17402
17403 // Try to create an in-lane repeating shuffle mask and then shuffle the
17404 // results into the target lanes.
17405 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
5. Taking false branch
17406 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17407 return V;
17408
17409 // If we have a single input shuffle with different shuffle patterns in the
17410 // two 128-bit lanes use the variable mask to VPERMILPS.
17411 if (V2.isUndef()) {
6. Taking false branch
17412 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17413 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17414 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17415 }
17416 if (Subtarget.hasAVX2()) {
17417 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17418 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17419 }
17420 // Otherwise, fall back.
17421 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17422 DAG, Subtarget);
17423 }
17424
17425 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17426 // shuffle.
17427 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
7. Taking false branch
17428 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17429 return Result;
17430
17431 // If we have VLX support, we can use VEXPAND.
17432 if (Subtarget.hasVLX())
8. Assuming the condition is false
17433 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17434 DAG, Subtarget))
17435 return V;
17436
17437 // For non-AVX512 targets, if the Mask is of 16-bit elements within each lane,
17438 // try to split, since after the split we get more efficient code using
17439 // vpunpcklwd and vpunpckhwd instrs than with vblend.
17440 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
9. Assuming the condition is true
10. Taking true branch
17441 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
11. Calling 'lowerShuffleAsSplitOrBlend'
17442 DAG);
17443
17444 // If we have AVX2 then we always want to lower with a blend because at v8 we
17445 // can fully permute the elements.
17446 if (Subtarget.hasAVX2())
17447 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17448 Subtarget, DAG);
17449
17450 // Otherwise fall back on generic lowering.
17451 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17452 Subtarget, DAG);
17453}
17454
17455/// Handle lowering of 8-lane 32-bit integer shuffles.
17456///
17457/// This routine is only called when we have AVX2 and thus a reasonable
17458 /// instruction set for v8i32 shuffling.
17459static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17460 const APInt &Zeroable, SDValue V1, SDValue V2,
17461 const X86Subtarget &Subtarget,
17462 SelectionDAG &DAG) {
17463 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")(static_cast<void> (0));
17464 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")(static_cast<void> (0));
17465 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast<void> (0));
17466 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!")(static_cast<void> (0));
17467
17468 // Whenever we can lower this as a zext, that instruction is strictly faster
17469 // than any alternative. It also allows us to fold memory operands into the
17470 // shuffle in many cases.
17471 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17472 Zeroable, Subtarget, DAG))
17473 return ZExt;
17474
17475 // For non-AVX512 targets, if the Mask is of 16-bit elements within each lane,
17476 // try to split, since after the split we get more efficient code than vblend
17477 // by using vpunpcklwd and vpunpckhwd instrs.
17478 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17479 !Subtarget.hasAVX512())
17480 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17481 DAG);
17482
17483 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17484 Zeroable, Subtarget, DAG))
17485 return Blend;
17486
17487 // Check for being able to broadcast a single element.
17488 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17489 Subtarget, DAG))
17490 return Broadcast;
17491
17492 // If the shuffle mask is repeated in each 128-bit lane we can use more
17493 // efficient instructions that mirror the shuffles across the two 128-bit
17494 // lanes.
17495 SmallVector<int, 4> RepeatedMask;
17496 bool Is128BitLaneRepeatedShuffle =
17497 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17498 if (Is128BitLaneRepeatedShuffle) {
17499 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")(static_cast<void> (0));
17500 if (V2.isUndef())
17501 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17502 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17503
17504 // Use dedicated unpack instructions for masks that match their pattern.
17505 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17506 return V;
17507 }
17508
17509 // Try to use shift instructions.
17510 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17511 Zeroable, Subtarget, DAG))
17512 return Shift;
17513
17514 // If we have VLX support, we can use VALIGN or EXPAND.
17515 if (Subtarget.hasVLX()) {
17516 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17517 Subtarget, DAG))
17518 return Rotate;
17519
17520 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17521 DAG, Subtarget))
17522 return V;
17523 }
17524
17525 // Try to use byte rotation instructions.
17526 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17527 Subtarget, DAG))
17528 return Rotate;
17529
17530 // Try to create an in-lane repeating shuffle mask and then shuffle the
17531 // results into the target lanes.
17532 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17533 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17534 return V;
17535
17536 if (V2.isUndef()) {
17537 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17538 // because that should be faster than the variable permute alternatives.
17539 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17540 return V;
17541
17542 // If the shuffle patterns aren't repeated but it's a single input, directly
17543 // generate a cross-lane VPERMD instruction.
17544 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17545 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17546 }
17547
17548 // Assume that a single SHUFPS is faster than an alternative sequence of
17549 // multiple instructions (even if the CPU has a domain penalty).
17550 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17551 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17552 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17553 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17554 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17555 CastV1, CastV2, DAG);
17556 return DAG.getBitcast(MVT::v8i32, ShufPS);
17557 }
17558
17559 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17560 // shuffle.
17561 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17562 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17563 return Result;
17564
17565 // Otherwise fall back on generic blend lowering.
17566 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17567 Subtarget, DAG);
17568}
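The repeated-mask paths above hand a 4-element mask to getV4X86ShuffleImm8ForMask, which packs it into the classic two-bits-per-element immediate consumed by PSHUFD/SHUFPS/VPERMILPS. The following is a minimal standalone sketch of that packing (plain C++, not part of the LLVM listing; the helper name is illustrative and the real helper also canonicalizes undef entries):

#include <array>
#include <cstdio>

static unsigned buildShuffleImm8(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= (Mask[i] < 0 ? 0u : unsigned(Mask[i] & 3)) << (i * 2); // undef picks lane 0 here
  return Imm;
}

int main() {
  // Reverse the four 32-bit elements within each 128-bit lane: {3, 2, 1, 0}.
  std::printf("imm = 0x%02x\n", buildShuffleImm8({3, 2, 1, 0})); // prints 0x1b
}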
17569
17570/// Handle lowering of 16-lane 16-bit integer shuffles.
17571///
17572/// This routine is only called when we have AVX2 and thus a reasonable
17573/// instruction set for v16i16 shuffling.
17574static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17575 const APInt &Zeroable, SDValue V1, SDValue V2,
17576 const X86Subtarget &Subtarget,
17577 SelectionDAG &DAG) {
17578 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")(static_cast<void> (0));
17579 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")(static_cast<void> (0));
17580 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")(static_cast<void> (0));
17581 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!")(static_cast<void> (0));
17582
17583 // Whenever we can lower this as a zext, that instruction is strictly faster
17584 // than any alternative. It also allows us to fold memory operands into the
17585 // shuffle in many cases.
17586 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17587 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17588 return ZExt;
17589
17590 // Check for being able to broadcast a single element.
17591 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17592 Subtarget, DAG))
17593 return Broadcast;
17594
17595 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17596 Zeroable, Subtarget, DAG))
17597 return Blend;
17598
17599 // Use dedicated unpack instructions for masks that match their pattern.
17600 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17601 return V;
17602
17603 // Use dedicated pack instructions for masks that match their pattern.
17604 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17605 Subtarget))
17606 return V;
17607
17608 // Try to lower using a truncation.
17609 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17610 Subtarget, DAG))
17611 return V;
17612
17613 // Try to use shift instructions.
17614 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17615 Zeroable, Subtarget, DAG))
17616 return Shift;
17617
17618 // Try to use byte rotation instructions.
17619 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17620 Subtarget, DAG))
17621 return Rotate;
17622
17623 // Try to create an in-lane repeating shuffle mask and then shuffle the
17624 // results into the target lanes.
17625 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17626 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17627 return V;
17628
17629 if (V2.isUndef()) {
17630 // Try to use bit rotation instructions.
17631 if (SDValue Rotate =
17632 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17633 return Rotate;
17634
17635 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17636 // because that should be faster than the variable permute alternatives.
17637 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17638 return V;
17639
17640 // There are no generalized cross-lane shuffle operations available on i16
17641 // element types.
17642 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17643 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17644 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17645 return V;
17646
17647 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17648 DAG, Subtarget);
17649 }
17650
17651 SmallVector<int, 8> RepeatedMask;
17652 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17653 // As this is a single-input shuffle, the repeated mask should be
17654 // a strictly valid v8i16 mask that we can pass through to the v8i16
17655 // lowering to handle even the v16 case.
17656 return lowerV8I16GeneralSingleInputShuffle(
17657 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17658 }
17659 }
17660
17661 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17662 Zeroable, Subtarget, DAG))
17663 return PSHUFB;
17664
17665 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17666 if (Subtarget.hasBWI())
17667 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17668
17669 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17670 // shuffle.
17671 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17672 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17673 return Result;
17674
17675 // Try to permute the lanes and then use a per-lane permute.
17676 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17677 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17678 return V;
17679
17680 // Otherwise fall back on generic lowering.
17681 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17682 Subtarget, DAG);
17683}
17684
17685/// Handle lowering of 32-lane 8-bit integer shuffles.
17686///
17687/// This routine is only called when we have AVX2 and thus a reasonable
17688/// instruction set for v32i8 shuffling.
17689static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17690 const APInt &Zeroable, SDValue V1, SDValue V2,
17691 const X86Subtarget &Subtarget,
17692 SelectionDAG &DAG) {
17693 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")(static_cast<void> (0));
17694 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")(static_cast<void> (0));
17695 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")(static_cast<void> (0));
17696 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!")(static_cast<void> (0));
17697
17698 // Whenever we can lower this as a zext, that instruction is strictly faster
17699 // than any alternative. It also allows us to fold memory operands into the
17700 // shuffle in many cases.
17701 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17702 Zeroable, Subtarget, DAG))
17703 return ZExt;
17704
17705 // Check for being able to broadcast a single element.
17706 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17707 Subtarget, DAG))
17708 return Broadcast;
17709
17710 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17711 Zeroable, Subtarget, DAG))
17712 return Blend;
17713
17714 // Use dedicated unpack instructions for masks that match their pattern.
17715 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17716 return V;
17717
17718 // Use dedicated pack instructions for masks that match their pattern.
17719 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17720 Subtarget))
17721 return V;
17722
17723 // Try to lower using a truncation.
17724 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17725 Subtarget, DAG))
17726 return V;
17727
17728 // Try to use shift instructions.
17729 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17730 Zeroable, Subtarget, DAG))
17731 return Shift;
17732
17733 // Try to use byte rotation instructions.
17734 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17735 Subtarget, DAG))
17736 return Rotate;
17737
17738 // Try to use bit rotation instructions.
17739 if (V2.isUndef())
17740 if (SDValue Rotate =
17741 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17742 return Rotate;
17743
17744 // Try to create an in-lane repeating shuffle mask and then shuffle the
17745 // results into the target lanes.
17746 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17747 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17748 return V;
17749
17750 // There are no generalized cross-lane shuffle operations available on i8
17751 // element types.
17752 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17753 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17754 // because that should be faster than the variable permute alternatives.
17755 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17756 return V;
17757
17758 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17759 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17760 return V;
17761
17762 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17763 DAG, Subtarget);
17764 }
17765
17766 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17767 Zeroable, Subtarget, DAG))
17768 return PSHUFB;
17769
17770 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17771 if (Subtarget.hasVBMI())
17772 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17773
17774 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17775 // shuffle.
17776 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17777 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17778 return Result;
17779
17780 // Try to permute the lanes and then use a per-lane permute.
17781 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17782 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17783 return V;
17784
17785 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17786 // by zeroable elements in the remaining 24 elements. Turn this into two
17787 // vmovqb instructions shuffled together.
17788 if (Subtarget.hasVLX())
17789 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17790 Mask, Zeroable, DAG))
17791 return V;
17792
17793 // Otherwise fall back on generic lowering.
17794 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17795 Subtarget, DAG);
17796}
17797
17798/// High-level routine to lower various 256-bit x86 vector shuffles.
17799///
17800/// This routine either breaks down the specific type of a 256-bit x86 vector
17801/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17802/// together based on the available instructions.
17803static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17804 SDValue V1, SDValue V2, const APInt &Zeroable,
17805 const X86Subtarget &Subtarget,
17806 SelectionDAG &DAG) {
17807 // If we have a single input to the zero element, insert that into V1 if we
17808 // can do so cheaply.
17809 int NumElts = VT.getVectorNumElements();
17810 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17811
17812 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17813 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17814 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17815 return Insertion;
17816
17817 // Handle special cases where the lower or upper half is UNDEF.
17818 if (SDValue V =
17819 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17820 return V;
17821
17822 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17823 // can check for those subtargets here and avoid much of the subtarget
17824 // querying in the per-vector-type lowering routines. With AVX1 we have
17825 // essentially *zero* ability to manipulate a 256-bit vector with integer
17826 // types. Since we'll use floating point types there eventually, just
17827 // immediately cast everything to a float and operate entirely in that domain.
17828 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17829 int ElementBits = VT.getScalarSizeInBits();
17830 if (ElementBits < 32) {
17831 // No floating point type available, if we can't use the bit operations
17832 // for masking/blending then decompose into 128-bit vectors.
17833 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17834 Subtarget, DAG))
17835 return V;
17836 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17837 return V;
17838 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17839 }
17840
17841 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17842 VT.getVectorNumElements());
17843 V1 = DAG.getBitcast(FpVT, V1);
17844 V2 = DAG.getBitcast(FpVT, V2);
17845 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17846 }
17847
17848 if (VT == MVT::v16f16) {
17849 V1 = DAG.getBitcast(MVT::v16i16, V1);
17850 V2 = DAG.getBitcast(MVT::v16i16, V2);
17851 return DAG.getBitcast(MVT::v16f16,
17852 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17853 }
17854
17855 switch (VT.SimpleTy) {
17856 case MVT::v4f64:
17857 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17858 case MVT::v4i64:
17859 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17860 case MVT::v8f32:
17861 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17862 case MVT::v8i32:
17863 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17864 case MVT::v16i16:
17865 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17866 case MVT::v32i8:
17867 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17868
17869 default:
17870 llvm_unreachable("Not a valid 256-bit x86 vector type!")__builtin_unreachable();
17871 }
17872}
17873
17874/// Try to lower a vector shuffle as a 128-bit shuffles.
17875static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17876 const APInt &Zeroable, SDValue V1, SDValue V2,
17877 const X86Subtarget &Subtarget,
17878 SelectionDAG &DAG) {
17879 assert(VT.getScalarSizeInBits() == 64 &&(static_cast<void> (0))
17880 "Unexpected element type size for 128bit shuffle.")(static_cast<void> (0));
17881
17882 // Handling a 256-bit vector requires VLX, and most probably the function
17883 // lowerV2X128VectorShuffle() is a better solution for that case.
17884 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.")(static_cast<void> (0));
17885
17886 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17887 SmallVector<int, 4> Widened128Mask;
17888 if (!canWidenShuffleElements(Mask, Widened128Mask))
17889 return SDValue();
17890 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch")(static_cast<void> (0));
17891
17892 // Try to use an insert into a zero vector.
17893 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17894 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17895 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17896 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17897 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17898 DAG.getIntPtrConstant(0, DL));
17899 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17900 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17901 DAG.getIntPtrConstant(0, DL));
17902 }
17903
17904 // Check for patterns which can be matched with a single insert of a 256-bit
17905 // subvector.
17906 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17907 if (OnlyUsesV1 ||
17908 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17909 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17910 SDValue SubVec =
17911 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17912 DAG.getIntPtrConstant(0, DL));
17913 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17914 DAG.getIntPtrConstant(4, DL));
17915 }
17916
17917 // See if this is an insertion of the lower 128-bits of V2 into V1.
17918 bool IsInsert = true;
17919 int V2Index = -1;
17920 for (int i = 0; i < 4; ++i) {
17921 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")(static_cast<void> (0));
17922 if (Widened128Mask[i] < 0)
17923 continue;
17924
17925 // Make sure all V1 subvectors are in place.
17926 if (Widened128Mask[i] < 4) {
17927 if (Widened128Mask[i] != i) {
17928 IsInsert = false;
17929 break;
17930 }
17931 } else {
17932 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17933 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17934 IsInsert = false;
17935 break;
17936 }
17937 V2Index = i;
17938 }
17939 }
17940 if (IsInsert && V2Index >= 0) {
17941 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17942 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17943 DAG.getIntPtrConstant(0, DL));
17944 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17945 }
17946
17947 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
17948 // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17949 // possible we at least ensure the lanes stay sequential to help later
17950 // combines.
17951 SmallVector<int, 2> Widened256Mask;
17952 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17953 Widened128Mask.clear();
17954 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17955 }
17956
17957 // Try to lower to vshuf64x2/vshuf32x4.
17958 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17959 unsigned PermMask = 0;
17960 // Ensure elements came from the same Op.
17961 for (int i = 0; i < 4; ++i) {
17962 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")(static_cast<void> (0));
17963 if (Widened128Mask[i] < 0)
17964 continue;
17965
17966 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17967 unsigned OpIndex = i / 2;
17968 if (Ops[OpIndex].isUndef())
17969 Ops[OpIndex] = Op;
17970 else if (Ops[OpIndex] != Op)
17971 return SDValue();
17972
17973 // Convert the 128-bit shuffle mask selection values into 128-bit selection
17974 // bits defined by a vshuf64x2 instruction's immediate control byte.
17975 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17976 }
17977
17978 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17979 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17980}
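For reference, the loop above assembles the vshuf64x2/vshuf32x4 immediate from the widened 128-bit-lane mask: two bits per destination lane, keeping only the source lane index modulo 4. A minimal standalone sketch of that encoding (plain C++, not the LLVM helper; the function name is illustrative and undef lanes are simply left as zero bits):

#include <array>
#include <cstdint>
#include <cstdio>

static uint8_t buildShuf128Imm(const std::array<int, 4> &Widened128Mask) {
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    if (Widened128Mask[i] < 0)
      continue; // undef lane: leave its two bits as zero
    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
  }
  return uint8_t(PermMask);
}

int main() {
  // Lanes {0, 1} of V1 followed by lanes {0, 1} of V2 (mask indices 4 and 5).
  std::printf("imm = 0x%02x\n", buildShuf128Imm({0, 1, 4, 5})); // prints 0x44
}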
17981
17982/// Handle lowering of 8-lane 64-bit floating point shuffles.
17983static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17984 const APInt &Zeroable, SDValue V1, SDValue V2,
17985 const X86Subtarget &Subtarget,
17986 SelectionDAG &DAG) {
17987 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")(static_cast<void> (0));
17988 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")(static_cast<void> (0));
17989 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast<void> (0));
17990
17991 if (V2.isUndef()) {
17992 // Use low duplicate instructions for masks that match their pattern.
17993 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17994 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17995
17996 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17997 // Non-half-crossing single input shuffles can be lowered with an
17998 // interleaved permutation.
17999 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18000 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18001 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18002 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18003 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18004 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18005 }
18006
18007 SmallVector<int, 4> RepeatedMask;
18008 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18009 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18010 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18011 }
18012
18013 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18014 V2, Subtarget, DAG))
18015 return Shuf128;
18016
18017 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18018 return Unpck;
18019
18020 // Check if the blend happens to exactly fit that of SHUFPD.
18021 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18022 Zeroable, Subtarget, DAG))
18023 return Op;
18024
18025 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18026 DAG, Subtarget))
18027 return V;
18028
18029 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18030 Zeroable, Subtarget, DAG))
18031 return Blend;
18032
18033 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18034}
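The hand-expanded VPERMILPMask computation in the function above sets bit i whenever destination element i takes the odd element of its 128-bit pair. A loop-form sketch of the same encoding (standalone C++, not the LLVM code; the helper name is illustrative):

#include <array>
#include <cstdio>

static unsigned buildVPermilPdImm(const std::array<int, 8> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (Mask[i] == (i | 1)) // i|1 is the odd element of the pair containing i
      Imm |= 1u << i;
  return Imm;
}

int main() {
  // Swap the two doubles in every 128-bit lane: mask {1, 0, 3, 2, 5, 4, 7, 6}.
  std::printf("imm = 0x%02x\n", buildVPermilPdImm({1, 0, 3, 2, 5, 4, 7, 6})); // prints 0x55
}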
18035
18036/// Handle lowering of 16-lane 32-bit floating point shuffles.
18037static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18038 const APInt &Zeroable, SDValue V1, SDValue V2,
18039 const X86Subtarget &Subtarget,
18040 SelectionDAG &DAG) {
18041 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")(static_cast<void> (0));
18042 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")(static_cast<void> (0));
18043 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")(static_cast<void> (0));
18044
18045 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18046 // options to efficiently lower the shuffle.
18047 SmallVector<int, 4> RepeatedMask;
18048 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18049 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")(static_cast<void> (0));
18050
18051 // Use even/odd duplicate instructions for masks that match their pattern.
18052 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18053 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18054 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18055 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18056
18057 if (V2.isUndef())
18058 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18059 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18060
18061 // Use dedicated unpack instructions for masks that match their pattern.
18062 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
18063 return V;
18064
18065 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18066 Zeroable, Subtarget, DAG))
18067 return Blend;
18068
18069 // Otherwise, fall back to a SHUFPS sequence.
18070 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18071 }
18072
18073 // Try to create an in-lane repeating shuffle mask and then shuffle the
18074 // results into the target lanes.
18075 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18076 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18077 return V;
18078
18079 // If we have a single input shuffle with different shuffle patterns in the
18080 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
18081 if (V2.isUndef() &&
18082 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18083 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18084 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18085 }
18086
18087 // If we have AVX512F support, we can use VEXPAND.
18088 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
18089 V1, V2, DAG, Subtarget))
18090 return V;
18091
18092 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18093}
18094
18095/// Handle lowering of 8-lane 64-bit integer shuffles.
18096static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18097 const APInt &Zeroable, SDValue V1, SDValue V2,
18098 const X86Subtarget &Subtarget,
18099 SelectionDAG &DAG) {
18100 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")(static_cast<void> (0));
18101 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")(static_cast<void> (0));
18102 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast<void> (0));
18103
18104 if (V2.isUndef()) {
18105 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18106 // can use lower latency instructions that will operate on all four
18107 // 128-bit lanes.
18108 SmallVector<int, 2> Repeated128Mask;
18109 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18110 SmallVector<int, 4> PSHUFDMask;
18111 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18112 return DAG.getBitcast(
18113 MVT::v8i64,
18114 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18115 DAG.getBitcast(MVT::v16i32, V1),
18116 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18117 }
18118
18119 SmallVector<int, 4> Repeated256Mask;
18120 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18121 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18122 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18123 }
18124
18125 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18126 V2, Subtarget, DAG))
18127 return Shuf128;
18128
18129 // Try to use shift instructions.
18130 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
18131 Zeroable, Subtarget, DAG))
18132 return Shift;
18133
18134 // Try to use VALIGN.
18135 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18136 Subtarget, DAG))
18137 return Rotate;
18138
18139 // Try to use PALIGNR.
18140 if (Subtarget.hasBWI())
18141 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18142 Subtarget, DAG))
18143 return Rotate;
18144
18145 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
18146 return Unpck;
18147
18148 // If we have AVX512F support, we can use VEXPAND.
18149 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
18150 DAG, Subtarget))
18151 return V;
18152
18153 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18154 Zeroable, Subtarget, DAG))
18155 return Blend;
18156
18157 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18158}
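The v8i64 path above relies on narrowShuffleMaskElts to turn a repeated two-element 64-bit lane mask into a four-element 32-bit PSHUFD mask. A standalone sketch of that scaling (plain C++, illustrative only; the real helper also propagates sentinel values such as SM_SentinelZero):

#include <cstdio>
#include <vector>

static std::vector<int> narrowMaskElts(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (int j = 0; j != Scale; ++j)
      Out.push_back(M < 0 ? -1 : M * Scale + j); // each wide index expands to Scale narrow ones
  return Out;
}

int main() {
  // Swapping the two i64 halves of a 128-bit lane becomes a v4i32 mask.
  for (int M : narrowMaskElts(2, {1, 0}))
    std::printf("%d ", M); // prints: 2 3 0 1
  std::printf("\n");
}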
18159
18160/// Handle lowering of 16-lane 32-bit integer shuffles.
18161static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18162 const APInt &Zeroable, SDValue V1, SDValue V2,
18163 const X86Subtarget &Subtarget,
18164 SelectionDAG &DAG) {
18165 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")(static_cast<void> (0));
18166 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")(static_cast<void> (0));
18167 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")(static_cast<void> (0));
18168
18169 // Whenever we can lower this as a zext, that instruction is strictly faster
18170 // than any alternative. It also allows us to fold memory operands into the
18171 // shuffle in many cases.
18172 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18173 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18174 return ZExt;
18175
18176 // If the shuffle mask is repeated in each 128-bit lane we can use more
18177 // efficient instructions that mirror the shuffles across the four 128-bit
18178 // lanes.
18179 SmallVector<int, 4> RepeatedMask;
18180 bool Is128BitLaneRepeatedShuffle =
18181 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18182 if (Is128BitLaneRepeatedShuffle) {
18183 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")(static_cast<void> (0));
18184 if (V2.isUndef())
18185 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18186 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18187
18188 // Use dedicated unpack instructions for masks that match their pattern.
18189 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
18190 return V;
18191 }
18192
18193 // Try to use shift instructions.
18194 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
18195 Zeroable, Subtarget, DAG))
18196 return Shift;
18197
18198 // Try to use VALIGN.
18199 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18200 Subtarget, DAG))
18201 return Rotate;
18202
18203 // Try to use byte rotation instructions.
18204 if (Subtarget.hasBWI())
18205 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18206 Subtarget, DAG))
18207 return Rotate;
18208
18209 // Assume that a single SHUFPS is faster than using a permv shuffle.
18210 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18211 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18212 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18213 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18214 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18215 CastV1, CastV2, DAG);
18216 return DAG.getBitcast(MVT::v16i32, ShufPS);
18217 }
18218
18219 // Try to create an in-lane repeating shuffle mask and then shuffle the
18220 // results into the target lanes.
18221 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18222 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18223 return V;
18224
18225 // If we have AVX512F support, we can use VEXPAND.
18226 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
18227 DAG, Subtarget))
18228 return V;
18229
18230 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18231 Zeroable, Subtarget, DAG))
18232 return Blend;
18233
18234 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18235}
18236
18237/// Handle lowering of 32-lane 16-bit integer shuffles.
18238static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18239 const APInt &Zeroable, SDValue V1, SDValue V2,
18240 const X86Subtarget &Subtarget,
18241 SelectionDAG &DAG) {
18242 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")(static_cast<void> (0));
18243 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")(static_cast<void> (0));
18244 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")(static_cast<void> (0));
18245 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!")(static_cast<void> (0));
18246
18247 // Whenever we can lower this as a zext, that instruction is strictly faster
18248 // than any alternative. It also allows us to fold memory operands into the
18249 // shuffle in many cases.
18250 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18251 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18252 return ZExt;
18253
18254 // Use dedicated unpack instructions for masks that match their pattern.
18255 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
18256 return V;
18257
18258 // Use dedicated pack instructions for masks that match their pattern.
18259 if (SDValue V =
18260 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
18261 return V;
18262
18263 // Try to use shift instructions.
18264 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
18265 Zeroable, Subtarget, DAG))
18266 return Shift;
18267
18268 // Try to use byte rotation instructions.
18269 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18270 Subtarget, DAG))
18271 return Rotate;
18272
18273 if (V2.isUndef()) {
18274 // Try to use bit rotation instructions.
18275 if (SDValue Rotate =
18276 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18277 return Rotate;
18278
18279 SmallVector<int, 8> RepeatedMask;
18280 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18281 // As this is a single-input shuffle, the repeated mask should be
18282 // a strictly valid v8i16 mask that we can pass through to the v8i16
18283 // lowering to handle even the v32 case.
18284 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18285 RepeatedMask, Subtarget, DAG);
18286 }
18287 }
18288
18289 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18290 Zeroable, Subtarget, DAG))
18291 return Blend;
18292
18293 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18294 Zeroable, Subtarget, DAG))
18295 return PSHUFB;
18296
18297 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18298}
18299
18300/// Handle lowering of 64-lane 8-bit integer shuffles.
18301static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18302 const APInt &Zeroable, SDValue V1, SDValue V2,
18303 const X86Subtarget &Subtarget,
18304 SelectionDAG &DAG) {
18305 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")(static_cast<void> (0));
18306 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")(static_cast<void> (0));
18307 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!")(static_cast<void> (0));
18308 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!")(static_cast<void> (0));
18309
18310 // Whenever we can lower this as a zext, that instruction is strictly faster
18311 // than any alternative. It also allows us to fold memory operands into the
18312 // shuffle in many cases.
18313 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18314 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18315 return ZExt;
18316
18317 // Use dedicated unpack instructions for masks that match their pattern.
18318 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18319 return V;
18320
18321 // Use dedicated pack instructions for masks that match their pattern.
18322 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18323 Subtarget))
18324 return V;
18325
18326 // Try to use shift instructions.
18327 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18328 Zeroable, Subtarget, DAG))
18329 return Shift;
18330
18331 // Try to use byte rotation instructions.
18332 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18333 Subtarget, DAG))
18334 return Rotate;
18335
18336 // Try to use bit rotation instructions.
18337 if (V2.isUndef())
18338 if (SDValue Rotate =
18339 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18340 return Rotate;
18341
18342 // Lower as AND if possible.
18343 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18344 Zeroable, Subtarget, DAG))
18345 return Masked;
18346
18347 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18348 Zeroable, Subtarget, DAG))
18349 return PSHUFB;
18350
18351 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18352 if (Subtarget.hasVBMI())
18353 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18354
18355 // Try to create an in-lane repeating shuffle mask and then shuffle the
18356 // results into the target lanes.
18357 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18358 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18359 return V;
18360
18361 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18362 Zeroable, Subtarget, DAG))
18363 return Blend;
18364
18365 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18366 // shuffle.
18367 if (!V2.isUndef())
18368 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18369 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18370 return Result;
18371
18372 // FIXME: Implement direct support for this type!
18373 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18374}
18375
18376/// High-level routine to lower various 512-bit x86 vector shuffles.
18377///
18378/// This routine either breaks down the specific type of a 512-bit x86 vector
18379/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18380/// together based on the available instructions.
18381static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18382 MVT VT, SDValue V1, SDValue V2,
18383 const APInt &Zeroable,
18384 const X86Subtarget &Subtarget,
18385 SelectionDAG &DAG) {
18386 assert(Subtarget.hasAVX512() &&(static_cast<void> (0))
18387 "Cannot lower 512-bit vectors w/ basic ISA!")(static_cast<void> (0));
18388
18389 // If we have a single input to the zero element, insert that into V1 if we
18390 // can do so cheaply.
18391 int NumElts = Mask.size();
18392 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18393
18394 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18395 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18396 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18397 return Insertion;
18398
18399 // Handle special cases where the lower or upper half is UNDEF.
18400 if (SDValue V =
18401 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18402 return V;
18403
18404 // Check for being able to broadcast a single element.
18405 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18406 Subtarget, DAG))
18407 return Broadcast;
18408
18409 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18410 // Try using bit ops for masking and blending before falling back to
18411 // splitting.
18412 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18413 Subtarget, DAG))
18414 return V;
18415 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18416 return V;
18417
18418 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18419 }
18420
18421 if (VT == MVT::v32f16) {
18422 V1 = DAG.getBitcast(MVT::v32i16, V1);
18423 V2 = DAG.getBitcast(MVT::v32i16, V2);
18424 return DAG.getBitcast(MVT::v32f16,
18425 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18426 }
18427
18428 // Dispatch to each element type for lowering. If we don't have support for
18429 // specific element type shuffles at 512 bits, immediately split them and
18430 // lower them. Each lowering routine of a given type is allowed to assume that
18431 // the requisite ISA extensions for that element type are available.
18432 switch (VT.SimpleTy) {
18433 case MVT::v8f64:
18434 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18435 case MVT::v16f32:
18436 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18437 case MVT::v8i64:
18438 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18439 case MVT::v16i32:
18440 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18441 case MVT::v32i16:
18442 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18443 case MVT::v64i8:
18444 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18445
18446 default:
18447 llvm_unreachable("Not a valid 512-bit x86 vector type!")__builtin_unreachable();
18448 }
18449}
18450
18451static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18452 MVT VT, SDValue V1, SDValue V2,
18453 const X86Subtarget &Subtarget,
18454 SelectionDAG &DAG) {
18455 // Shuffle should be unary.
18456 if (!V2.isUndef())
18457 return SDValue();
18458
18459 int ShiftAmt = -1;
18460 int NumElts = Mask.size();
18461 for (int i = 0; i != NumElts; ++i) {
18462 int M = Mask[i];
18463 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&(static_cast<void> (0))
18464 "Unexpected mask index.")(static_cast<void> (0));
18465 if (M < 0)
18466 continue;
18467
18468 // The first non-undef element determines our shift amount.
18469 if (ShiftAmt < 0) {
18470 ShiftAmt = M - i;
18471 // Need to be shifting right.
18472 if (ShiftAmt <= 0)
18473 return SDValue();
18474 }
18475 // All non-undef elements must shift by the same amount.
18476 if (ShiftAmt != M - i)
18477 return SDValue();
18478 }
18479 assert(ShiftAmt >= 0 && "All undef?")(static_cast<void> (0));
18480
18481 // Great we found a shift right.
18482 MVT WideVT = VT;
18483 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18484 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18485 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18486 DAG.getUNDEF(WideVT), V1,
18487 DAG.getIntPtrConstant(0, DL));
18488 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18489 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18490 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18491 DAG.getIntPtrConstant(0, DL));
18492}
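To make the matching rule above concrete: every defined mask element must read from the same positive offset M - i. A standalone sketch of that check on a small example (plain C++, not the LLVM code; the function name is illustrative):

#include <cstdio>
#include <vector>

static int matchRightShift(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef element imposes no constraint
    if (ShiftAmt < 0) {
      ShiftAmt = M - i; // the first defined element fixes the shift amount
      if (ShiftAmt <= 0)
        return -1; // must be shifting right
    }
    if (ShiftAmt != M - i)
      return -1; // all defined elements must agree
  }
  return ShiftAmt;
}

int main() {
  std::printf("%d\n", matchRightShift({2, 3, -1, 5, 6, 7, -1, -1})); // prints 2
}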
18493
18494// Determine if this shuffle can be implemented with a KSHIFT instruction.
18495// Returns the shift amount if possible or -1 if not. This is a simplified
18496// version of matchShuffleAsShift.
18497static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18498 int MaskOffset, const APInt &Zeroable) {
18499 int Size = Mask.size();
18500
18501 auto CheckZeros = [&](int Shift, bool Left) {
18502 for (int j = 0; j < Shift; ++j)
18503 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18504 return false;
18505
18506 return true;
18507 };
18508
18509 auto MatchShift = [&](int Shift, bool Left) {
18510 unsigned Pos = Left ? Shift : 0;
18511 unsigned Low = Left ? 0 : Shift;
18512 unsigned Len = Size - Shift;
18513 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18514 };
18515
18516 for (int Shift = 1; Shift != Size; ++Shift)
18517 for (bool Left : {true, false})
18518 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18519 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18520 return Shift;
18521 }
18522
18523 return -1;
18524}
18525
18526
18527// Lower vXi1 vector shuffles.
18528// There is no dedicated instruction on AVX-512 that shuffles the masks.
18529// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18530// vector, shuffle it, and then truncate it back.
18531static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18532 MVT VT, SDValue V1, SDValue V2,
18533 const APInt &Zeroable,
18534 const X86Subtarget &Subtarget,
18535 SelectionDAG &DAG) {
18536 assert(Subtarget.hasAVX512() &&(static_cast<void> (0))
18537 "Cannot lower 512-bit vectors w/o basic ISA!")(static_cast<void> (0));
18538
18539 int NumElts = Mask.size();
18540
18541 // Try to recognize shuffles that are just padding a subvector with zeros.
18542 int SubvecElts = 0;
18543 int Src = -1;
18544 for (int i = 0; i != NumElts; ++i) {
18545 if (Mask[i] >= 0) {
18546 // Grab the source from the first valid mask. All subsequent elements need
18547 // to use this same source.
18548 if (Src < 0)
18549 Src = Mask[i] / NumElts;
18550 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18551 break;
18552 }
18553
18554 ++SubvecElts;
18555 }
18556 assert(SubvecElts != NumElts && "Identity shuffle?")(static_cast<void> (0));
18557
18558 // Clip to a power of 2.
18559 SubvecElts = PowerOf2Floor(SubvecElts);
18560
18561 // Make sure the number of zeroable bits in the top at least covers the bits
18562 // not covered by the subvector.
18563 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18564 assert(Src >= 0 && "Expected a source!")(static_cast<void> (0));
18565 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18566 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18567 Src == 0 ? V1 : V2,
18568 DAG.getIntPtrConstant(0, DL));
18569 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18570 DAG.getConstant(0, DL, VT),
18571 Extract, DAG.getIntPtrConstant(0, DL));
18572 }
18573
18574 // Try a simple shift right with undef elements. Later we'll try with zeros.
18575 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18576 DAG))
18577 return Shift;
18578
18579 // Try to match KSHIFTs.
18580 unsigned Offset = 0;
18581 for (SDValue V : { V1, V2 }) {
18582 unsigned Opcode;
18583 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18584 if (ShiftAmt >= 0) {
18585 MVT WideVT = VT;
18586 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18587 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18588 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18589 DAG.getUNDEF(WideVT), V,
18590 DAG.getIntPtrConstant(0, DL));
18591 // Widened right shifts need two shifts to ensure we shift in zeroes.
18592 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18593 int WideElts = WideVT.getVectorNumElements();
18594 // Shift left to put the original vector in the MSBs of the new size.
18595 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18596 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18597 // Increase the shift amount to account for the left shift.
18598 ShiftAmt += WideElts - NumElts;
18599 }
18600
18601 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18602 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18603 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18604 DAG.getIntPtrConstant(0, DL));
18605 }
18606 Offset += NumElts; // Increment for next iteration.
18607 }
18608
18609
18610
18611 MVT ExtVT;
18612 switch (VT.SimpleTy) {
18613 default:
18614 llvm_unreachable("Expected a vector of i1 elements")__builtin_unreachable();
18615 case MVT::v2i1:
18616 ExtVT = MVT::v2i64;
18617 break;
18618 case MVT::v4i1:
18619 ExtVT = MVT::v4i32;
18620 break;
18621 case MVT::v8i1:
18622 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18623 // shuffle.
18624 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18625 break;
18626 case MVT::v16i1:
18627 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18628 // 256-bit operation available.
18629 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18630 break;
18631 case MVT::v32i1:
18632 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18633 // 256-bit operation available.
18634 assert(Subtarget.hasBWI() && "Expected AVX512BW support")(static_cast<void> (0));
18635 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18636 break;
18637 case MVT::v64i1:
18638 // Fall back to scalarization. FIXME: We can do better if the shuffle
18639 // can be partitioned cleanly.
18640 if (!Subtarget.useBWIRegs())
18641 return SDValue();
18642 ExtVT = MVT::v64i8;
18643 break;
18644 }
18645
18646 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18647 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18648
18649 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18650 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
18651 int NumElems = VT.getVectorNumElements();
18652 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18653 (Subtarget.hasDQI() && (NumElems < 32)))
18654 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18655 Shuffle, ISD::SETGT);
18656
18657 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18658}
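The widened KSHIFTR path above uses a left shift followed by a larger right shift so that zeros, rather than the undefined widened bits, are shifted in. A standalone arithmetic sketch of that trick with assumed example values NumElts = 8, WideElts = 16, ShiftAmt = 3 (plain C++, not the LLVM code):

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned NumElts = 8, WideElts = 16, ShiftAmt = 3; // assumed example values
  uint16_t Wide = 0x00B5;                   // low 8 bits hold the original v8i1 value
  Wide <<= (WideElts - NumElts);            // KSHIFTL: move the value into the MSBs
  Wide >>= ShiftAmt + (WideElts - NumElts); // KSHIFTR with the adjusted amount
  std::printf("0x%02x\n", Wide);            // prints 0x16, i.e. 0xB5 >> 3 with zeros shifted in
}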
18659
18660/// Helper function that returns true if the shuffle mask should be
18661/// commuted to improve canonicalization.
18662static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18663 int NumElements = Mask.size();
18664
18665 int NumV1Elements = 0, NumV2Elements = 0;
18666 for (int M : Mask)
18667 if (M < 0)
18668 continue;
18669 else if (M < NumElements)
18670 ++NumV1Elements;
18671 else
18672 ++NumV2Elements;
18673
18674 // Commute the shuffle as needed such that more elements come from V1 than
18675 // V2. This allows us to match the shuffle pattern strictly on how many
18676 // elements come from V1 without handling the symmetric cases.
18677 if (NumV2Elements > NumV1Elements)
18678 return true;
18679
18680 assert(NumV1Elements > 0 && "No V1 indices")(static_cast<void> (0));
18681
18682 if (NumV2Elements == 0)
18683 return false;
18684
18685 // When the number of V1 and V2 elements is the same, try to minimize the
18686 // number of uses of V2 in the low half of the vector. When that is tied,
18687 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18688 // indices for V2. When those are equal, try to ensure that the number of odd
18689 // indices for V1 is lower than the number of odd indices for V2.
18690 if (NumV1Elements == NumV2Elements) {
18691 int LowV1Elements = 0, LowV2Elements = 0;
18692 for (int M : Mask.slice(0, NumElements / 2))
18693 if (M >= NumElements)
18694 ++LowV2Elements;
18695 else if (M >= 0)
18696 ++LowV1Elements;
18697 if (LowV2Elements > LowV1Elements)
18698 return true;
18699 if (LowV2Elements == LowV1Elements) {
18700 int SumV1Indices = 0, SumV2Indices = 0;
18701 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18702 if (Mask[i] >= NumElements)
18703 SumV2Indices += i;
18704 else if (Mask[i] >= 0)
18705 SumV1Indices += i;
18706 if (SumV2Indices < SumV1Indices)
18707 return true;
18708 if (SumV2Indices == SumV1Indices) {
18709 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18710 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18711 if (Mask[i] >= NumElements)
18712 NumV2OddIndices += i % 2;
18713 else if (Mask[i] >= 0)
18714 NumV1OddIndices += i % 2;
18715 if (NumV2OddIndices < NumV1OddIndices)
18716 return true;
18717 }
18718 }
18719 }
18720
18721 return false;
18722}
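A standalone sketch of the primary commute criterion described above (plain C++, not the LLVM code; the helper name is illustrative and the tie-breaking rules on low-half uses, index sums, and odd indices are omitted):

#include <cstdio>
#include <vector>

static bool shouldCommute(const std::vector<int> &Mask) {
  int NumElements = (int)Mask.size();
  int NumV1 = 0, NumV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue; // undef elements count toward neither side
    if (M < NumElements)
      ++NumV1;
    else
      ++NumV2;
  }
  return NumV2 > NumV1; // commute so the lowerings only see the V1-heavy case
}

int main() {
  // Three of the four elements come from V2 (indices >= 4), so commute.
  std::printf("%s\n", shouldCommute({4, 5, 6, 3}) ? "commute" : "keep"); // prints commute
}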
18723
18724// Forward declaration.
18725static SDValue canonicalizeShuffleMaskWithHorizOp(
18726 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18727 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18728 const X86Subtarget &Subtarget);
18729
18730/// Top-level lowering for x86 vector shuffles.
18731///
18732/// This handles decomposition, canonicalization, and lowering of all x86
18733/// vector shuffles. Most of the specific lowering strategies are encapsulated
18734/// above in helper routines. The canonicalization attempts to widen shuffles
18735/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18736/// s.t. only one of the two inputs needs to be tested, etc.
18737static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18738 SelectionDAG &DAG) {
18739 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18740 ArrayRef<int> OrigMask = SVOp->getMask();
18741 SDValue V1 = Op.getOperand(0);
18742 SDValue V2 = Op.getOperand(1);
18743 MVT VT = Op.getSimpleValueType();
18744 int NumElements = VT.getVectorNumElements();
18745 SDLoc DL(Op);
18746 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18747
18748 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&(static_cast<void> (0))
18749 "Can't lower MMX shuffles")(static_cast<void> (0));
18750
18751 bool V1IsUndef = V1.isUndef();
18752 bool V2IsUndef = V2.isUndef();
18753 if (V1IsUndef && V2IsUndef)
18754 return DAG.getUNDEF(VT);
18755
18756 // When we create a shuffle node we put the UNDEF node in the second operand,
18757 // but in some cases the first operand may be transformed to UNDEF.
18758 // In this case we should just commute the node.
18759 if (V1IsUndef)
18760 return DAG.getCommutedVectorShuffle(*SVOp);
18761
18762 // Check for non-undef masks pointing at an undef vector and make the masks
18763 // undef as well. This makes it easier to match the shuffle based solely on
18764 // the mask.
18765 if (V2IsUndef &&
18766 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18767 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18768 for (int &M : NewMask)
18769 if (M >= NumElements)
18770 M = -1;
18771 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18772 }
18773
18774 // Check for illegal shuffle mask element index values.
18775 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18776 (void)MaskUpperLimit;
18777 assert(llvm::all_of(OrigMask,(static_cast<void> (0))
18778 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&(static_cast<void> (0))
18779 "Out of bounds shuffle index")(static_cast<void> (0));
18780
18781 // We actually see shuffles that are entirely re-arrangements of a set of
18782 // zero inputs. This mostly happens while decomposing complex shuffles into
18783 // simple ones. Directly lower these as a buildvector of zeros.
18784 APInt KnownUndef, KnownZero;
18785 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18786
18787 APInt Zeroable = KnownUndef | KnownZero;
18788 if (Zeroable.isAllOnesValue())
18789 return getZeroVector(VT, Subtarget, DAG, DL);
18790
18791 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18792
18793 // Try to collapse shuffles into using a vector type with fewer elements but
18794 // wider element types. We cap this to not form integers or floating point
18795 // elements wider than 64 bits. It does not seem beneficial to form i128
18796 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18797 SmallVector<int, 16> WidenedMask;
18798 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18799 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18800 // Shuffle mask widening should not interfere with a broadcast opportunity
18801 // by obfuscating the operands with bitcasts.
18802 // TODO: Avoid lowering directly from this top-level function: make this
18803 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18804 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18805 Subtarget, DAG))
18806 return Broadcast;
18807
18808 MVT NewEltVT = VT.isFloatingPoint()
18809 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18810 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18811 int NewNumElts = NumElements / 2;
18812 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18813 // Make sure that the new vector type is legal. For example, v2f64 isn't
18814 // legal on SSE1.
18815 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18816 if (V2IsZero) {
18817 // Modify the new Mask to take all zeros from the all-zero vector.
18818 // Choose indices that are blend-friendly.
18819 bool UsedZeroVector = false;
18820 assert(is_contained(WidenedMask, SM_SentinelZero) &&(static_cast<void> (0))
18821 "V2's non-undef elements are used?!")(static_cast<void> (0));
18822 for (int i = 0; i != NewNumElts; ++i)
18823 if (WidenedMask[i] == SM_SentinelZero) {
18824 WidenedMask[i] = i + NewNumElts;
18825 UsedZeroVector = true;
18826 }
18827 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18828 // some elements to be undef.
18829 if (UsedZeroVector)
18830 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18831 }
18832 V1 = DAG.getBitcast(NewVT, V1);
18833 V2 = DAG.getBitcast(NewVT, V2);
18834 return DAG.getBitcast(
18835 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18836 }
18837 }
18838
18839 SmallVector<SDValue> Ops = {V1, V2};
18840 SmallVector<int> Mask(OrigMask.begin(), OrigMask.end());
18841
18842 // Canonicalize the shuffle with any horizontal ops inputs.
18843 // NOTE: This may update Ops and Mask.
18844 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18845 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18846 return DAG.getBitcast(VT, HOp);
18847
18848 V1 = DAG.getBitcast(VT, Ops[0]);
18849 V2 = DAG.getBitcast(VT, Ops[1]);
18850 assert(NumElements == (int)Mask.size() &&(static_cast<void> (0))
18851 "canonicalizeShuffleMaskWithHorizOp "(static_cast<void> (0))
18852 "shouldn't alter the shuffle mask size")(static_cast<void> (0));
18853
18854 // Commute the shuffle if it will improve canonicalization.
18855 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18856 ShuffleVectorSDNode::commuteMask(Mask);
18857 std::swap(V1, V2);
18858 }
18859
18860 // For each vector width, delegate to a specialized lowering routine.
18861 if (VT.is128BitVector())
18862 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18863
18864 if (VT.is256BitVector())
18865 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18866
18867 if (VT.is512BitVector())
18868 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18869
18870 if (Is1BitVector)
18871 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18872
18873   llvm_unreachable("Unimplemented!");
18874}
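
// Illustrative sketch of the mask-widening idea used above. This is a much
// simplified, standalone version of what canWidenShuffleElements checks; it
// ignores zeroable elements, single-lane undefs and cross-operand indices,
// and the helper name is hypothetical.
#include <cstddef>
#include <vector>

// Try to turn a mask over N elements into a mask over N/2 elements of twice
// the width. A pair (2*i, 2*i+1) widens when it is entirely undef (-1) or
// addresses an aligned, consecutive pair of source elements.
static bool tryWidenMaskSimple(const std::vector<int> &Mask,
                               std::vector<int> &Widened) {
  Widened.clear();
  for (std::size_t I = 0; I + 1 < Mask.size(); I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 == -1 && M1 == -1)
      Widened.push_back(-1);            // whole wide element stays undef
    else if (M0 >= 0 && M0 % 2 == 0 && M1 == M0 + 1)
      Widened.push_back(M0 / 2);        // aligned pair becomes one wide element
    else
      return false;                     // this mask cannot be widened
  }
  return true;
}
// Example: the v4i32 mask {2, 3, 0, 1} widens to the v2i64 mask {1, 0}.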
18875
18876/// Try to lower a VSELECT instruction to a vector shuffle.
18877static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18878 const X86Subtarget &Subtarget,
18879 SelectionDAG &DAG) {
18880 SDValue Cond = Op.getOperand(0);
18881 SDValue LHS = Op.getOperand(1);
18882 SDValue RHS = Op.getOperand(2);
18883 MVT VT = Op.getSimpleValueType();
18884
18885   // Only non-legal VSELECTs reach this lowering; convert those into generic
18886 // shuffles and re-use the shuffle lowering path for blends.
18887 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18888 SmallVector<int, 32> Mask;
18889 if (createShuffleMaskFromVSELECT(Mask, Cond))
18890 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18891 }
18892
18893 return SDValue();
18894}
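
// Minimal sketch of the mask that createShuffleMaskFromVSELECT builds for a
// constant condition: lane i selects LHS[i] (shuffle index i) when the
// condition bit is set and RHS[i] (index i + N) otherwise, because a shuffle
// sees its two operands as one concatenated LHS|RHS vector. The standalone
// helper below is hypothetical.
#include <vector>

static std::vector<int> vselectToShuffleMask(const std::vector<bool> &Cond) {
  const int N = static_cast<int>(Cond.size());
  std::vector<int> Mask(N);
  for (int I = 0; I != N; ++I)
    Mask[I] = Cond[I] ? I : I + N;
  return Mask;
}
// Example: Cond = {1, 0, 0, 1} over four lanes gives Mask = {0, 5, 6, 3}.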
18895
18896SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18897 SDValue Cond = Op.getOperand(0);
18898 SDValue LHS = Op.getOperand(1);
18899 SDValue RHS = Op.getOperand(2);
18900
18901 // A vselect where all conditions and data are constants can be optimized into
18902 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18903 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18904 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18905 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18906 return SDValue();
18907
18908 // Try to lower this to a blend-style vector shuffle. This can handle all
18909 // constant condition cases.
18910 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18911 return BlendOp;
18912
18913   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18914 // with patterns on the mask registers on AVX-512.
18915 MVT CondVT = Cond.getSimpleValueType();
18916 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18917 if (CondEltSize == 1)
18918 return Op;
18919
18920 // Variable blends are only legal from SSE4.1 onward.
18921 if (!Subtarget.hasSSE41())
18922 return SDValue();
18923
18924 SDLoc dl(Op);
18925 MVT VT = Op.getSimpleValueType();
18926 unsigned EltSize = VT.getScalarSizeInBits();
18927 unsigned NumElts = VT.getVectorNumElements();
18928
18929 // Expand v32i16/v64i8 without BWI.
18930 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18931 return SDValue();
18932
18933 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18934 // into an i1 condition so that we can use the mask-based 512-bit blend
18935 // instructions.
18936 if (VT.getSizeInBits() == 512) {
18937 // Build a mask by testing the condition against zero.
18938 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18939 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18940 DAG.getConstant(0, dl, CondVT),
18941 ISD::SETNE);
18942 // Now return a new VSELECT using the mask.
18943 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18944 }
18945
18946 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18947 if (CondEltSize != EltSize) {
18948 // If we don't have a sign splat, rely on the expansion.
18949 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18950 return SDValue();
18951
18952 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18953 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18954 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18955 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18956 }
18957
18958 // Only some types will be legal on some subtargets. If we can emit a legal
18959   // VSELECT-matching blend, return Op, but if we need to expand, return
18960 // a null value.
18961 switch (VT.SimpleTy) {
18962 default:
18963 // Most of the vector types have blends past SSE4.1.
18964 return Op;
18965
18966 case MVT::v32i8:
18967 // The byte blends for AVX vectors were introduced only in AVX2.
18968 if (Subtarget.hasAVX2())
18969 return Op;
18970
18971 return SDValue();
18972
18973 case MVT::v8i16:
18974 case MVT::v16i16: {
18975 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18976 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18977 Cond = DAG.getBitcast(CastVT, Cond);
18978 LHS = DAG.getBitcast(CastVT, LHS);
18979 RHS = DAG.getBitcast(CastVT, RHS);
18980 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18981 return DAG.getBitcast(VT, Select);
18982 }
18983 }
18984}
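
// Sketch of why the v8i16/v16i16 case above can fall back to a byte blend
// after bitcasting to vXi8: when an i16 condition lane is 0x0000 or 0xFFFF,
// selecting per byte produces the same result as selecting per i16 lane.
// Standalone illustration only, not the lowering itself.
#include <cstdint>

static uint16_t blendPerLane(uint16_t C, uint16_t A, uint16_t B) {
  return (A & C) | (B & ~C);
}

static uint16_t blendPerByte(uint16_t C, uint16_t A, uint16_t B) {
  uint16_t Lo = (C & 0x00FF) ? (A & 0x00FF) : (B & 0x00FF);
  uint16_t Hi = (C & 0xFF00) ? (A & 0xFF00) : (B & 0xFF00);
  return Hi | Lo;
}
// For C == 0x0000 or C == 0xFFFF, blendPerLane and blendPerByte agree for all
// A and B, which is exactly the situation the bitcast relies on.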
18985
18986static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18987 MVT VT = Op.getSimpleValueType();
18988 SDValue Vec = Op.getOperand(0);
18989 SDValue Idx = Op.getOperand(1);
18990   assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18991 SDLoc dl(Op);
18992
18993 if (!Vec.getSimpleValueType().is128BitVector())
18994 return SDValue();
18995
18996 if (VT.getSizeInBits() == 8) {
18997 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18998 // we're going to zero extend the register or fold the store.
18999 if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
19000 !MayFoldIntoStore(Op))
19001 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19002 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19003 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19004
19005 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
19006 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19007 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19008 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19009 }
19010
19011 if (VT == MVT::f32) {
19012 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19013 // the result back to FR32 register. It's only worth matching if the
19014 // result has a single use which is a store or a bitcast to i32. And in
19015 // the case of a store, it's not worth it if the index is a constant 0,
19016 // because a MOVSSmr can be used instead, which is smaller and faster.
19017 if (!Op.hasOneUse())
19018 return SDValue();
19019 SDNode *User = *Op.getNode()->use_begin();
19020 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19021 (User->getOpcode() != ISD::BITCAST ||
19022 User->getValueType(0) != MVT::i32))
19023 return SDValue();
19024 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19025 DAG.getBitcast(MVT::v4i32, Vec), Idx);
19026 return DAG.getBitcast(MVT::f32, Extract);
19027 }
19028
19029 if (VT == MVT::i32 || VT == MVT::i64)
19030 return Op;
19031
19032 return SDValue();
19033}
19034
19035/// Extract one bit from mask vector, like v16i1 or v8i1.
19036/// AVX-512 feature.
19037static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
19038 const X86Subtarget &Subtarget) {
19039 SDValue Vec = Op.getOperand(0);
19040 SDLoc dl(Vec);
19041 MVT VecVT = Vec.getSimpleValueType();
19042 SDValue Idx = Op.getOperand(1);
19043 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19044 MVT EltVT = Op.getSimpleValueType();
19045
19046   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19047          "Unexpected vector type in ExtractBitFromMaskVector");
19048
19049   // A variable index can't be handled in mask registers,
19050   // so extend the vector to VR512/128.
19051 if (!IdxC) {
19052 unsigned NumElts = VecVT.getVectorNumElements();
19053     // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
19054     // than extending to 128/256-bit.
19055 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19056 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19057 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19058 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19059 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19060 }
19061
19062 unsigned IdxVal = IdxC->getZExtValue();
19063 if (IdxVal == 0) // the operation is legal
19064 return Op;
19065
19066 // Extend to natively supported kshift.
19067 unsigned NumElems = VecVT.getVectorNumElements();
19068 MVT WideVecVT = VecVT;
19069 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19070 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19071 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19072 DAG.getUNDEF(WideVecVT), Vec,
19073 DAG.getIntPtrConstant(0, dl));
19074 }
19075
19076 // Use kshiftr instruction to move to the lower element.
19077 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19078 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19079
19080 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19081 DAG.getIntPtrConstant(0, dl));
19082}
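
// Scalar analogy for the KSHIFTR-based lowering above: reading bit IdxVal of
// a k-mask amounts to a right shift followed by looking at bit 0, which is
// the one position that is legal to extract directly. Hypothetical standalone
// model over a 16-bit mask:
#include <cstdint>

static bool extractMaskBit(uint16_t KMask, unsigned IdxVal) {
  // KSHIFTR moves lane IdxVal down to lane 0; extracting lane 0 is then legal.
  return ((KMask >> IdxVal) & 1u) != 0;
}
// Example: extractMaskBit(0x0020, 5) == true.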
19083
19084SDValue
19085X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19086 SelectionDAG &DAG) const {
19087 SDLoc dl(Op);
19088 SDValue Vec = Op.getOperand(0);
19089 MVT VecVT = Vec.getSimpleValueType();
19090 SDValue Idx = Op.getOperand(1);
19091 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19092
19093 if (VecVT.getVectorElementType() == MVT::i1)
19094 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19095
19096 if (!IdxC) {
19097     // It's more profitable to go through memory (1 cycle throughput)
19098     // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
19099     // The IACA tool was used to get the performance estimate
19100 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19101 //
19102 // example : extractelement <16 x i8> %a, i32 %i
19103 //
19104 // Block Throughput: 3.00 Cycles
19105 // Throughput Bottleneck: Port5
19106 //
19107 // | Num Of | Ports pressure in cycles | |
19108 // | Uops | 0 - DV | 5 | 6 | 7 | |
19109 // ---------------------------------------------
19110 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19111 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19112 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19113 // Total Num Of Uops: 4
19114 //
19115 //
19116 // Block Throughput: 1.00 Cycles
19117 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19118 //
19119 // | | Ports pressure in cycles | |
19120 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19121 // ---------------------------------------------------------
19122 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19123 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19124 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19125 // Total Num Of Uops: 4
19126
19127 return SDValue();
19128 }
19129
19130 unsigned IdxVal = IdxC->getZExtValue();
19131
19132   // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
19133   // vector and then extract the element from the 128-bit vector.
19134 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19135 // Get the 128-bit vector.
19136 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19137 MVT EltVT = VecVT.getVectorElementType();
19138
19139 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19140     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19141
19142 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19143 // this can be done with a mask.
19144 IdxVal &= ElemsPerChunk - 1;
19145 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19146 DAG.getIntPtrConstant(IdxVal, dl));
19147 }
19148
19149   assert(VecVT.is128BitVector() && "Unexpected vector length");
19150
19151 MVT VT = Op.getSimpleValueType();
19152
19153 if (VT == MVT::i16) {
19154 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19155 // we're going to zero extend the register or fold the store (SSE41 only).
19156 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
19157 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) {
19158 if (Subtarget.hasFP16())
19159 return Op;
19160
19161 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19162 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19163 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19164 }
19165
19166 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19167 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19168 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19169 }
19170
19171 if (Subtarget.hasSSE41())
19172 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19173 return Res;
19174
19175 // TODO: We only extract a single element from v16i8, we can probably afford
19176 // to be more aggressive here before using the default approach of spilling to
19177 // stack.
19178 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
19179 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19180 int DWordIdx = IdxVal / 4;
19181 if (DWordIdx == 0) {
19182 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19183 DAG.getBitcast(MVT::v4i32, Vec),
19184 DAG.getIntPtrConstant(DWordIdx, dl));
19185 int ShiftVal = (IdxVal % 4) * 8;
19186 if (ShiftVal != 0)
19187 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19188 DAG.getConstant(ShiftVal, dl, MVT::i8));
19189 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19190 }
19191
19192 int WordIdx = IdxVal / 2;
19193 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19194 DAG.getBitcast(MVT::v8i16, Vec),
19195 DAG.getIntPtrConstant(WordIdx, dl));
19196 int ShiftVal = (IdxVal % 2) * 8;
19197 if (ShiftVal != 0)
19198 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19199 DAG.getConstant(ShiftVal, dl, MVT::i8));
19200 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19201 }
19202
19203 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19204 if (IdxVal == 0)
19205 return Op;
19206
19207 // Shuffle the element to the lowest element, then movss or movsh.
19208 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19209 Mask[0] = static_cast<int>(IdxVal);
19210 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19211 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19212 DAG.getIntPtrConstant(0, dl));
19213 }
19214
19215 if (VT.getSizeInBits() == 64) {
19216 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19217 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19218 // to match extract_elt for f64.
19219 if (IdxVal == 0)
19220 return Op;
19221
19222 // UNPCKHPD the element to the lowest double word, then movsd.
19223 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19224 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19225 int Mask[2] = { 1, -1 };
19226 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19227 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19228 DAG.getIntPtrConstant(0, dl));
19229 }
19230
19231 return SDValue();
19232}
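
// Standalone model of the v16i8 path above: a single byte is extracted by
// pulling out the dword that contains it and shifting the wanted byte down,
// relying on x86 being little-endian. Illustrative only; the array parameter
// is hypothetical.
#include <cstdint>
#include <cstring>

static uint8_t extractByteViaDword(const uint8_t (&Bytes)[16], unsigned IdxVal) {
  uint32_t Dword;
  std::memcpy(&Dword, Bytes + (IdxVal / 4) * 4, sizeof(Dword)); // containing i32
  unsigned ShiftVal = (IdxVal % 4) * 8;                         // byte offset in bits
  return static_cast<uint8_t>(Dword >> ShiftVal);               // SRL + truncate
}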
19233
19234/// Insert one bit to mask vector, like v16i1 or v8i1.
19235/// AVX-512 feature.
19236static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
19237 const X86Subtarget &Subtarget) {
19238 SDLoc dl(Op);
19239 SDValue Vec = Op.getOperand(0);
19240 SDValue Elt = Op.getOperand(1);
19241 SDValue Idx = Op.getOperand(2);
19242 MVT VecVT = Vec.getSimpleValueType();
19243
19244 if (!isa<ConstantSDNode>(Idx)) {
19245     // Non-constant index. Extend source and destination,
19246 // insert element and then truncate the result.
19247 unsigned NumElts = VecVT.getVectorNumElements();
19248 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19249 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19250 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19251 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19252 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19253 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19254 }
19255
19256 // Copy into a k-register, extract to v1i1 and insert_subvector.
19257 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19258 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19259}
19260
19261SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19262 SelectionDAG &DAG) const {
19263 MVT VT = Op.getSimpleValueType();
19264 MVT EltVT = VT.getVectorElementType();
19265 unsigned NumElts = VT.getVectorNumElements();
19266 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19267
19268 if (EltVT == MVT::i1)
19269 return InsertBitToMaskVector(Op, DAG, Subtarget);
19270
19271 SDLoc dl(Op);
19272 SDValue N0 = Op.getOperand(0);
19273 SDValue N1 = Op.getOperand(1);
19274 SDValue N2 = Op.getOperand(2);
19275 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19276
19277 if (!N2C) {
19278     // Variable insertion indices: usually we're better off spilling to stack,
19279 // but AVX512 can use a variable compare+select by comparing against all
19280 // possible vector indices, and FP insertion has less gpr->simd traffic.
19281 if (!(Subtarget.hasBWI() ||
19282 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19283 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
19284 return SDValue();
19285
19286 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19287 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19288 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19289 return SDValue();
19290
19291 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19292 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19293 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19294
19295 SmallVector<SDValue, 16> RawIndices;
19296 for (unsigned I = 0; I != NumElts; ++I)
19297 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19298 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19299
19300 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19301 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19302 ISD::CondCode::SETEQ);
19303 }
19304
19305 if (N2C->getAPIntValue().uge(NumElts))
19306 return SDValue();
19307 uint64_t IdxVal = N2C->getZExtValue();
19308
19309 bool IsZeroElt = X86::isZeroNode(N1);
19310 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19311
19312   // If we are inserting an element, see if we can do this more efficiently with
19313   // a blend shuffle against a rematerializable vector than with a costly integer
19314   // insertion.
19315 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
19316 (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
19317 SmallVector<int, 8> BlendMask;
19318 for (unsigned i = 0; i != NumElts; ++i)
19319 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19320 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19321 : getOnesVector(VT, DAG, dl);
19322 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19323 }
19324
19325 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19326 // into that, and then insert the subvector back into the result.
19327 if (VT.is256BitVector() || VT.is512BitVector()) {
19328 // With a 256-bit vector, we can insert into the zero element efficiently
19329 // using a blend if we have AVX or AVX2 and the right data type.
19330 if (VT.is256BitVector() && IdxVal == 0) {
19331 // TODO: It is worthwhile to cast integer to floating point and back
19332 // and incur a domain crossing penalty if that's what we'll end up
19333 // doing anyway after extracting to a 128-bit vector.
19334 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19335 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19336 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19337 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19338 DAG.getTargetConstant(1, dl, MVT::i8));
19339 }
19340 }
19341
19342 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19343     assert(isPowerOf2_32(NumEltsIn128) &&
19344            "Vectors will always have power-of-two number of elements.");
19345
19346 // If we are not inserting into the low 128-bit vector chunk,
19347 // then prefer the broadcast+blend sequence.
19348 // FIXME: relax the profitability check iff all N1 uses are insertions.
19349 if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
19350 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19351 (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
19352 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19353 SmallVector<int, 8> BlendMask;
19354 for (unsigned i = 0; i != NumElts; ++i)
19355 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19356 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19357 }
19358
19359 // Get the desired 128-bit vector chunk.
19360 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19361
19362 // Insert the element into the desired chunk.
19363 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19364 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19365
19366 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19367 DAG.getIntPtrConstant(IdxIn128, dl));
19368
19369 // Insert the changed part back into the bigger vector
19370 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19371 }
19372   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19373
19374 // This will be just movw/movd/movq/movsh/movss/movsd.
19375 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19376 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19377 EltVT == MVT::f16 || EltVT == MVT::i64) {
19378 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19379 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19380 }
19381
19382 // We can't directly insert an i8 or i16 into a vector, so zero extend
19383 // it to i32 first.
19384 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19385 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19386 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19387 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19388 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19389 return DAG.getBitcast(VT, N1);
19390 }
19391 }
19392
19393   // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19394   // argument. SSE41 is required for pinsrb.
19395 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19396 unsigned Opc;
19397 if (VT == MVT::v8i16) {
19398       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19399 Opc = X86ISD::PINSRW;
19400 } else {
19401       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19402       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19403 Opc = X86ISD::PINSRB;
19404 }
19405
19406     assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19407 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19408 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19409 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19410 }
19411
19412 if (Subtarget.hasSSE41()) {
19413 if (EltVT == MVT::f32) {
19414 // Bits [7:6] of the constant are the source select. This will always be
19415 // zero here. The DAG Combiner may combine an extract_elt index into
19416 // these bits. For example (insert (extract, 3), 2) could be matched by
19417 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19418 // Bits [5:4] of the constant are the destination select. This is the
19419 // value of the incoming immediate.
19420 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19421 // combine either bitwise AND or insert of float 0.0 to set these bits.
19422
19423 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19424 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19425 // If this is an insertion of 32-bits into the low 32-bits of
19426 // a vector, we prefer to generate a blend with immediate rather
19427 // than an insertps. Blends are simpler operations in hardware and so
19428 // will always have equal or better performance than insertps.
19429 // But if optimizing for size and there's a load folding opportunity,
19430 // generate insertps because blendps does not have a 32-bit memory
19431 // operand form.
19432 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19433 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19434 DAG.getTargetConstant(1, dl, MVT::i8));
19435 }
19436       // Create this as a scalar-to-vector.
19437 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19438 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19439 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19440 }
19441
19442 // PINSR* works with constant index.
19443 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19444 return Op;
19445 }
19446
19447 return SDValue();
19448}
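
// Helper sketch for the INSERTPS immediate described above: bits [7:6] select
// the source element, bits [5:4] the destination element, and bits [3:0] are
// the zero mask. The lowering passes IdxVal << 4, i.e. source element 0,
// destination element IdxVal, nothing zeroed. The encoder below is a
// hypothetical standalone illustration.
#include <cstdint>

static uint8_t encodeInsertPSImm(unsigned SrcElt, unsigned DstElt,
                                 unsigned ZeroMask) {
  return static_cast<uint8_t>(((SrcElt & 0x3) << 6) |
                              ((DstElt & 0x3) << 4) |
                              (ZeroMask & 0xF));
}
// encodeInsertPSImm(0, Idx, 0) equals Idx << 4 for Idx in [0, 3].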
19449
19450static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19451 SelectionDAG &DAG) {
19452 SDLoc dl(Op);
19453 MVT OpVT = Op.getSimpleValueType();
19454
19455   // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19456   // further combines.
19457 if (X86::isZeroNode(Op.getOperand(0)))
19458 return getZeroVector(OpVT, Subtarget, DAG, dl);
19459
19460   // If this is a wider-than-128-bit vector result, first insert into a 128-bit
19461   // vector and then insert that into the full-width vector.
19462 if (!OpVT.is128BitVector()) {
19463 // Insert into a 128-bit vector.
19464 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19465 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19466 OpVT.getVectorNumElements() / SizeFactor);
19467
19468 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19469
19470 // Insert the 128-bit vector.
19471 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19472 }
19473   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19474          "Expected an SSE type!");
19475
19476   // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19477 // tblgen.
19478 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19479 return Op;
19480
19481 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19482 return DAG.getBitcast(
19483 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19484}
19485
19486// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19487// simple superregister reference or explicit instructions to insert
19488// the upper bits of a vector.
19489static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19490 SelectionDAG &DAG) {
19491   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19492
19493 return insert1BitVector(Op, DAG, Subtarget);
19494}
19495
19496static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19497 SelectionDAG &DAG) {
19498   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19499          "Only vXi1 extract_subvectors need custom lowering");
19500
19501 SDLoc dl(Op);
19502 SDValue Vec = Op.getOperand(0);
19503 uint64_t IdxVal = Op.getConstantOperandVal(1);
19504
19505 if (IdxVal == 0) // the operation is legal
19506 return Op;
19507
19508 MVT VecVT = Vec.getSimpleValueType();
19509 unsigned NumElems = VecVT.getVectorNumElements();
19510
19511 // Extend to natively supported kshift.
19512 MVT WideVecVT = VecVT;
19513 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19514 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19515 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19516 DAG.getUNDEF(WideVecVT), Vec,
19517 DAG.getIntPtrConstant(0, dl));
19518 }
19519
19520 // Shift to the LSB.
19521 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19522 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19523
19524 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19525 DAG.getIntPtrConstant(0, dl));
19526}
19527
19528// Returns the appropriate wrapper opcode for a global reference.
19529unsigned X86TargetLowering::getGlobalWrapperKind(
19530 const GlobalValue *GV, const unsigned char OpFlags) const {
19531 // References to absolute symbols are never PC-relative.
19532 if (GV && GV->isAbsoluteSymbolRef())
19533 return X86ISD::Wrapper;
19534
19535 CodeModel::Model M = getTargetMachine().getCodeModel();
19536 if (Subtarget.isPICStyleRIPRel() &&
19537 (M == CodeModel::Small || M == CodeModel::Kernel))
19538 return X86ISD::WrapperRIP;
19539
19540 // GOTPCREL references must always use RIP.
19541 if (OpFlags == X86II::MO_GOTPCREL)
19542 return X86ISD::WrapperRIP;
19543
19544 return X86ISD::Wrapper;
19545}
19546
19547 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19548 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19549 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19550 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19551 // be used to form an addressing mode. These wrapped nodes will be selected
19552 // into MOV32ri.
19553SDValue
19554X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19555 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19556
19557 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19558 // global base reg.
19559 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19560
19561 auto PtrVT = getPointerTy(DAG.getDataLayout());
19562 SDValue Result = DAG.getTargetConstantPool(
19563 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19564 SDLoc DL(CP);
19565 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19566 // With PIC, the address is actually $g + Offset.
19567 if (OpFlag) {
19568 Result =
19569 DAG.getNode(ISD::ADD, DL, PtrVT,
19570 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19571 }
19572
19573 return Result;
19574}
19575
19576SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19577 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19578
19579 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19580 // global base reg.
19581 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19582
19583 auto PtrVT = getPointerTy(DAG.getDataLayout());
19584 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19585 SDLoc DL(JT);
19586 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19587
19588 // With PIC, the address is actually $g + Offset.
19589 if (OpFlag)
19590 Result =
19591 DAG.getNode(ISD::ADD, DL, PtrVT,
19592 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19593
19594 return Result;
19595}
19596
19597SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19598 SelectionDAG &DAG) const {
19599 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19600}
19601
19602SDValue
19603X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19604 // Create the TargetBlockAddressAddress node.
19605 unsigned char OpFlags =
19606 Subtarget.classifyBlockAddressReference();
19607 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19608 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19609 SDLoc dl(Op);
19610 auto PtrVT = getPointerTy(DAG.getDataLayout());
19611 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19612 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19613
19614 // With PIC, the address is actually $g + Offset.
19615 if (isGlobalRelativeToPICBase(OpFlags)) {
19616 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19617 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19618 }
19619
19620 return Result;
19621}
19622
19623/// Creates target global address or external symbol nodes for calls or
19624/// other uses.
19625SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19626 bool ForCall) const {
19627 // Unpack the global address or external symbol.
19628 const SDLoc &dl = SDLoc(Op);
19629 const GlobalValue *GV = nullptr;
19630 int64_t Offset = 0;
19631 const char *ExternalSym = nullptr;
19632 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19633 GV = G->getGlobal();
19634 Offset = G->getOffset();
19635 } else {
19636 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19637 ExternalSym = ES->getSymbol();
19638 }
19639
19640 // Calculate some flags for address lowering.
19641 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19642 unsigned char OpFlags;
19643 if (ForCall)
19644 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19645 else
19646 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19647 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19648 bool NeedsLoad = isGlobalStubReference(OpFlags);
19649
19650 CodeModel::Model M = DAG.getTarget().getCodeModel();
19651 auto PtrVT = getPointerTy(DAG.getDataLayout());
19652 SDValue Result;
19653
19654 if (GV) {
19655 // Create a target global address if this is a global. If possible, fold the
19656 // offset into the global address reference. Otherwise, ADD it on later.
19657 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19658 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19659 // relocation will compute to a negative value, which is invalid.
19660 int64_t GlobalOffset = 0;
19661 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19662 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19663 std::swap(GlobalOffset, Offset);
19664 }
19665 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19666 } else {
19667 // If this is not a global address, this must be an external symbol.
19668 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19669 }
19670
19671 // If this is a direct call, avoid the wrapper if we don't need to do any
19672 // loads or adds. This allows SDAG ISel to match direct calls.
19673 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19674 return Result;
19675
19676 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19677
19678 // With PIC, the address is actually $g + Offset.
19679 if (HasPICReg) {
19680 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19681 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19682 }
19683
19684 // For globals that require a load from a stub to get the address, emit the
19685 // load.
19686 if (NeedsLoad)
19687 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19688 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19689
19690 // If there was a non-zero offset that we didn't fold, create an explicit
19691 // addition for it.
19692 if (Offset != 0)
19693 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19694 DAG.getConstant(Offset, dl, PtrVT));
19695
19696 return Result;
19697}
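
// Small numeric illustration of the negative-offset rule in the comment
// above: an R_X86_64_32 relocation must yield a value that fits in an
// unsigned 32-bit field, which "symbol address plus negative offset" cannot
// guarantee (a symbol at address 0 with offset -1 wraps around). The check
// below is a hypothetical sketch, not linker code.
#include <cstdint>

static bool fitsUnsigned32(uint64_t SymbolAddr, int64_t Offset) {
  uint64_t Value = SymbolAddr + static_cast<uint64_t>(Offset);
  return Value <= UINT32_MAX;
}
// fitsUnsigned32(0, -1) is false, so the lowering never folds negative offsets.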
19698
19699SDValue
19700X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19701 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19702}
19703
19704static SDValue
19705GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19706 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19707 unsigned char OperandFlags, bool LocalDynamic = false) {
19708 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19709 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19710 SDLoc dl(GA);
19711 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19712 GA->getValueType(0),
19713 GA->getOffset(),
19714 OperandFlags);
19715
19716 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19717 : X86ISD::TLSADDR;
19718
19719 if (InFlag) {
19720 SDValue Ops[] = { Chain, TGA, *InFlag };
19721 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19722 } else {
19723 SDValue Ops[] = { Chain, TGA };
19724 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19725 }
19726
19727 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19728 MFI.setAdjustsStack(true);
19729 MFI.setHasCalls(true);
19730
19731 SDValue Flag = Chain.getValue(1);
19732 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19733}
19734
19735// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19736static SDValue
19737LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19738 const EVT PtrVT) {
19739 SDValue InFlag;
19740 SDLoc dl(GA); // ? function entry point might be better
19741 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19742 DAG.getNode(X86ISD::GlobalBaseReg,
19743 SDLoc(), PtrVT), InFlag);
19744 InFlag = Chain.getValue(1);
19745
19746 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19747}
19748
19749// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19750static SDValue
19751LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19752 const EVT PtrVT) {
19753 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19754 X86::RAX, X86II::MO_TLSGD);
19755}
19756
19757// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19758static SDValue
19759LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19760 const EVT PtrVT) {
19761 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19762 X86::EAX, X86II::MO_TLSGD);
19763}
19764
19765static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19766 SelectionDAG &DAG, const EVT PtrVT,
19767 bool Is64Bit, bool Is64BitLP64) {
19768 SDLoc dl(GA);
19769
19770 // Get the start address of the TLS block for this module.
19771 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19772 .getInfo<X86MachineFunctionInfo>();
19773 MFI->incNumLocalDynamicTLSAccesses();
19774
19775 SDValue Base;
19776 if (Is64Bit) {
19777 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19778 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19779 X86II::MO_TLSLD, /*LocalDynamic=*/true);
19780 } else {
19781 SDValue InFlag;
19782 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19783 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19784 InFlag = Chain.getValue(1);
19785 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19786 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19787 }
19788
19789 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19790 // of Base.
19791
19792 // Build x@dtpoff.
19793 unsigned char OperandFlags = X86II::MO_DTPOFF;
19794 unsigned WrapperKind = X86ISD::Wrapper;
19795 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19796 GA->getValueType(0),
19797 GA->getOffset(), OperandFlags);
19798 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19799
19800 // Add x@dtpoff with the base.
19801 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19802}
19803
19804// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19805static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19806 const EVT PtrVT, TLSModel::Model model,
19807 bool is64Bit, bool isPIC) {
19808 SDLoc dl(GA);
19809
19810 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19811 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19812 is64Bit ? 257 : 256));
19813
19814 SDValue ThreadPointer =
19815 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19816 MachinePointerInfo(Ptr));
19817
19818 unsigned char OperandFlags = 0;
19819 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19820 // initialexec.
19821 unsigned WrapperKind = X86ISD::Wrapper;
19822 if (model == TLSModel::LocalExec) {
19823 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19824 } else if (model == TLSModel::InitialExec) {
19825 if (is64Bit) {
19826 OperandFlags = X86II::MO_GOTTPOFF;
19827 WrapperKind = X86ISD::WrapperRIP;
19828 } else {
19829 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19830 }
19831 } else {
19832     llvm_unreachable("Unexpected model");
19833 }
19834
19835 // emit "addl x@ntpoff,%eax" (local exec)
19836 // or "addl x@indntpoff,%eax" (initial exec)
19837 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19838 SDValue TGA =
19839 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19840 GA->getOffset(), OperandFlags);
19841 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19842
19843 if (model == TLSModel::InitialExec) {
19844 if (isPIC && !is64Bit) {
19845 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19846 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19847 Offset);
19848 }
19849
19850 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19851 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19852 }
19853
19854 // The address of the thread local variable is the add of the thread
19855 // pointer with the offset of the variable.
19856 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19857}
19858
19859SDValue
19860X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19861
19862 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19863
19864 if (DAG.getTarget().useEmulatedTLS())
19865 return LowerToTLSEmulatedModel(GA, DAG);
19866
19867 const GlobalValue *GV = GA->getGlobal();
19868 auto PtrVT = getPointerTy(DAG.getDataLayout());
19869 bool PositionIndependent = isPositionIndependent();
19870
19871 if (Subtarget.isTargetELF()) {
19872 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19873 switch (model) {
19874 case TLSModel::GeneralDynamic:
19875 if (Subtarget.is64Bit()) {
19876 if (Subtarget.isTarget64BitLP64())
19877 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19878 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19879 }
19880 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19881 case TLSModel::LocalDynamic:
19882 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19883 Subtarget.isTarget64BitLP64());
19884 case TLSModel::InitialExec:
19885 case TLSModel::LocalExec:
19886 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19887 PositionIndependent);
19888 }
19889     llvm_unreachable("Unknown TLS model.");
19890 }
19891
19892 if (Subtarget.isTargetDarwin()) {
19893 // Darwin only has one model of TLS. Lower to that.
19894 unsigned char OpFlag = 0;
19895 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19896 X86ISD::WrapperRIP : X86ISD::Wrapper;
19897
19898 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19899 // global base reg.
19900 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19901 if (PIC32)
19902 OpFlag = X86II::MO_TLVP_PIC_BASE;
19903 else
19904 OpFlag = X86II::MO_TLVP;
19905 SDLoc DL(Op);
19906 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19907 GA->getValueType(0),
19908 GA->getOffset(), OpFlag);
19909 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19910
19911 // With PIC32, the address is actually $g + Offset.
19912 if (PIC32)
19913 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19914 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19915 Offset);
19916
19917 // Lowering the machine isd will make sure everything is in the right
19918 // location.
19919 SDValue Chain = DAG.getEntryNode();
19920 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19921 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19922 SDValue Args[] = { Chain, Offset };
19923 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19924 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19925 DAG.getIntPtrConstant(0, DL, true),
19926 Chain.getValue(1), DL);
19927
19928 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19929 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19930 MFI.setAdjustsStack(true);
19931
19932 // And our return value (tls address) is in the standard call return value
19933 // location.
19934 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19935 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19936 }
19937
19938 if (Subtarget.isOSWindows()) {
19939 // Just use the implicit TLS architecture
19940 // Need to generate something similar to:
19941 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19942 // ; from TEB
19943 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19944 // mov rcx, qword [rdx+rcx*8]
19945 // mov eax, .tls$:tlsvar
19946 // [rax+rcx] contains the address
19947 // Windows 64bit: gs:0x58
19948 // Windows 32bit: fs:__tls_array
19949
19950 SDLoc dl(GA);
19951 SDValue Chain = DAG.getEntryNode();
19952
19953 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19954 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19955 // use its literal value of 0x2C.
19956 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19957 ? Type::getInt8PtrTy(*DAG.getContext(),
19958 256)
19959 : Type::getInt32PtrTy(*DAG.getContext(),
19960 257));
19961
19962 SDValue TlsArray = Subtarget.is64Bit()
19963 ? DAG.getIntPtrConstant(0x58, dl)
19964 : (Subtarget.isTargetWindowsGNU()
19965 ? DAG.getIntPtrConstant(0x2C, dl)
19966 : DAG.getExternalSymbol("_tls_array", PtrVT));
19967
19968 SDValue ThreadPointer =
19969 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19970
19971 SDValue res;
19972 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19973 res = ThreadPointer;
19974 } else {
19975 // Load the _tls_index variable
19976 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19977 if (Subtarget.is64Bit())
19978 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19979 MachinePointerInfo(), MVT::i32);
19980 else
19981 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19982
19983 const DataLayout &DL = DAG.getDataLayout();
19984 SDValue Scale =
19985 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19986 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19987
19988 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19989 }
19990
19991 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19992
19993 // Get the offset of start of .tls section
19994 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19995 GA->getValueType(0),
19996 GA->getOffset(), X86II::MO_SECREL);
19997 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19998
19999 // The address of the thread local variable is the add of the thread
20000 // pointer with the offset of the variable.
20001 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20002 }
20003
20004   llvm_unreachable("TLS not implemented for this target.");
20005}
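
// C-level sketch of the 64-bit Windows implicit-TLS sequence spelled out in
// the asm comment above. All names are hypothetical; the real lowering builds
// the equivalent SelectionDAG nodes instead of running this code.
#include <cstdint>

static char *computeWinTLSAddress(char *Teb /* gs segment base */,
                                  uint32_t TlsIndex /* _tls_index */,
                                  uint64_t SecRelOffset /* var offset in .tls */) {
  // mov rdx, qword [gs:0x58]       ; ThreadLocalStoragePointer in the TEB
  char **TlsSlots = *reinterpret_cast<char ***>(Teb + 0x58);
  // mov rcx, qword [rdx+rcx*8]     ; this module's TLS block (index scaled by 8)
  char *TlsBase = TlsSlots[TlsIndex];
  // lea rax, [rcx + tlsvar@SECREL] ; the variable's offset within the block
  return TlsBase + SecRelOffset;
}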
20006
20007/// Lower SRA_PARTS and friends, which return two i32 values
20008/// and take a 2 x i32 value to shift plus a shift amount.
20009/// TODO: Can this be moved to general expansion code?
20010static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
20011 SDValue Lo, Hi;
20012 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20013 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20014}
20015
20016static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
20017 SelectionDAG &DAG) {
20018 MVT VT = Op.getSimpleValueType();
20019   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
20020          "Unexpected funnel shift opcode!");
20021
20022 SDLoc DL(Op);
20023 SDValue Op0 = Op.getOperand(0);
20024 SDValue Op1 = Op.getOperand(1);
20025 SDValue Amt = Op.getOperand(2);
20026
20027 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
20028
20029 if (VT.isVector()) {
20030     assert(Subtarget.hasVBMI2() && "Expected VBMI2");
20031
20032 if (IsFSHR)
20033 std::swap(Op0, Op1);
20034
20035 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20036 if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
20037 Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
20038 Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
20039 }
20040
20041 SDValue Funnel;
20042 APInt APIntShiftAmt;
20043 MVT ResultVT = Op0.getSimpleValueType();
20044 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
20045 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
20046 Funnel =
20047 DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
20048 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
20049 } else {
20050 if (!Subtarget.hasVLX() && !VT.is512BitVector())
20051 Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
20052 Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
20053 ResultVT, Op0, Op1, Amt);
20054 }
20055 if (!Subtarget.hasVLX() && !VT.is512BitVector())
20056 Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
20057 return Funnel;
20058 }
20059   assert(
20060       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
20061       "Unexpected funnel shift type!");
20062
20063 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
20064 bool OptForSize = DAG.shouldOptForSize();
20065 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
20066
20067 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
20068 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
20069 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
20070 !isa<ConstantSDNode>(Amt)) {
20071 unsigned EltSizeInBits = VT.getScalarSizeInBits();
20072 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
20073 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
20074 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
20075 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
20076 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
20077 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
20078 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
20079 if (IsFSHR) {
20080 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
20081 } else {
20082 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
20083 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
20084 }
20085 return DAG.getZExtOrTrunc(Res, DL, VT);
20086 }
20087
20088 if (VT == MVT::i8 || ExpandFunnel)
20089 return SDValue();
20090
20091 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
20092 if (VT == MVT::i16) {
20093 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
20094 DAG.getConstant(15, DL, Amt.getValueType()));
20095 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
20096 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
20097 }
20098
20099 return Op;
20100}
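
// Scalar illustration of the i8 funnel-shift expansion used above:
// concatenate the operands into a wider integer, shift once, then pick the
// result byte back out. Standalone sketch, not the DAG lowering itself.
#include <cstdint>

static uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Concat = (uint32_t(X) << 8) | Y;   // (aext(x) << bw) | zext(y)
  return uint8_t((Concat << (Z & 7)) >> 8);   // << (z & (bw-1)), then >> bw
}

static uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Concat = (uint32_t(X) << 8) | Y;
  return uint8_t(Concat >> (Z & 7));          // >> (z & (bw-1))
}
// Example: fshl8(0xAB, 0xCD, 4) == 0xBC and fshr8(0xAB, 0xCD, 4) == 0xBC.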
20101
20102// Try to use a packed vector operation to handle i64 on 32-bit targets when
20103// AVX512DQ is enabled.
20104static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
20105 const X86Subtarget &Subtarget) {
20106   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20107           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20108           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20109           Op.getOpcode() == ISD::UINT_TO_FP) &&
20110          "Unexpected opcode!");
20111 bool IsStrict = Op->isStrictFPOpcode();
20112 unsigned OpNo = IsStrict ? 1 : 0;
20113 SDValue Src = Op.getOperand(OpNo);
20114 MVT SrcVT = Src.getSimpleValueType();
20115 MVT VT = Op.getSimpleValueType();
20116
20117 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20118 (VT != MVT::f32 && VT != MVT::f64))
20119 return SDValue();
20120
20121 // Pack the i64 into a vector, do the operation and extract.
20122
20123 // Using 256-bit to ensure result is 128-bits for f32 case.
20124 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20125 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20126 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20127
20128 SDLoc dl(Op);
20129 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20130 if (IsStrict) {
20131 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20132 {Op.getOperand(0), InVec});
20133 SDValue Chain = CvtVec.getValue(1);
20134 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20135 DAG.getIntPtrConstant(0, dl));
20136 return DAG.getMergeValues({Value, Chain}, dl);
20137 }
20138
20139 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20140
20141 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20142 DAG.getIntPtrConstant(0, dl));
20143}
20144
20145// Try to use a packed vector operation to handle i64 on 32-bit targets.
20146static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
20147 const X86Subtarget &Subtarget) {
20148   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20149           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20150           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20151           Op.getOpcode() == ISD::UINT_TO_FP) &&
20152          "Unexpected opcode!");
20153 bool IsStrict = Op->isStrictFPOpcode();
20154 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20155 MVT SrcVT = Src.getSimpleValueType();
20156 MVT VT = Op.getSimpleValueType();
20157
20158 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20159 return SDValue();
20160
20161 // Pack the i64 into a vector, do the operation and extract.
20162
20163   assert(Subtarget.hasFP16() && "Expected FP16");
20164
20165 SDLoc dl(Op);
20166 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20167 if (IsStrict) {
20168 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20169 {Op.getOperand(0), InVec});
20170 SDValue Chain = CvtVec.getValue(1);
20171 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20172 DAG.getIntPtrConstant(0, dl));
20173 return DAG.getMergeValues({Value, Chain}, dl);
20174 }
20175
20176 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20177
20178 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20179 DAG.getIntPtrConstant(0, dl));
20180}
20181
20182static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20183 const X86Subtarget &Subtarget) {
20184 switch (Opcode) {
20185 case ISD::SINT_TO_FP:
20186 // TODO: Handle wider types with AVX/AVX512.
20187 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20188 return false;
20189 // CVTDQ2PS or (V)CVTDQ2PD
20190 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20191
20192 case ISD::UINT_TO_FP:
20193 // TODO: Handle wider types and i64 elements.
20194 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20195 return false;
20196 // VCVTUDQ2PS or VCVTUDQ2PD
20197 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20198
20199 default:
20200 return false;
20201 }
20202}
20203
20204/// Given a scalar cast operation that is extracted from a vector, try to
20205/// vectorize the cast op followed by extraction. This will avoid an expensive
20206/// round-trip between XMM and GPR.
20207static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
20208 const X86Subtarget &Subtarget) {
20209 // TODO: This could be enhanced to handle smaller integer types by peeking
20210 // through an extend.
20211 SDValue Extract = Cast.getOperand(0);
20212 MVT DestVT = Cast.getSimpleValueType();
20213 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20214 !isa<ConstantSDNode>(Extract.getOperand(1)))
20215 return SDValue();
20216
20217 // See if we have a 128-bit vector cast op for this type of cast.
20218 SDValue VecOp = Extract.getOperand(0);
20219 MVT FromVT = VecOp.getSimpleValueType();
20220 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20221 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
20222 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20223 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20224 return SDValue();
20225
20226 // If we are extracting from a non-zero element, first shuffle the source
20227 // vector to allow extracting from element zero.
20228 SDLoc DL(Cast);
20229 if (!isNullConstant(Extract.getOperand(1))) {
20230 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20231 Mask[0] = Extract.getConstantOperandVal(1);
20232 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20233 }
20234 // If the source vector is wider than 128-bits, extract the low part. Do not
20235 // create an unnecessarily wide vector cast op.
20236 if (FromVT != Vec128VT)
20237 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20238
20239 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20240 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20241 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20242 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20243 DAG.getIntPtrConstant(0, DL));
20244}
20245
20246/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20247/// try to vectorize the cast ops. This will avoid an expensive round-trip
20248/// between XMM and GPR.
20249static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
20250 const X86Subtarget &Subtarget) {
20251 // TODO: Allow FP_TO_UINT.
20252 SDValue CastToInt = CastToFP.getOperand(0);
20253 MVT VT = CastToFP.getSimpleValueType();
20254 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
20255 return SDValue();
20256
20257 MVT IntVT = CastToInt.getSimpleValueType();
20258 SDValue X = CastToInt.getOperand(0);
20259 MVT SrcVT = X.getSimpleValueType();
20260 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20261 return SDValue();
20262
20263 // See if we have 128-bit vector cast instructions for this type of cast.
20264 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20265 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20266 IntVT != MVT::i32)
20267 return SDValue();
20268
20269 unsigned SrcSize = SrcVT.getSizeInBits();
20270 unsigned IntSize = IntVT.getSizeInBits();
20271 unsigned VTSize = VT.getSizeInBits();
20272 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
20273 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
20274 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
20275
20276 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
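// The generic FP_TO_SINT/SINT_TO_FP nodes require the source and result
// vectors to have the same number of elements; X86ISD::CVTTP2SI and
// X86ISD::CVTSI2P allow the i32 vector to be wider than the f64 vector and
// operate on the low elements only, which is what the v2f64 <-> v4i32
// round-trip needs.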
20277 unsigned ToIntOpcode =
20278 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20279 unsigned ToFPOpcode =
20280 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20281
20282 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20283 //
20284 // We are not defining the high elements (for example, zero them) because
20285 // that could nullify any performance advantage that we hoped to gain from
20286 // this vector op hack. We do not expect any adverse effects (like denorm
20287 // penalties) with cast ops.
20288 SDLoc DL(CastToFP);
20289 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
20290 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20291 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20292 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20293 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20294}
20295
20296static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
20297 const X86Subtarget &Subtarget) {
20298 SDLoc DL(Op);
20299 bool IsStrict = Op->isStrictFPOpcode();
20300 MVT VT = Op->getSimpleValueType(0);
20301 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20302
20303 if (Subtarget.hasDQI()) {
20304 assert(!Subtarget.hasVLX() && "Unexpected features");
20305
20306 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20307 Src.getSimpleValueType() == MVT::v4i64) &&
20308 "Unsupported custom type");
20309
20310 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20311 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20312 "Unexpected VT!");
20313 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20314
20315 // Need to concat with zero vector for strict fp to avoid spurious
20316 // exceptions.
20317 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20318 : DAG.getUNDEF(MVT::v8i64);
20319 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20320 DAG.getIntPtrConstant(0, DL));
20321 SDValue Res, Chain;
20322 if (IsStrict) {
20323 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20324 {Op->getOperand(0), Src});
20325 Chain = Res.getValue(1);
20326 } else {
20327 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20328 }
20329
20330 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20331 DAG.getIntPtrConstant(0, DL));
20332
20333 if (IsStrict)
20334 return DAG.getMergeValues({Res, Chain}, DL);
20335 return Res;
20336 }
20337
20338 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20339 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20340 if (VT != MVT::v4f32 || IsSigned)
20341 return SDValue();
20342
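// v4i64 -> v4f32 unsigned conversion, element by element: values with the
// sign bit set are first halved with a round-to-odd shift, (Src >> 1) | (Src & 1),
// converted as signed, and then doubled with an FADD; the OR of the low bit
// keeps the final rounding correct. Values below 2^63 are converted directly.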
20343 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20344 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20345 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20346 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20347 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20348 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20349 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20350 SmallVector<SDValue, 4> SignCvts(4);
20351 SmallVector<SDValue, 4> Chains(4);
20352 for (int i = 0; i != 4; ++i) {
20353 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20354 DAG.getIntPtrConstant(i, DL));
20355 if (IsStrict) {
20356 SignCvts[i] =
20357 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20358 {Op.getOperand(0), Elt});
20359 Chains[i] = SignCvts[i].getValue(1);
20360 } else {
20361 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20362 }
20363 }
20364 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20365
20366 SDValue Slow, Chain;
20367 if (IsStrict) {
20368 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20369 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20370 {Chain, SignCvt, SignCvt});
20371 Chain = Slow.getValue(1);
20372 } else {
20373 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20374 }
20375
20376 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20377 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20378
20379 if (IsStrict)
20380 return DAG.getMergeValues({Cvt, Chain}, DL);
20381
20382 return Cvt;
20383}
20384
20385SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20386 SelectionDAG &DAG) const {
20387 bool IsStrict = Op->isStrictFPOpcode();
20388 unsigned OpNo = IsStrict ? 1 : 0;
20389 SDValue Src = Op.getOperand(OpNo);
20390 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20391 MVT SrcVT = Src.getSimpleValueType();
20392 MVT VT = Op.getSimpleValueType();
20393 SDLoc dl(Op);
20394
20395 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20396 return Extract;
20397
20398 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20399 return R;
20400
20401 if (SrcVT.isVector()) {
20402 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20403 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20404 // source for strict FP.
20405 if (IsStrict)
20406 return DAG.getNode(
20407 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20408 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20409 DAG.getUNDEF(SrcVT))});
20410 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20411 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20412 DAG.getUNDEF(SrcVT)));
20413 }
20414 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20415 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20416
20417 return SDValue();
20418 }
20419
20420 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20421 "Unknown SINT_TO_FP to lower!");
20422
20423 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20424
20425 // These are really Legal; return the operand so the caller accepts it as
20426 // Legal.
20427 if (SrcVT == MVT::i32 && UseSSEReg)
20428 return Op;
20429 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20430 return Op;
20431
20432 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20433 return V;
20434 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20435 return V;
20436
20437 // SSE doesn't have an i16 conversion so we need to promote.
20438 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20439 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20440 if (IsStrict)
20441 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20442 {Chain, Ext});
20443
20444 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20445 }
20446
20447 if (VT == MVT::f128)
20448 return SDValue();
20449
20450 SDValue ValueToStore = Src;
20451 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20452 // Bitcasting to f64 here allows us to do a single 64-bit store from
20453 // an SSE register, avoiding the store forwarding penalty that would come
20454 // with two 32-bit stores.
20455 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20456
20457 unsigned Size = SrcVT.getStoreSize();
20458 Align Alignment(Size);
20459 MachineFunction &MF = DAG.getMachineFunction();
20460 auto PtrVT = getPointerTy(MF.getDataLayout());
20461 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20462 MachinePointerInfo MPI =
20463 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20464 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20465 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20466 std::pair<SDValue, SDValue> Tmp =
20467 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20468
20469 if (IsStrict)
20470 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20471
20472 return Tmp.first;
20473}
20474
20475std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20476 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20477 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20478 // Build the FILD
20479 SDVTList Tys;
20480 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20481 if (useSSE)
20482 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20483 else
20484 Tys = DAG.getVTList(DstVT, MVT::Other);
20485
20486 SDValue FILDOps[] = {Chain, Pointer};
20487 SDValue Result =
20488 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20489 Alignment, MachineMemOperand::MOLoad);
20490 Chain = Result.getValue(1);
20491
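// When the destination type lives in SSE registers, the x87 FILD result (f80)
// has to go through memory: store it with FST to a stack slot and reload it as
// DstVT, since there is no direct x87 -> SSE register move.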
20492 if (useSSE) {
20493 MachineFunction &MF = DAG.getMachineFunction();
20494 unsigned SSFISize = DstVT.getStoreSize();
20495 int SSFI =
20496 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20497 auto PtrVT = getPointerTy(MF.getDataLayout());
20498 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20499 Tys = DAG.getVTList(MVT::Other);
20500 SDValue FSTOps[] = {Chain, Result, StackSlot};
20501 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20502 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20503 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20504
20505 Chain =
20506 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20507 Result = DAG.getLoad(
20508 DstVT, DL, Chain, StackSlot,
20509 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20510 Chain = Result.getValue(1);
20511 }
20512
20513 return { Result, Chain };
20514}
20515
20516/// Horizontal vector math instructions may be slower than normal math with
20517/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20518/// implementation, and likely shuffle complexity of the alternate sequence.
20519static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20520 const X86Subtarget &Subtarget) {
20521 bool IsOptimizingSize = DAG.shouldOptForSize();
20522 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20523 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20524}
20525
20526/// 64-bit unsigned integer to double expansion.
20527static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20528 const X86Subtarget &Subtarget) {
20529 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20530 // when converting 0 while rounding toward negative infinity. The caller will
20531 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20532 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20533 // This algorithm is not obvious. Here is what we're trying to output:
20534 /*
20535 movq %rax, %xmm0
20536 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20537 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20538 #ifdef __SSE3__
20539 haddpd %xmm0, %xmm0
20540 #else
20541 pshufd $0x4e, %xmm0, %xmm1
20542 addpd %xmm1, %xmm0
20543 #endif
20544 */
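// Why this works: punpckldq produces the dwords [lo, 0x43300000, hi, 0x45300000],
// i.e. the doubles (2^52 + lo) and (2^84 + hi * 2^32). Subtracting c1 (2^52 and
// 2^84) leaves exactly lo and hi * 2^32, and the final horizontal add yields
// lo + hi * 2^32 -- the original u64 value, with a single rounding in that add.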
20545
20546 SDLoc dl(Op);
20547 LLVMContext *Context = DAG.getContext();
20548
20549 // Build some magic constants.
20550 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20551 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20552 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20553 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20554
20555 SmallVector<Constant*,2> CV1;
20556 CV1.push_back(
20557 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20558 APInt(64, 0x4330000000000000ULL))));
20559 CV1.push_back(
20560 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20561 APInt(64, 0x4530000000000000ULL))));
20562 Constant *C1 = ConstantVector::get(CV1);
20563 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20564
20565 // Load the 64-bit value into an XMM register.
20566 SDValue XR1 =
20567 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20568 SDValue CLod0 = DAG.getLoad(
20569 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20570 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20571 SDValue Unpck1 =
20572 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20573
20574 SDValue CLod1 = DAG.getLoad(
20575 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20576 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20577 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20578 // TODO: Are there any fast-math-flags to propagate here?
20579 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20580 SDValue Result;
20581
20582 if (Subtarget.hasSSE3() &&
20583 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20584 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20585 } else {
20586 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20587 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20588 }
20589 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20590 DAG.getIntPtrConstant(0, dl));
20591 return Result;
20592}
20593
20594/// 32-bit unsigned integer to float expansion.
20595static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20596 const X86Subtarget &Subtarget) {
20597 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20598 SDLoc dl(Op);
20599 // FP constant to bias correct the final result.
20600 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20601 MVT::f64);
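// 0x4330000000000000 is the bit pattern of the double 2^52. OR-ing the
// zero-extended i32 into its low mantissa bits produces exactly 2^52 + x, so
// subtracting the bias below recovers x converted to double.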
20602
20603 // Load the 32-bit value into an XMM register.
20604 SDValue Load =
20605 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20606
20607 // Zero out the upper parts of the register.
20608 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20609
20610 // Or the load with the bias.
20611 SDValue Or = DAG.getNode(
20612 ISD::OR, dl, MVT::v2i64,
20613 DAG.getBitcast(MVT::v2i64, Load),
20614 DAG.getBitcast(MVT::v2i64,
20615 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20616 Or =
20617 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20618 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20619
20620 if (Op.getNode()->isStrictFPOpcode()) {
20621 // Subtract the bias.
20622 // TODO: Are there any fast-math-flags to propagate here?
20623 SDValue Chain = Op.getOperand(0);
20624 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20625 {Chain, Or, Bias});
20626
20627 if (Op.getValueType() == Sub.getValueType())
20628 return Sub;
20629
20630 // Handle final rounding.
20631 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20632 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20633
20634 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20635 }
20636
20637 // Subtract the bias.
20638 // TODO: Are there any fast-math-flags to propagate here?
20639 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20640
20641 // Handle final rounding.
20642 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20643}
20644
20645static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20646 const X86Subtarget &Subtarget,
20647 const SDLoc &DL) {
20648 if (Op.getSimpleValueType() != MVT::v2f64)
20649 return SDValue();
20650
20651 bool IsStrict = Op->isStrictFPOpcode();
20652
20653 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20654 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20655
20656 if (Subtarget.hasAVX512()) {
20657 if (!Subtarget.hasVLX()) {
20658 // Let generic type legalization widen this.
20659 if (!IsStrict)
20660 return SDValue();
20661 // Otherwise pad the integer input with 0s and widen the operation.
20662 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20663 DAG.getConstant(0, DL, MVT::v2i32));
20664 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20665 {Op.getOperand(0), N0});
20666 SDValue Chain = Res.getValue(1);
20667 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20668 DAG.getIntPtrConstant(0, DL));
20669 return DAG.getMergeValues({Res, Chain}, DL);
20670 }
20671
20672 // Legalize to v4i32 type.
20673 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20674 DAG.getUNDEF(MVT::v2i32));
20675 if (IsStrict)
20676 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20677 {Op.getOperand(0), N0});
20678 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20679 }
20680
20681 // Zero extend to v2i64, OR with the floating point representation of 2^52.
20682 // This gives us the floating point equivalent of 2^52 + the i32 integer
20683 // since double has 52 bits of mantissa. Then subtract 2^52 in floating
20684 // point, leaving just our i32 integers in double format.
20685 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20686 SDValue VBias =
20687 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20688 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20689 DAG.getBitcast(MVT::v2i64, VBias));
20690 Or = DAG.getBitcast(MVT::v2f64, Or);
20691
20692 if (IsStrict)
20693 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20694 {Op.getOperand(0), Or, VBias});
20695 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20696}
20697
20698static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20699 const X86Subtarget &Subtarget) {
20700 SDLoc DL(Op);
20701 bool IsStrict = Op->isStrictFPOpcode();
20702 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20703 MVT VecIntVT = V.getSimpleValueType();
20704 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20705 "Unsupported custom type");
20706
20707 if (Subtarget.hasAVX512()) {
20708 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20709 assert(!Subtarget.hasVLX() && "Unexpected features");
20710 MVT VT = Op->getSimpleValueType(0);
20711
20712 // v8i32->v8f64 is legal with AVX512 so just return it.
20713 if (VT == MVT::v8f64)
20714 return Op;
20715
20716 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20717 "Unexpected VT!");
20718 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20719 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20720 // Need to concat with zero vector for strict fp to avoid spurious
20721 // exceptions.
20722 SDValue Tmp =
20723 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20724 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20725 DAG.getIntPtrConstant(0, DL));
20726 SDValue Res, Chain;
20727 if (IsStrict) {
20728 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20729 {Op->getOperand(0), V});
20730 Chain = Res.getValue(1);
20731 } else {
20732 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20733 }
20734
20735 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20736 DAG.getIntPtrConstant(0, DL));
20737
20738 if (IsStrict)
20739 return DAG.getMergeValues({Res, Chain}, DL);
20740 return Res;
20741 }
20742
20743 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20744 Op->getSimpleValueType(0) == MVT::v4f64) {
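// Same 2^52 bias trick as lowerUINT_TO_FP_v2i32 above, applied per 64-bit
// lane: broadcast the bias, OR each zero-extended i32 into its mantissa, and
// subtract the bias again.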
20745 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20746 Constant *Bias = ConstantFP::get(
20747 *DAG.getContext(),
20748 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20749 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20750 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20751 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20752 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20753 SDValue VBias = DAG.getMemIntrinsicNode(
20754 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20755 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20756 MachineMemOperand::MOLoad);
20757
20758 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20759 DAG.getBitcast(MVT::v4i64, VBias));
20760 Or = DAG.getBitcast(MVT::v4f64, Or);
20761
20762 if (IsStrict)
20763 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20764 {Op.getOperand(0), Or, VBias});
20765 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20766 }
20767
20768 // The algorithm is the following:
20769 // #ifdef __SSE4_1__
20770 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20771 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20772 // (uint4) 0x53000000, 0xaa);
20773 // #else
20774 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20775 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20776 // #endif
20777 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20778 // return (float4) lo + fhi;
20779
20780 bool Is128 = VecIntVT == MVT::v4i32;
20781 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20782 // If we convert to something other than the supported type, e.g., to v4f64,
20783 // abort early.
20784 if (VecFloatVT != Op->getSimpleValueType(0))
20785 return SDValue();
20786
20787 // In the #ifdef/#else code, we have in common:
20788 // - The vector of constants:
20789 // -- 0x4b000000
20790 // -- 0x53000000
20791 // - A shift:
20792 // -- v >> 16
20793
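// The constants put the integer halves straight into float mantissas:
// 0x4b000000 is 2^23, so lo reads back as 2^23 + (v & 0xffff), and 0x53000000
// is 2^39, so hi reads back as 2^39 + (v >> 16) * 2^16. Subtracting
// (0x1.0p39f + 0x1.0p23f) from hi and adding lo reassembles the original value.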
20794 // Create the splat vector for 0x4b000000.
20795 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20796 // Create the splat vector for 0x53000000.
20797 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20798
20799 // Create the right shift.
20800 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20801 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20802
20803 SDValue Low, High;
20804 if (Subtarget.hasSSE41()) {
20805 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20806 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20807 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20808 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20809 // Low will be bitcasted right away, so do not bother bitcasting back to its
20810 // original type.
20811 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20812 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20813 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20814 // (uint4) 0x53000000, 0xaa);
20815 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20816 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20817 // High will be bitcasted right away, so do not bother bitcasting back to
20818 // its original type.
20819 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20820 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20821 } else {
20822 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20823 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20824 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20825 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20826
20827 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20828 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20829 }
20830
20831 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20832 SDValue VecCstFSub = DAG.getConstantFP(
20833 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20834
20835 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20836 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20837 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20838 // enabled. See PR24512.
20839 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20840 // TODO: Are there any fast-math-flags to propagate here?
20841 // (float4) lo;
20842 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20843 // return (float4) lo + fhi;
20844 if (IsStrict) {
20845 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20846 {Op.getOperand(0), HighBitcast, VecCstFSub});
20847 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20848 {FHigh.getValue(1), LowBitcast, FHigh});
20849 }
20850
20851 SDValue FHigh =
20852 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20853 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20854}
20855
20856static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20857 const X86Subtarget &Subtarget) {
20858 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20859 SDValue N0 = Op.getOperand(OpNo);
20860 MVT SrcVT = N0.getSimpleValueType();
20861 SDLoc dl(Op);
20862
20863 switch (SrcVT.SimpleTy) {
20864 default:
20865 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20866 case MVT::v2i32:
20867 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20868 case MVT::v4i32:
20869 case MVT::v8i32:
20870 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20871 case MVT::v2i64:
20872 case MVT::v4i64:
20873 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20874 }
20875}
20876
20877SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20878 SelectionDAG &DAG) const {
20879 bool IsStrict = Op->isStrictFPOpcode();
20880 unsigned OpNo = IsStrict ? 1 : 0;
20881 SDValue Src = Op.getOperand(OpNo);
20882 SDLoc dl(Op);
20883 auto PtrVT = getPointerTy(DAG.getDataLayout());
20884 MVT SrcVT = Src.getSimpleValueType();
20885 MVT DstVT = Op->getSimpleValueType(0);
20886 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20887
20888 if (DstVT == MVT::f128)
20889 return SDValue();
20890
20891 if (DstVT.isVector())
20892 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20893
20894 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20895 return Extract;
20896
20897 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20898 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20899 // Conversions from unsigned i32 to f32/f64 are legal,
20900 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20901 return Op;
20902 }
20903
20904 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20905 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20906 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20907 if (IsStrict)
20908 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20909 {Chain, Src});
20910 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20911 }
20912
20913 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20914 return V;
20915 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20916 return V;
20917
20918 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20919 // infinity. It produces -0.0, so disable under strictfp.
20920 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20921 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20922 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20923 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20924 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20925 (DstVT == MVT::f32 || DstVT == MVT::f64))
20926 return SDValue();
20927
20928 // Make a 64-bit buffer, and use it to build an FILD.
20929 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20930 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20931 Align SlotAlign(8);
20932 MachinePointerInfo MPI =
20933 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20934 if (SrcVT == MVT::i32) {
20935 SDValue OffsetSlot =
20936 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20937 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20938 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20939 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20940 std::pair<SDValue, SDValue> Tmp =
20941 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20942 if (IsStrict)
20943 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20944
20945 return Tmp.first;
20946 }
20947
20948 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20949 SDValue ValueToStore = Src;
20950 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20951 // Bitcasting to f64 here allows us to do a single 64-bit store from
20952 // an SSE register, avoiding the store forwarding penalty that would come
20953 // with two 32-bit stores.
20954 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20955 }
20956 SDValue Store =
20957 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20958 // For i64 source, we need to add the appropriate power of 2 if the input
20959 // was negative. We must be careful to do the computation in x87 extended
20960 // precision, not in SSE.
20961 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20962 SDValue Ops[] = { Store, StackSlot };
20963 SDValue Fild =
20964 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20965 SlotAlign, MachineMemOperand::MOLoad);
20966 Chain = Fild.getValue(1);
20967
20968
20969 // Check whether the sign bit is set.
20970 SDValue SignSet = DAG.getSetCC(
20971 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20972 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20973
20974 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
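// FF's high word 0x5F800000 is the IEEE-754 single for 2^64. FILD treated the
// stored u64 as signed, so when the sign bit was set the loaded value is
// (input - 2^64); adding this fudge factor in f80 restores the unsigned result.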
20975 APInt FF(64, 0x5F80000000000000ULL);
20976 SDValue FudgePtr = DAG.getConstantPool(
20977 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20978 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20979
20980 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20981 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20982 SDValue Four = DAG.getIntPtrConstant(4, dl);
20983 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20984 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20985
20986 // Load the value out, extending it from f32 to f80.
20987 SDValue Fudge = DAG.getExtLoad(
20988 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20989 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20990 CPAlignment);
20991 Chain = Fudge.getValue(1);
20992 // Extend everything to 80 bits to force it to be done on x87.
20993 // TODO: Are there any fast-math-flags to propagate here?
20994 if (IsStrict) {
20995 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20996 {Chain, Fild, Fudge});
20997 // STRICT_FP_ROUND can't handle equal types.
20998 if (DstVT == MVT::f80)
20999 return Add;
21000 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21001 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
21002 }
21003 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
21004 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21005 DAG.getIntPtrConstant(0, dl));
21006}
21007
21008// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21009// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21010// just return an SDValue().
21011// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21012// to i16, i32 or i64, and we lower it to a legal sequence and return the
21013// result.
21014SDValue
21015X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21016 bool IsSigned, SDValue &Chain) const {
21017 bool IsStrict = Op->isStrictFPOpcode();
21018 SDLoc DL(Op);
21019
21020 EVT DstTy = Op.getValueType();
21021 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21022 EVT TheVT = Value.getValueType();
21023 auto PtrVT = getPointerTy(DAG.getDataLayout());
21024
21025 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21026 // f16 must be promoted before using the lowering in this routine.
21027 // fp128 does not use this lowering.
21028 return SDValue();
21029 }
21030
21031 // If using FIST to compute an unsigned i64, we'll need some fixup
21032 // to handle values above the maximum signed i64. A FIST is always
21033 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21034 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21035
21036 // FIXME: This does not generate an invalid exception if the input does not
21037 // fit in i32. PR44019
21038 if (!IsSigned && DstTy != MVT::i64) {
21039 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21040 // The low 32 bits of the fist result will have the correct uint32 result.
21041 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21042 DstTy = MVT::i64;
21043 }
21044
21045 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21046 DstTy.getSimpleVT() >= MVT::i16 &&
21047 "Unknown FP_TO_INT to lower!");
21048
21049 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21050 // stack slot.
21051 MachineFunction &MF = DAG.getMachineFunction();
21052 unsigned MemSize = DstTy.getStoreSize();
21053 int SSFI =
21054 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21055 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21056
21057 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21058
21059 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21060
21061 if (UnsignedFixup) {
21062 //
21063 // Conversion to unsigned i64 is implemented with a select,
21064 // depending on whether the source value fits in the range
21065 // of a signed i64. Let Thresh be the FP equivalent of
21066 // 0x8000000000000000ULL.
21067 //
21068 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21069 // FltOfs = (Value >= Thresh) ? Thresh : 0;
21070 // FistSrc = (Value - FltOfs);
21071 // Fist-to-mem64 FistSrc
21072 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21073 // to XOR'ing the high 32 bits with Adjust.
21074 //
21075 // Being a power of 2, Thresh is exactly representable in all FP formats.
21076 // For X87 we'd like to use the smallest FP type for this constant, but
21077 // for DAG type consistency we have to match the FP operand type.
21078
21079 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21080 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
21081 bool LosesInfo = false;
21082 if (TheVT == MVT::f64)
21083 // The rounding mode is irrelevant as the conversion should be exact.
21084 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21085 &LosesInfo);
21086 else if (TheVT == MVT::f80)
21087 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21088 APFloat::rmNearestTiesToEven, &LosesInfo);
21089
21090 assert(Status == APFloat::opOK && !LosesInfo &&
21091 "FP conversion should have been exact");
21092
21093 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21094
21095 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21096 *DAG.getContext(), TheVT);
21097 SDValue Cmp;
21098 if (IsStrict) {
21099 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21100 /*IsSignaling*/ true);
21101 Chain = Cmp.getValue(1);
21102 } else {
21103 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21104 }
21105
21106 // Our preferred lowering of
21107 //
21108 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21109 //
21110 // is
21111 //
21112 // (Value >= Thresh) << 63
21113 //
21114 // but since we can get here after LegalOperations, DAGCombine might do the
21115 // wrong thing if we create a select. So, directly create the preferred
21116 // version.
21117 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21118 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21119 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21120
21121 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21122 DAG.getConstantFP(0.0, DL, TheVT));
21123
21124 if (IsStrict) {
21125 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21126 { Chain, Value, FltOfs });
21127 Chain = Value.getValue(1);
21128 } else
21129 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21130 }
21131
21132 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21133
21134 // FIXME: This causes a redundant load/store if the SSE-class value is already
21135 // in memory, such as if it is on the call stack.
21136 if (isScalarFPTypeInSSEReg(TheVT)) {
21137 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21138 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21139 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21140 SDValue Ops[] = { Chain, StackSlot };
21141
21142 unsigned FLDSize = TheVT.getStoreSize();
21143 assert(FLDSize <= MemSize && "Stack slot not big enough");
21144 MachineMemOperand *MMO = MF.getMachineMemOperand(
21145 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21146 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21147 Chain = Value.getValue(1);
21148 }
21149
21150 // Build the FP_TO_INT*_IN_MEM
21151 MachineMemOperand *MMO = MF.getMachineMemOperand(
21152 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21153 SDValue Ops[] = { Chain, Value, StackSlot };
21154 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21155 DAG.getVTList(MVT::Other),
21156 Ops, DstTy, MMO);
21157
21158 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
21159 Chain = Res.getValue(1);
21160
21161 // If we need an unsigned fixup, XOR the result with adjust.
21162 if (UnsignedFixup)
21163 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21164
21165 return Res;
21166}
21167
21168static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
21169 const X86Subtarget &Subtarget) {
21170 MVT VT = Op.getSimpleValueType();
21171 SDValue In = Op.getOperand(0);
21172 MVT InVT = In.getSimpleValueType();
21173 SDLoc dl(Op);
21174 unsigned Opc = Op.getOpcode();
21175
21176 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21177 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
21178 "Unexpected extension opcode");
21179 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21180 "Expected same number of elements");
21181 assert((VT.getVectorElementType() == MVT::i16 ||
21182 VT.getVectorElementType() == MVT::i32 ||
21183 VT.getVectorElementType() == MVT::i64) &&
21184 "Unexpected element type");
21185 assert((InVT.getVectorElementType() == MVT::i8 ||
21186 InVT.getVectorElementType() == MVT::i16 ||
21187 InVT.getVectorElementType() == MVT::i32) &&
21188 "Unexpected element type");
21189
21190 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
21191
21192 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21193 assert(InVT == MVT::v32i8 && "Unexpected VT!");
21194 return splitVectorIntUnary(Op, DAG);
21195 }
21196
21197 if (Subtarget.hasInt256())
21198 return Op;
21199
21200 // Optimize vectors in AVX mode:
21201 //
21202 // v8i16 -> v8i32
21203 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
21204 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
21205 // Concat upper and lower parts.
21206 //
21207 // v4i32 -> v4i64
21208 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
21209 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
21210 // Concat upper and lower parts.
21211 //
21212 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21213 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21214
21215 // Short-circuit if we can determine that each 128-bit half is the same value.
21216 // Otherwise, this is difficult to match and optimize.
21217 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21218 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21219 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21220
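// Unpacking the high half of In against zero (for zero_extend) or undef (for
// any_extend) interleaves each upper-half element of In with zero/undef;
// bitcasting the result to HalfVT then reinterprets every pair as one widened
// element.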
21221 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21222 SDValue Undef = DAG.getUNDEF(InVT);
21223 bool NeedZero = Opc == ISD::ZERO_EXTEND;
21224 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21225 OpHi = DAG.getBitcast(HalfVT, OpHi);
21226
21227 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21228}
21229
21230// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21231static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21232 const SDLoc &dl, SelectionDAG &DAG) {
21233 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21234 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21235 DAG.getIntPtrConstant(0, dl));
21236 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21237 DAG.getIntPtrConstant(8, dl));
21238 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21239 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21240 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21241 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21242}
21243
21244static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
21245 const X86Subtarget &Subtarget,
21246 SelectionDAG &DAG) {
21247 MVT VT = Op->getSimpleValueType(0);
21248 SDValue In = Op->getOperand(0);
21249 MVT InVT = In.getSimpleValueType();
21250 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21251 SDLoc DL(Op);
21252 unsigned NumElts = VT.getVectorNumElements();
21253
21254 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
21255 // avoids a constant pool load.
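// sign_extend of an i1 mask yields all-ones or all-zeros elements, and the
// logical shift right by (element bits - 1) turns -1 into 1, which is exactly
// the zero-extended value.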
21256 if (VT.getVectorElementType() != MVT::i8) {
21257 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21258 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21259 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21260 }
21261
21262 // Extend VT if BWI is not supported.
21263 MVT ExtVT = VT;
21264 if (!Subtarget.hasBWI()) {
21265 // If v16i32 is to be avoided, we'll need to split and concatenate.
21266 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21267 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21268
21269 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21270 }
21271
21272 // Widen to 512-bits if VLX is not supported.
21273 MVT WideVT = ExtVT;
21274 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21275 NumElts *= 512 / ExtVT.getSizeInBits();
21276 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21277 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
21278 In, DAG.getIntPtrConstant(0, DL));
21279 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
21280 NumElts);
21281 }
21282
21283 SDValue One = DAG.getConstant(1, DL, WideVT);
21284 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21285
21286 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21287
21288 // Truncate if we had to extend above.
21289 if (VT != ExtVT) {
21290 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21291 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21292 }
21293
21294 // Extract back to 128/256-bit if we widened.
21295 if (WideVT != VT)
21296 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21297 DAG.getIntPtrConstant(0, DL));
21298
21299 return SelectedVal;
21300}
21301
21302static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21303 SelectionDAG &DAG) {
21304 SDValue In = Op.getOperand(0);
21305 MVT SVT = In.getSimpleValueType();
21306
21307 if (SVT.getVectorElementType() == MVT::i1)
21308 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
21309
21310 assert(Subtarget.hasAVX() && "Expected AVX support");
21311 return LowerAVXExtend(Op, DAG, Subtarget);
21312}
21313
21314/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21315/// It makes use of the fact that vectors with enough leading sign/zero bits
21316/// prevent the PACKSS/PACKUS from saturating the results.
21317/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21318/// within each 128-bit lane.
21319static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21320 const SDLoc &DL, SelectionDAG &DAG,
21321 const X86Subtarget &Subtarget) {
21322 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21323 "Unexpected PACK opcode");
21324 assert(DstVT.isVector() && "VT not a vector?");
21325
21326 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21327 if (!Subtarget.hasSSE2())
21328 return SDValue();
21329
21330 EVT SrcVT = In.getValueType();
21331
21332 // No truncation required, we might get here due to recursive calls.
21333 if (SrcVT == DstVT)
21334 return In;
21335
21336 // We only support vector truncation to 64bits or greater from a
21337 // 128bits or greater source.
21338 unsigned DstSizeInBits = DstVT.getSizeInBits();
21339 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21340 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
21341 return SDValue();
21342
21343 unsigned NumElems = SrcVT.getVectorNumElements();
21344 if (!isPowerOf2_32(NumElems))
21345 return SDValue();
21346
21347 LLVMContext &Ctx = *DAG.getContext();
21348 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21349 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21350
21351 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21352
21353 // Pack to the largest type possible:
21354 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21355 EVT InVT = MVT::i16, OutVT = MVT::i8;
21356 if (SrcVT.getScalarSizeInBits() > 16 &&
21357 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21358 InVT = MVT::i32;
21359 OutVT = MVT::i16;
21360 }
21361
21362 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
21363 if (SrcVT.is128BitVector()) {
21364 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21365 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21366 In = DAG.getBitcast(InVT, In);
21367 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21368 Res = extractSubVector(Res, 0, DAG, DL, 64);
21369 return DAG.getBitcast(DstVT, Res);
21370 }
21371
21372 // Split lower/upper subvectors.
21373 SDValue Lo, Hi;
21374 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21375
21376 unsigned SubSizeInBits = SrcSizeInBits / 2;
21377 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21378 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21379
21380 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21381 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21382 Lo = DAG.getBitcast(InVT, Lo);
21383 Hi = DAG.getBitcast(InVT, Hi);
21384 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21385 return DAG.getBitcast(DstVT, Res);
21386 }
21387
21388 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21389 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21390 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21391 Lo = DAG.getBitcast(InVT, Lo);
21392 Hi = DAG.getBitcast(InVT, Hi);
21393 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21394
21395 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21396 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21397 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21398 SmallVector<int, 64> Mask;
21399 int Scale = 64 / OutVT.getScalarSizeInBits();
21400 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21401 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21402
21403 if (DstVT.is256BitVector())
21404 return DAG.getBitcast(DstVT, Res);
21405
21406 // If 512bit -> 128bit truncate another stage.
21407 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21408 Res = DAG.getBitcast(PackedVT, Res);
21409 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21410 }
21411
21412 // Recursively pack lower/upper subvectors, concat result and pack again.
21413 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21414 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21415 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21416 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21417
21418 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21419 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21420 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21421}
21422
21423static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21424 const X86Subtarget &Subtarget) {
21425
21426 SDLoc DL(Op);
21427 MVT VT = Op.getSimpleValueType();
21428 SDValue In = Op.getOperand(0);
21429 MVT InVT = In.getSimpleValueType();
21430
21431 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21432
21433 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21434 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21435 if (InVT.getScalarSizeInBits() <= 16) {
21436 if (Subtarget.hasBWI()) {
21437 // legal, will go to VPMOVB2M, VPMOVW2M
21438 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21439 // We need to shift to get the lsb into sign position.
21440 // Shifts of packed bytes are not supported natively, so bitcast to words.
21441 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21442 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21443 DAG.getBitcast(ExtVT, In),
21444 DAG.getConstant(ShiftInx, DL, ExtVT));
21445 In = DAG.getBitcast(InVT, In);
21446 }
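// With the lsb now in the sign position, the signed compare (0 > In) is true
// exactly for the elements whose bit was set; isel matches this pattern to
// VPMOVB2M/VPMOVW2M.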
21447 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21448 In, ISD::SETGT);
21449 }
21450 // Use TESTD/Q, extended vector to packed dword/qword.
21451 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21452 "Unexpected vector type.");
21453 unsigned NumElts = InVT.getVectorNumElements();
21454 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21455 // We need to change to a wider element type that we have support for.
21456 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21457 // For 16 element vectors we extend to v16i32 unless we are explicitly
21458 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21459 // we need to split into two 8 element vectors which we can extend to v8i32,
21460 // truncate and concat the results. There's an additional complication if
21461 // the original type is v16i8. In that case we can't split the v16i8
21462 // directly, so we need to shuffle high elements to low and use
21463 // sign_extend_vector_inreg.
21464 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21465 SDValue Lo, Hi;
21466 if (InVT == MVT::v16i8) {
21467 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21468 Hi = DAG.getVectorShuffle(
21469 InVT, DL, In, In,
21470 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21471 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21472 } else {
21473 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21474 Lo = extract128BitVector(In, 0, DAG, DL);
21475 Hi = extract128BitVector(In, 8, DAG, DL);
21476 }
21477 // We're split now, just emit two truncates and a concat. The two
21478 // truncates will trigger legalization to come back to this function.
21479 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21480 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21481 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21482 }
21483 // We either have 8 elements or we're allowed to use 512-bit vectors.
21484 // If we have VLX, we want to use the narrowest vector that can get the
21485 // job done so we use vXi32.
21486 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21487 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21488 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21489 InVT = ExtVT;
21490 ShiftInx = InVT.getScalarSizeInBits() - 1;
21491 }
21492
21493 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21494 // We need to shift to get the lsb into sign position.
21495 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21496 DAG.getConstant(ShiftInx, DL, InVT));
21497 }
21498 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21499 if (Subtarget.hasDQI())
21500 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21501 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21502}
21503
21504SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21505 SDLoc DL(Op);
21506 MVT VT = Op.getSimpleValueType();
21507 SDValue In = Op.getOperand(0);
21508 MVT InVT = In.getSimpleValueType();
21509 unsigned InNumEltBits = InVT.getScalarSizeInBits();
21510
21511 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21512 "Invalid TRUNCATE operation");
21513
21514 // If we're called by the type legalizer, handle a few cases.
21515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21516 if (!TLI.isTypeLegal(InVT)) {
21517 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21518 VT.is128BitVector()) {
21519 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21520 "Unexpected subtarget!");
21521 // The default behavior is to truncate one step, concatenate, and then
21522 // truncate the remainder. We'd rather produce two 64-bit results and
21523 // concatenate those.
21524 SDValue Lo, Hi;
21525 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21526
21527 EVT LoVT, HiVT;
21528 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21529
21530 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21531 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21532 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21533 }
21534
21535 // Otherwise let default legalization handle it.
21536 return SDValue();
21537 }
21538
21539 if (VT.getVectorElementType() == MVT::i1)
21540 return LowerTruncateVecI1(Op, DAG, Subtarget);
21541
21542 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21543 if (Subtarget.hasAVX512()) {
21544 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21545 assert(VT == MVT::v32i8 && "Unexpected VT!");
21546 return splitVectorIntUnary(Op, DAG);
21547 }
21548
21549 // word to byte only under BWI. Otherwise we have to promote to v16i32
21550 // and then truncate that. But we should only do that if we haven't been
21551 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21552 // handled by isel patterns.
21553 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21554 Subtarget.canExtendTo512DQ())
21555 return Op;
21556 }
21557
21558 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21559 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21560
21561 // Truncate with PACKUS if we are truncating a vector with leading zero bits
21562 // that extend all the way to the packed/truncated value.
21563 // Pre-SSE41 we can only use PACKUSWB.
21564 KnownBits Known = DAG.computeKnownBits(In);
21565 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21566 if (SDValue V =
21567 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21568 return V;
21569
21570 // Truncate with PACKSS if we are truncating a vector with sign-bits that
21571 // extend all the way to the packed/truncated value.
21572 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21573 if (SDValue V =
21574 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21575 return V;
21576
21577 // Handle truncation of V256 to V128 using shuffles.
21578 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21579
21580 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21581 In = DAG.getBitcast(MVT::v8i32, In);
21582
21583 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21584 if (Subtarget.hasInt256()) {
21585 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21586 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21587 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21588 DAG.getIntPtrConstant(0, DL));
21589 }
21590
21591 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21592 DAG.getIntPtrConstant(0, DL));
21593 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21594 DAG.getIntPtrConstant(4, DL));
21595 static const int ShufMask[] = {0, 2, 4, 6};
21596 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21597 }
21598
21599 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21600 In = DAG.getBitcast(MVT::v32i8, In);
21601
21602 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21603 if (Subtarget.hasInt256()) {
21604 // The PSHUFB mask:
21605 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21606 -1, -1, -1, -1, -1, -1, -1, -1,
21607 16, 17, 20, 21, 24, 25, 28, 29,
21608 -1, -1, -1, -1, -1, -1, -1, -1 };
21609 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21610 In = DAG.getBitcast(MVT::v4i64, In);
21611
21612 static const int ShufMask2[] = {0, 2, -1, -1};
21613 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21614 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21615 DAG.getBitcast(MVT::v16i16, In),
21616 DAG.getIntPtrConstant(0, DL));
21617 }
21618
21619 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21620 DAG.getIntPtrConstant(0, DL));
21621 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21622 DAG.getIntPtrConstant(16, DL));
21623
21624 // The PSHUFB mask:
21625 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21626 -1, -1, -1, -1, -1, -1, -1, -1};
21627
21628 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21629 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21630
21631 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21632 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21633
21634 // The MOVLHPS Mask:
21635 static const int ShufMask2[] = {0, 1, 4, 5};
21636 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21637 return DAG.getBitcast(MVT::v8i16, res);
21638 }
21639
21640 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21641 // Use an AND to zero upper bits for PACKUS.
21642 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21643
21644 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21645 DAG.getIntPtrConstant(0, DL));
21646 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21647 DAG.getIntPtrConstant(8, DL));
21648 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21649 }
21650
21651 llvm_unreachable("All 256->128 cases should have been handled above!");
21652}
21653
21654// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21655// behaves on out of range inputs to generate optimized conversions.
21656static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21657 SelectionDAG &DAG,
21658 const X86Subtarget &Subtarget) {
21659 MVT SrcVT = Src.getSimpleValueType();
21660 unsigned DstBits = VT.getScalarSizeInBits();
21661 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21662
21663 // Calculate the converted result for values in the range 0 to
21664 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21665 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21666 SDValue Big =
21667 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21668 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21669 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21670
21671 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21672 // and only if the value was out of range. So we can use that
21673 // as our indicator that we should use "Big" instead of "Small".
21674 //
21675 // Use "Small" if "IsOverflown" has all bits cleared
21676 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21677
21678 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21679 // use the slightly slower blendv select instead.
21680 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21681 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21682 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21683 }
21684
21685 SDValue IsOverflown =
21686 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21687 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21688 return DAG.getNode(ISD::OR, dl, VT, Small,
21689 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21690}
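// Worked example (illustrative, not part of the original source) of the
// select-by-sign trick above, assuming an f32 -> u32 conversion of
// 3000000000.0f:
//   Small = cvttps2dq(3000000000.0f)        = 0x80000000  (out of range)
//   Big   = cvttps2dq(3000000000.0f - 2^31) = 0x32D05E00  (852516352)
//   IsOverflown = Small >> 31 (arithmetic)  = 0xFFFFFFFF
//   Small | (Big & IsOverflown)             = 0xB2D05E00  (3000000000)
// For an in-range input, Small is already correct and its sign bit is clear,
// so IsOverflown is zero and the OR/AND leaves Small unchanged.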
21691
21692SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21693 bool IsStrict = Op->isStrictFPOpcode();
21694 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21695 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21696 MVT VT = Op->getSimpleValueType(0);
21697 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21698 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21699 MVT SrcVT = Src.getSimpleValueType();
21700 SDLoc dl(Op);
21701
21702 SDValue Res;
21703 if (VT.isVector()) {
21704 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21705 MVT ResVT = MVT::v4i32;
21706 MVT TruncVT = MVT::v4i1;
21707 unsigned Opc;
21708 if (IsStrict)
21709 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21710 else
21711 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21712
21713 if (!IsSigned && !Subtarget.hasVLX()) {
21714 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21715 // Widen to 512-bits.
21716 ResVT = MVT::v8i32;
21717 TruncVT = MVT::v8i1;
21718 Opc = Op.getOpcode();
21719 // Need to concat with zero vector for strict fp to avoid spurious
21720 // exceptions.
21721 // TODO: Should we just do this for non-strict as well?
21722 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21723 : DAG.getUNDEF(MVT::v8f64);
21724 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21725 DAG.getIntPtrConstant(0, dl));
21726 }
21727 if (IsStrict) {
21728 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21729 Chain = Res.getValue(1);
21730 } else {
21731 Res = DAG.getNode(Opc, dl, ResVT, Src);
21732 }
21733
21734 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21735 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21736 DAG.getIntPtrConstant(0, dl));
21737 if (IsStrict)
21738 return DAG.getMergeValues({Res, Chain}, dl);
21739 return Res;
21740 }
21741
21742 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21743 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21744 return Op;
21745
21746 MVT ResVT = VT;
21747 MVT EleVT = VT.getVectorElementType();
21748 if (EleVT != MVT::i64)
21749 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21750
21751 if (SrcVT != MVT::v8f16) {
21752 SDValue Tmp =
21753 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21754 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21755 Ops[0] = Src;
21756 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21757 }
21758
21759 if (IsStrict) {
21760 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21761 : X86ISD::STRICT_CVTTP2UI,
21762 dl, {ResVT, MVT::Other}, {Chain, Src});
21763 Chain = Res.getValue(1);
21764 } else {
21765 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21766 ResVT, Src);
21767 }
21768
21769 // TODO: Need to add exception check code for strict FP.
21770 if (EleVT.getSizeInBits() < 16) {
21771 ResVT = MVT::getVectorVT(EleVT, 8);
21772 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21773 }
21774
21775 if (ResVT != VT)
21776 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21777 DAG.getIntPtrConstant(0, dl));
21778
21779 if (IsStrict)
21780 return DAG.getMergeValues({Res, Chain}, dl);
21781 return Res;
21782 }
21783
21784 if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
21785 if (IsStrict) {
21786 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21787 : ISD::STRICT_FP_TO_UINT,
21788 dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
21789 Chain = Res.getValue(1);
21790 } else {
21791 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21792 MVT::v8i32, Src);
21793 }
21794
21795 // TODO: Need to add exception check code for strict FP.
21796 Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
21797
21798 if (IsStrict)
21799 return DAG.getMergeValues({Res, Chain}, dl);
21800 return Res;
21801 }
21802
21803 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21804 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21805 assert(!IsSigned && "Expected unsigned conversion!");
21806 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21807 return Op;
21808 }
21809
21810 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21811 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21812 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21813 Subtarget.useAVX512Regs()) {
21814 assert(!IsSigned && "Expected unsigned conversion!");
21815 assert(!Subtarget.hasVLX() && "Unexpected features!");
21816 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21817 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21818 // Need to concat with zero vector for strict fp to avoid spurious
21819 // exceptions.
21820 // TODO: Should we just do this for non-strict as well?
21821 SDValue Tmp =
21822 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21823 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21824 DAG.getIntPtrConstant(0, dl));
21825
21826 if (IsStrict) {
21827 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21828 {Chain, Src});
21829 Chain = Res.getValue(1);
21830 } else {
21831 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21832 }
21833
21834 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21835 DAG.getIntPtrConstant(0, dl));
21836
21837 if (IsStrict)
21838 return DAG.getMergeValues({Res, Chain}, dl);
21839 return Res;
21840 }
21841
21842 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21843 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21844 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21845 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21846 assert(!Subtarget.hasVLX() && "Unexpected features!");
21847 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21848 // Need to concat with zero vector for strict fp to avoid spurious
21849 // exceptions.
21850 // TODO: Should we just do this for non-strict as well?
21851 SDValue Tmp =
21852 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21853 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21854 DAG.getIntPtrConstant(0, dl));
21855
21856 if (IsStrict) {
21857 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21858 {Chain, Src});
21859 Chain = Res.getValue(1);
21860 } else {
21861 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21862 }
21863
21864 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21865 DAG.getIntPtrConstant(0, dl));
21866
21867 if (IsStrict)
21868 return DAG.getMergeValues({Res, Chain}, dl);
21869 return Res;
21870 }
21871
21872 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21873 if (!Subtarget.hasVLX()) {
21874 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21875 // legalizer and then widened again by vector op legalization.
21876 if (!IsStrict)
21877 return SDValue();
21878
21879 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21880 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21881 {Src, Zero, Zero, Zero});
21882 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21883 {Chain, Tmp});
21884 SDValue Chain = Tmp.getValue(1);
21885 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21886 DAG.getIntPtrConstant(0, dl));
21887 return DAG.getMergeValues({Tmp, Chain}, dl);
21888 }
21889
21890 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21891 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21892 DAG.getUNDEF(MVT::v2f32));
21893 if (IsStrict) {
21894 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21895 : X86ISD::STRICT_CVTTP2UI;
21896 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21897 }
21898 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21899 return DAG.getNode(Opc, dl, VT, Tmp);
21900 }
21901
21902 // Generate optimized instructions for pre AVX512 unsigned conversions from
21903 // vXf32 to vXi32.
21904 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21905 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21906 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21907 assert(!IsSigned && "Expected unsigned conversion!");
21908 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21909 }
21910
21911 return SDValue();
21912 }
21913
21914 assert(!VT.isVector());
21915
21916 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21917
21918 if (!IsSigned && UseSSEReg) {
21919 // Conversions from f32/f64 with AVX512 should be legal.
21920 if (Subtarget.hasAVX512())
21921 return Op;
21922
21923 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21924 // behaves on out of range inputs to generate optimized conversions.
21925 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21926 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21927 unsigned DstBits = VT.getScalarSizeInBits();
21928 APInt UIntLimit = APInt::getSignMask(DstBits);
21929 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21930 DAG.getConstant(UIntLimit, dl, VT));
21931 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21932
21933 // Calculate the converted result for values in the range:
21934 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21935 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21936 SDValue Small =
21937 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21938 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21939 SDValue Big = DAG.getNode(
21940 X86ISD::CVTTS2SI, dl, VT,
21941 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21942 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21943
21944 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21945 // and only if the value was out of range. So we can use that
21946 // as our indicator that we should use "Big" instead of "Small".
21947 //
21948 // Use "Small" if "IsOverflown" has all bits cleared
21949 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21950 SDValue IsOverflown = DAG.getNode(
21951 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21952 return DAG.getNode(ISD::OR, dl, VT, Small,
21953 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21954 }
21955
21956 // Use default expansion for i64.
21957 if (VT == MVT::i64)
21958 return SDValue();
21959
21960 assert(VT == MVT::i32 && "Unexpected VT!");
21961
21962 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21963 // FIXME: This does not generate an invalid exception if the input does not
21964 // fit in i32. PR44019
21965 if (Subtarget.is64Bit()) {
21966 if (IsStrict) {
21967 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21968 {Chain, Src});
21969 Chain = Res.getValue(1);
21970 } else
21971 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21972
21973 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21974 if (IsStrict)
21975 return DAG.getMergeValues({Res, Chain}, dl);
21976 return Res;
21977 }
21978
21979 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21980 // use fisttp which will be handled later.
21981 if (!Subtarget.hasSSE3())
21982 return SDValue();
21983 }
21984
21985 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21986 // FIXME: This does not generate an invalid exception if the input does not
21987 // fit in i16. PR44019
21988 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21989 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21990 if (IsStrict) {
21991 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21992 {Chain, Src});
21993 Chain = Res.getValue(1);
21994 } else
21995 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21996
21997 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21998 if (IsStrict)
21999 return DAG.getMergeValues({Res, Chain}, dl);
22000 return Res;
22001 }
22002
22003 // If this is a FP_TO_SINT using SSEReg we're done.
22004 if (UseSSEReg && IsSigned)
22005 return Op;
22006
22007 // fp128 needs to use a libcall.
22008 if (SrcVT == MVT::f128) {
22009 RTLIB::Libcall LC;
22010 if (IsSigned)
22011 LC = RTLIB::getFPTOSINT(SrcVT, VT);
22012 else
22013 LC = RTLIB::getFPTOUINT(SrcVT, VT);
22014
22015 MakeLibCallOptions CallOptions;
22016 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
22017 SDLoc(Op), Chain);
22018
22019 if (IsStrict)
22020 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
22021
22022 return Tmp.first;
22023 }
22024
22025 // Fall back to X87.
22026 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
22027 if (IsStrict)
22028 return DAG.getMergeValues({V, Chain}, dl);
22029 return V;
22030 }
22031
22032 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
22033}
22034
22035SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
22036 SelectionDAG &DAG) const {
22037 SDValue Src = Op.getOperand(0);
22038 MVT SrcVT = Src.getSimpleValueType();
22039
22040 // If the source is in an SSE register, the node is Legal.
22041 if (isScalarFPTypeInSSEReg(SrcVT))
22042 return Op;
22043
22044 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22045}
22046
22047SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22048 SelectionDAG &DAG) const {
22049 EVT DstVT = N->getValueType(0);
22050 SDValue Src = N->getOperand(0);
22051 EVT SrcVT = Src.getValueType();
22052
22053 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22054 // f16 must be promoted before using the lowering in this routine.
22055 // fp128 does not use this lowering.
22056 return SDValue();
22057 }
22058
22059 SDLoc DL(N);
22060 SDValue Chain = DAG.getEntryNode();
22061
22062 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22063
22064 // If we're converting from SSE, the stack slot needs to hold both types.
22065 // Otherwise it only needs to hold the DstVT.
22066 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22067 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22068 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22069 MachinePointerInfo MPI =
22070 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22071
22072 if (UseSSE) {
22073 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22074 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22075 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22076 SDValue Ops[] = { Chain, StackPtr };
22077
22078 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22079 /*Align*/ None, MachineMemOperand::MOLoad);
22080 Chain = Src.getValue(1);
22081 }
22082
22083 SDValue StoreOps[] = { Chain, Src, StackPtr };
22084 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22085 StoreOps, DstVT, MPI, /*Align*/ None,
22086 MachineMemOperand::MOStore);
22087
22088 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22089}
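// Illustrative note (not part of the original source): lrint/llrint must
// round using the current rounding mode, which is exactly what the x87
// FIST/FISTP store emitted above does. When the source value lives in an
// SSE register it is first spilled to the stack slot and reloaded via FLD,
// since there is no direct SSE-to-x87 register move.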
22090
22091SDValue
22092X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22093 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22094 // but making use of X86 specifics to produce better instruction sequences.
22095 SDNode *Node = Op.getNode();
22096 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22097 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22098 SDLoc dl(SDValue(Node, 0));
22099 SDValue Src = Node->getOperand(0);
22100
22101 // There are three types involved here: SrcVT is the source floating point
22102 // type, DstVT is the type of the result, and TmpVT is the result of the
22103 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22104 // DstVT).
22105 EVT SrcVT = Src.getValueType();
22106 EVT DstVT = Node->getValueType(0);
22107 EVT TmpVT = DstVT;
22108
22109 // This code is only for floats and doubles. Fall back to generic code for
22110 // anything else.
22111 if (!isScalarFPTypeInSSEReg(SrcVT))
22112 return SDValue();
22113
22114 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22115 unsigned SatWidth = SatVT.getScalarSizeInBits();
22116 unsigned DstWidth = DstVT.getScalarSizeInBits();
22117 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22118 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22119 "Expected saturation width smaller than result width");
22120
22121 // Promote result of FP_TO_*INT to at least 32 bits.
22122 if (TmpWidth < 32) {
22123 TmpVT = MVT::i32;
22124 TmpWidth = 32;
22125 }
22126
22127 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22128 // us to use a native signed conversion instead.
22129 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22130 TmpVT = MVT::i64;
22131 TmpWidth = 64;
22132 }
22133
22134 // If the saturation width is smaller than the size of the temporary result,
22135 // we can always use signed conversion, which is native.
22136 if (SatWidth < TmpWidth)
22137 FpToIntOpcode = ISD::FP_TO_SINT;
22138
22139 // Determine minimum and maximum integer values and their corresponding
22140 // floating-point values.
22141 APInt MinInt, MaxInt;
22142 if (IsSigned) {
22143 MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
22144 MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
22145 } else {
22146 MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
22147 MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
22148 }
22149
22150 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22151 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22152
22153 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22154 MinInt, IsSigned, APFloat::rmTowardZero);
22155 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22156 MaxInt, IsSigned, APFloat::rmTowardZero);
22157 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22158 && !(MaxStatus & APFloat::opStatus::opInexact);
22159
22160 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22161 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22162
22163 // If the integer bounds are exactly representable as floats, emit a
22164 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22165 if (AreExactFloatBounds) {
22166 if (DstVT != TmpVT) {
22167 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22168 SDValue MinClamped = DAG.getNode(
22169 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22170 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22171 SDValue BothClamped = DAG.getNode(
22172 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22173 // Convert clamped value to integer.
22174 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22175
22176 // NaN will become INDVAL, with the top bit set and the rest zero.
22177 // Truncation will discard the top bit, resulting in zero.
22178 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22179 }
22180
22181 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22182 SDValue MinClamped = DAG.getNode(
22183 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22184 // Clamp by MaxFloat from above. NaN cannot occur.
22185 SDValue BothClamped = DAG.getNode(
22186 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22187 // Convert clamped value to integer.
22188 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22189
22190 if (!IsSigned) {
22191 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22192 // which is zero.
22193 return FpToInt;
22194 }
22195
22196 // Otherwise, select zero if Src is NaN.
22197 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22198 return DAG.getSelectCC(
22199 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22200 }
22201
22202 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22203 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22204
22205 // Result of direct conversion, which may be selected away.
22206 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22207
22208 if (DstVT != TmpVT) {
22209 // NaN will become INDVAL, with the top bit set and the rest zero.
22210 // Truncation will discard the top bit, resulting in zero.
22211 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22212 }
22213
22214 SDValue Select = FpToInt;
22215 // For signed conversions where we saturate to the same size as the
22216 // result type of the fptoi instructions, INDVAL coincides with integer
22217 // minimum, so we don't need to explicitly check it.
22218 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22219 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22220 // MinInt if Src is NaN.
22221 Select = DAG.getSelectCC(
22222 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22223 }
22224
22225 // If Src OGT MaxFloat, select MaxInt.
22226 Select = DAG.getSelectCC(
22227 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22228
22229 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22230 // is already zero. The promoted case was already handled above.
22231 if (!IsSigned || DstVT != TmpVT) {
22232 return Select;
22233 }
22234
22235 // Otherwise, select 0 if Src is NaN.
22236 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22237 return DAG.getSelectCC(
22238 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22239}
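// Illustrative examples (not part of the original source) of the saturating
// semantics implemented above, for @llvm.fptosi.sat.i8.f32:
//   300.0   -> 127    (clamped to the signed i8 maximum)
//   -1000.0 -> -128   (clamped to the signed i8 minimum)
//   NaN     -> 0      (selected explicitly; NaN compares unordered)
// For the unsigned variant, negative inputs and NaN both map to 0.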
22240
22241SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22242 bool IsStrict = Op->isStrictFPOpcode();
22243
22244 SDLoc DL(Op);
22245 MVT VT = Op.getSimpleValueType();
22246 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22247 MVT SVT = In.getSimpleValueType();
22248
22249 if (VT == MVT::f128)
22250 return SDValue();
22251
22252 if (VT == MVT::f80) {
22253 if (SVT == MVT::f16) {
22254 assert(Subtarget.hasFP16() && "Unexpected features!");
22255 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
22256 MakeLibCallOptions CallOptions;
22257 std::pair<SDValue, SDValue> Tmp =
22258 makeLibCall(DAG, LC, VT, In, CallOptions, DL,
22259 IsStrict ? Op.getOperand(0) : SDValue());
22260 if (IsStrict)
22261 return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
22262 else
22263 return Tmp.first;
22264 }
22265 return Op;
22266 }
22267
22268 if (SVT.getVectorElementType() == MVT::f16) {
22269 assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
22270 if (SVT == MVT::v2f16)
22271 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22272 DAG.getUNDEF(MVT::v2f16));
22273 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22274 DAG.getUNDEF(MVT::v4f16));
22275 if (IsStrict)
22276 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22277 {Op->getOperand(0), Res});
22278 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22279 }
22280
22281 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22282
22283 SDValue Res =
22284 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22285 if (IsStrict)
22286 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22287 {Op->getOperand(0), Res});
22288 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22289}
22290
22291SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22292 bool IsStrict = Op->isStrictFPOpcode();
22293 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22294 MVT VT = Op.getSimpleValueType();
22295 MVT SVT = In.getSimpleValueType();
22296
22297 // It's legal except when f128 is involved or we're converting f80->f16.
22298 if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
22299 return Op;
22300
22301 return SDValue();
22302}
22303
22304static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22305 bool IsStrict = Op->isStrictFPOpcode();
22306 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22307 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22308 "Unexpected VT!");
22309
22310 SDLoc dl(Op);
22311 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22312 DAG.getConstant(0, dl, MVT::v8i16), Src,
22313 DAG.getIntPtrConstant(0, dl));
22314
22315 SDValue Chain;
22316 if (IsStrict) {
22317 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22318 {Op.getOperand(0), Res});
22319 Chain = Res.getValue(1);
22320 } else {
22321 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22322 }
22323
22324 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22325 DAG.getIntPtrConstant(0, dl));
22326
22327 if (IsStrict)
22328 return DAG.getMergeValues({Res, Chain}, dl);
22329
22330 return Res;
22331}
22332
22333static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22334 bool IsStrict = Op->isStrictFPOpcode();
22335 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22336 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22337 "Unexpected VT!");
22338
22339 SDLoc dl(Op);
22340 SDValue Res, Chain;
22341 if (IsStrict) {
22342 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22343 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22344 DAG.getIntPtrConstant(0, dl));
22345 Res = DAG.getNode(
22346 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22347 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22348 Chain = Res.getValue(1);
22349 } else {
22350 // FIXME: Should we use zeros for upper elements for non-strict?
22351 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22352 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22353 DAG.getTargetConstant(4, dl, MVT::i32));
22354 }
22355
22356 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22357 DAG.getIntPtrConstant(0, dl));
22358
22359 if (IsStrict)
22360 return DAG.getMergeValues({Res, Chain}, dl);
22361
22362 return Res;
22363}
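// Illustrative note (not part of the original source): the immediate 4
// passed to CVTPS2PH above has bit 2 set, which tells vcvtps2ph to round
// according to the current MXCSR rounding mode rather than a mode
// statically encoded in the low two immediate bits.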
22364
22365/// Depending on uarch and/or optimizing for size, we might prefer to use a
22366/// vector operation in place of the typical scalar operation.
22367static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
22368 const X86Subtarget &Subtarget) {
22369 // If both operands have other uses, this is probably not profitable.
22370 SDValue LHS = Op.getOperand(0);
22371 SDValue RHS = Op.getOperand(1);
22372 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22373 return Op;
22374
22375 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22376 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22377 if (IsFP && !Subtarget.hasSSE3())
22378 return Op;
22379 if (!IsFP && !Subtarget.hasSSSE3())
22380 return Op;
22381
22382 // Extract from a common vector.
22383 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22384 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22385 LHS.getOperand(0) != RHS.getOperand(0) ||
22386 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22387 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22388 !shouldUseHorizontalOp(true, DAG, Subtarget))
22389 return Op;
22390
22391 // Allow commuted 'hadd' ops.
22392 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22393 unsigned HOpcode;
22394 switch (Op.getOpcode()) {
22395 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22396 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22397 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22398 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22399 default:
22400 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22401 }
22402 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22403 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22404 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22405 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22406 std::swap(LExtIndex, RExtIndex);
22407
22408 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22409 return Op;
22410
22411 SDValue X = LHS.getOperand(0);
22412 EVT VecVT = X.getValueType();
22413 unsigned BitWidth = VecVT.getSizeInBits();
22414 unsigned NumLanes = BitWidth / 128;
22415 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22416 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22417 "Not expecting illegal vector widths here");
22418
22419 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22420 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22421 SDLoc DL(Op);
22422 if (BitWidth == 256 || BitWidth == 512) {
22423 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22424 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22425 LExtIndex %= NumEltsPerLane;
22426 }
22427
22428 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22429 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22430 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22431 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22432 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22433 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22434 DAG.getIntPtrConstant(LExtIndex / 2, DL));
22435}
22436
22437/// Depending on uarch and/or optimizing for size, we might prefer to use a
22438/// vector operation in place of the typical scalar operation.
22439SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22440 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22441 "Only expecting float/double");
22442 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
22443}
22444
22445/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22446/// This mode isn't supported in hardware on X86. But as long as we aren't
22447/// compiling with trapping math, we can emulate this with
22448/// floor(X + copysign(nextafter(0.5, 0.0), X)).
22449static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22450 SDValue N0 = Op.getOperand(0);
22451 SDLoc dl(Op);
22452 MVT VT = Op.getSimpleValueType();
22453
22454 // N0 += copysign(nextafter(0.5, 0.0), N0)
22455 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22456 bool Ignored;
22457 APFloat Point5Pred = APFloat(0.5f);
22458 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22459 Point5Pred.next(/*nextDown*/true);
22460
22461 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22462 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22463 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22464
22465 // Truncate the result to remove fraction.
22466 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22467}
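// Worked example (illustrative, not part of the original source) of why the
// addend is nextafter(0.5, 0.0) rather than 0.5 itself, for f32:
//   X = 0.49999997f (the largest float below 0.5)
//   X + 0.5f        rounds (ties-to-even) to 1.0f  -> FTRUNC gives 1.0
//   X + 0.49999997f = 0.99999994f exactly          -> FTRUNC gives 0.0
// Only the second result matches round-half-away-from-zero for this input.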
22468
22469/// The only differences between FABS and FNEG are the mask and the logic op.
22470/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22471static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22472 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22473 "Wrong opcode for lowering FABS or FNEG.");
22474
22475 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22476
22477 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22478 // into an FNABS. We'll lower the FABS after that if it is still in use.
22479 if (IsFABS)
22480 for (SDNode *User : Op->uses())
22481 if (User->getOpcode() == ISD::FNEG)
22482 return Op;
22483
22484 SDLoc dl(Op);
22485 MVT VT = Op.getSimpleValueType();
22486
22487 bool IsF128 = (VT == MVT::f128);
22488 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22489 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22490 "Unexpected type in LowerFABSorFNEG");
22491
22492 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22493 // decide if we should generate a 16-byte constant mask when we only need 4 or
22494 // 8 bytes for the scalar case.
22495
22496 // There are no scalar bitwise logical SSE/AVX instructions, so we
22497 // generate a 16-byte vector constant and logic op even for the scalar case.
22498 // Using a 16-byte mask allows folding the load of the mask with
22499 // the logic op, so it can save (~4 bytes) on code size.
22500 bool IsFakeVector = !VT.isVector() && !IsF128;
22501 MVT LogicVT = VT;
22502 if (IsFakeVector)
22503 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22504 : (VT == MVT::f32) ? MVT::v4f32
22505 : MVT::v8f16;
22506
22507 unsigned EltBits = VT.getScalarSizeInBits();
22508 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22509 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22510 APInt::getSignMask(EltBits);
22511 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22512 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22513
22514 SDValue Op0 = Op.getOperand(0);
22515 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22516 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22517 IsFNABS ? X86ISD::FOR :
22518 X86ISD::FXOR;
22519 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22520
22521 if (VT.isVector() || IsF128)
22522 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22523
22524 // For the scalar case extend to a 128-bit vector, perform the logic op,
22525 // and extract the scalar result back out.
22526 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22527 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22528 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22529 DAG.getIntPtrConstant(0, dl));
22530}
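// Illustrative note (not part of the original source): for f32 the masks
// built above correspond to
//   FABS : x & 0x7FFFFFFF  (clear the sign bit, X86ISD::FAND / andps)
//   FNEG : x ^ 0x80000000  (flip the sign bit,  X86ISD::FXOR / xorps)
//   FNABS: x | 0x80000000  (set the sign bit,   X86ISD::FOR  / orps)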
22531
22532static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22533 SDValue Mag = Op.getOperand(0);
22534 SDValue Sign = Op.getOperand(1);
22535 SDLoc dl(Op);
22536
22537 // If the sign operand is smaller, extend it first.
22538 MVT VT = Op.getSimpleValueType();
22539 if (Sign.getSimpleValueType().bitsLT(VT))
22540 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22541
22542 // And if it is bigger, shrink it first.
22543 if (Sign.getSimpleValueType().bitsGT(VT))
22544 Sign =
22545 DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22546
22547 // At this point the operands and the result should have the same
22548 // type, and that won't be f80 since that is not custom lowered.
22549 bool IsF128 = (VT == MVT::f128);
22550 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22551 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22552 "Unexpected type in LowerFCOPYSIGN");
22553
22554 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22555
22556 // Perform all scalar logic operations as 16-byte vectors because there are no
22557 // scalar FP logic instructions in SSE.
22558 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22559 // unnecessary splats, but we might miss load folding opportunities. Should
22560 // this decision be based on OptimizeForSize?
22561 bool IsFakeVector = !VT.isVector() && !IsF128;
22562 MVT LogicVT = VT;
22563 if (IsFakeVector)
22564 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22565 : (VT == MVT::f32) ? MVT::v4f32
22566 : MVT::v8f16;
22567
22568 // The mask constants are automatically splatted for vector types.
22569 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22570 SDValue SignMask = DAG.getConstantFP(
22571 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22572 SDValue MagMask = DAG.getConstantFP(
22573 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22574
22575 // First, clear all bits but the sign bit from the second operand (sign).
22576 if (IsFakeVector)
22577 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22578 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22579
22580 // Next, clear the sign bit from the first operand (magnitude).
22581 // TODO: If we had general constant folding for FP logic ops, this check
22582 // wouldn't be necessary.
22583 SDValue MagBits;
22584 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22585 APFloat APF = Op0CN->getValueAPF();
22586 APF.clearSign();
22587 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22588 } else {
22589 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22590 if (IsFakeVector)
22591 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22592 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22593 }
22594
22595 // OR the magnitude value with the sign bit.
22596 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22597 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22598 DAG.getIntPtrConstant(0, dl));
22599}
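// Worked example (illustrative, not part of the original source) at the bit
// level for f32: copysign(1.5f, -0.0f)
//   MagBits = 0x3FC00000 & 0x7FFFFFFF = 0x3FC00000
//   SignBit = 0x80000000 & 0x80000000 = 0x80000000
//   Or      = 0xBFC00000              = -1.5f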
22600
22601static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22602 SDValue N0 = Op.getOperand(0);
22603 SDLoc dl(Op);
22604 MVT VT = Op.getSimpleValueType();
22605
22606 MVT OpVT = N0.getSimpleValueType();
22607 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22608 "Unexpected type for FGETSIGN");
22609
22610 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22611 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22612 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22613 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22614 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22615 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22616 return Res;
22617}
22618
22619/// Helper for creating a X86ISD::SETCC node.
22620static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22621 SelectionDAG &DAG) {
22622 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22623 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22624}
22625
22626/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22627/// style scalarized (associative) reduction patterns. Partial reductions
22628/// are supported when the pointer SrcMask is non-null.
22629/// TODO - move this to SelectionDAG?
22630static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22631 SmallVectorImpl<SDValue> &SrcOps,
22632 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22633 SmallVector<SDValue, 8> Opnds;
22634 DenseMap<SDValue, APInt> SrcOpMap;
22635 EVT VT = MVT::Other;
22636
22637 // Recognize a special case where a vector is cast into a wide integer to
22638 // test all 0s.
22639 assert(Op.getOpcode() == unsigned(BinOp) &&
22640 "Unexpected bit reduction opcode");
22641 Opnds.push_back(Op.getOperand(0));
22642 Opnds.push_back(Op.getOperand(1));
22643
22644 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22645 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22646 // BFS traverse all BinOp operands.
22647 if (I->getOpcode() == unsigned(BinOp)) {
22648 Opnds.push_back(I->getOperand(0));
22649 Opnds.push_back(I->getOperand(1));
22650 // Re-evaluate the number of nodes to be traversed.
22651 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22652 continue;
22653 }
22654
22655 // Quit if a non-EXTRACT_VECTOR_ELT
22656 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22657 return false;
22658
22659 // Quit if without a constant index.
22660 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22661 if (!Idx)
22662 return false;
22663
22664 SDValue Src = I->getOperand(0);
22665 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22666 if (M == SrcOpMap.end()) {
22667 VT = Src.getValueType();
22668 // Quit if not the same type.
22669 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22670 return false;
22671 unsigned NumElts = VT.getVectorNumElements();
22672 APInt EltCount = APInt::getNullValue(NumElts);
22673 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22674 SrcOps.push_back(Src);
22675 }
22676
22677 // Quit if element already used.
22678 unsigned CIdx = Idx->getZExtValue();
22679 if (M->second[CIdx])
22680 return false;
22681 M->second.setBit(CIdx);
22682 }
22683
22684 if (SrcMask) {
22685 // Collect the source partial masks.
22686 for (SDValue &SrcOp : SrcOps)
22687 SrcMask->push_back(SrcOpMap[SrcOp]);
22688 } else {
22689 // Quit if not all elements are used.
22690 for (const auto &I : SrcOpMap)
22691 if (!I.second.isAllOnesValue())
22692 return false;
22693 }
22694
22695 return true;
22696}
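// Illustrative example (not part of the original source) of a reduction tree
// this matcher accepts, sketched at the IR level:
//   %e0 = extractelement <4 x i32> %v, i32 0
//   %e1 = extractelement <4 x i32> %v, i32 1
//   %e2 = extractelement <4 x i32> %v, i32 2
//   %e3 = extractelement <4 x i32> %v, i32 3
//   %a  = or i32 %e0, %e1
//   %b  = or i32 %e2, %e3
//   %r  = or i32 %a, %b
// SrcOps then holds %v, and without a SrcMask every lane of %v must be
// extracted exactly once for the match to succeed.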
22697
22698// Helper function for comparing all bits of a vector against zero.
22699static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22700 const APInt &Mask,
22701 const X86Subtarget &Subtarget,
22702 SelectionDAG &DAG, X86::CondCode &X86CC) {
22703 EVT VT = V.getValueType();
22704 unsigned ScalarSize = VT.getScalarSizeInBits();
22705 if (Mask.getBitWidth() != ScalarSize) {
22706 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22707 return SDValue();
22708 }
22709
22710 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22711 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22712
22713 auto MaskBits = [&](SDValue Src) {
22714 if (Mask.isAllOnesValue())
22715 return Src;
22716 EVT SrcVT = Src.getValueType();
22717 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22718 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22719 };
22720
22721 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22722 if (VT.getSizeInBits() < 128) {
22723 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22724 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22725 return SDValue();
22726 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22727 DAG.getBitcast(IntVT, MaskBits(V)),
22728 DAG.getConstant(0, DL, IntVT));
22729 }
22730
22731 // Quit if not splittable to 128/256-bit vector.
22732 if (!isPowerOf2_32(VT.getSizeInBits()))
22733 return SDValue();
22734
22735 // Split down to 128/256-bit vector.
22736 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22737 while (VT.getSizeInBits() > TestSize) {
22738 auto Split = DAG.SplitVector(V, DL);
22739 VT = Split.first.getValueType();
22740 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22741 }
22742
22743 bool UsePTEST = Subtarget.hasSSE41();
22744 if (UsePTEST) {
22745 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22746 V = DAG.getBitcast(TestVT, MaskBits(V));
22747 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22748 }
22749
22750 // Without PTEST, a masked v2i64 or-reduction is not faster than
22751 // scalarization.
22752 if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
22753 return SDValue();
22754
22755 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22756 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22757 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22758 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22759 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22760 DAG.getConstant(0xFFFF, DL, MVT::i32));
22761}
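// Illustrative note (not part of the original source) on the two paths above,
// for a 128-bit vector V and an all-ones Mask:
//   SSE4.1+: PTEST V, V sets ZF iff (V & V) == 0, i.e. V is all zero.
//   SSE2:    PCMPEQB against zero yields 0xFF for each zero byte, MOVMSK
//            gathers the 16 byte sign bits, and comparing that mask with
//            0xFFFF checks that every byte was zero.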
22762
22763// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
22764// CMP(MOVMSK(PCMPEQB(X,0))).
22765static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22766 const SDLoc &DL,
22767 const X86Subtarget &Subtarget,
22768 SelectionDAG &DAG, SDValue &X86CC) {
22769 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22770
22771 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22772 return SDValue();
22773
22774 // Check whether we're masking/truncating an OR-reduction result, in which
22775 // case track the masked bits.
22776 APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22777 switch (Op.getOpcode()) {
22778 case ISD::TRUNCATE: {
22779 SDValue Src = Op.getOperand(0);
22780 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22781 Op.getScalarValueSizeInBits());
22782 Op = Src;
22783 break;
22784 }
22785 case ISD::AND: {
22786 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22787 Mask = Cst->getAPIntValue();
22788 Op = Op.getOperand(0);
22789 }
22790 break;
22791 }
22792 }
22793
22794 SmallVector<SDValue, 8> VecIns;
22795 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22796 EVT VT = VecIns[0].getValueType();
22797 assert(llvm::all_of(VecIns,
22798 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22799 "Reduction source vector mismatch");
22800
22801 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
22802 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22803 return SDValue();
22804
22805 // If more than one full vector is evaluated, OR them first before PTEST.
22806 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22807 Slot += 2, e += 1) {
22808 // Each iteration will OR 2 nodes and append the result until there is
22809 // only 1 node left, i.e. the final OR'd value of all vectors.
22810 SDValue LHS = VecIns[Slot];
22811 SDValue RHS = VecIns[Slot + 1];
22812 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22813 }
22814
22815 X86::CondCode CCode;
22816 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22817 DAG, CCode)) {
22818 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22819 return V;
22820 }
22821 }
22822
22823 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22824 ISD::NodeType BinOp;
22825 if (SDValue Match =
22826 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22827 X86::CondCode CCode;
22828 if (SDValue V =
22829 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22830 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22831 return V;
22832 }
22833 }
22834 }
22835
22836 return SDValue();
22837}
22838
22839 /// Return true if \c Op has a use that doesn't just read flags.
22840static bool hasNonFlagsUse(SDValue Op) {
22841 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22842 ++UI) {
22843 SDNode *User = *UI;
22844 unsigned UOpNo = UI.getOperandNo();
22845 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22846 // Look past the truncate.
22847 UOpNo = User->use_begin().getOperandNo();
22848 User = *User->use_begin();
22849 }
22850
22851 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22852 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22853 return true;
22854 }
22855 return false;
22856}
22857
22858// Transform to an x86-specific ALU node with flags if there is a chance of
22859// using an RMW op or only the flags are used. Otherwise, leave
22860// the node alone and emit a 'cmp' or 'test' instruction.
22861static bool isProfitableToUseFlagOp(SDValue Op) {
22862 for (SDNode *U : Op->uses())
22863 if (U->getOpcode() != ISD::CopyToReg &&
22864 U->getOpcode() != ISD::SETCC &&
22865 U->getOpcode() != ISD::STORE)
22866 return false;
22867
22868 return true;
22869}
22870
22871/// Emit nodes that will be selected as "test Op0,Op0", or something
22872/// equivalent.
22873static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22874 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22875 // CF and OF aren't always set the way we want. Determine which
22876 // of these we need.
22877 bool NeedCF = false;
22878 bool NeedOF = false;
22879 switch (X86CC) {
22880 default: break;
22881 case X86::COND_A: case X86::COND_AE:
22882 case X86::COND_B: case X86::COND_BE:
22883 NeedCF = true;
22884 break;
22885 case X86::COND_G: case X86::COND_GE:
22886 case X86::COND_L: case X86::COND_LE:
22887 case X86::COND_O: case X86::COND_NO: {
22888 // Check if we really need to set the
22889 // Overflow flag. If NoSignedWrap is present
22890 // that is not actually needed.
22891 switch (Op->getOpcode()) {
22892 case ISD::ADD:
22893 case ISD::SUB:
22894 case ISD::MUL:
22895 case ISD::SHL:
22896 if (Op.getNode()->getFlags().hasNoSignedWrap())
22897 break;
22898 LLVM_FALLTHROUGH;
22899 default:
22900 NeedOF = true;
22901 break;
22902 }
22903 break;
22904 }
22905 }
22906 // See if we can use the EFLAGS value from the operand instead of
22907 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22908 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22909 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22910 // Emit a CMP with 0, which is the TEST pattern.
22911 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22912 DAG.getConstant(0, dl, Op.getValueType()));
22913 }
22914 unsigned Opcode = 0;
22915 unsigned NumOperands = 0;
22916
22917 SDValue ArithOp = Op;
22918
22919 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22920 // which may be the result of a CAST. We use the variable 'Op', which is the
22921 // non-casted variable when we check for possible users.
22922 switch (ArithOp.getOpcode()) {
22923 case ISD::AND:
22924 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22925 // because a TEST instruction will be better.
22926 if (!hasNonFlagsUse(Op))
22927 break;
22928
22929 LLVM_FALLTHROUGH;
22930 case ISD::ADD:
22931 case ISD::SUB:
22932 case ISD::OR:
22933 case ISD::XOR:
22934 if (!isProfitableToUseFlagOp(Op))
22935 break;
22936
22937 // Otherwise use a regular EFLAGS-setting instruction.
22938 switch (ArithOp.getOpcode()) {
22939 default: llvm_unreachable("unexpected operator!");
22940 case ISD::ADD: Opcode = X86ISD::ADD; break;
22941 case ISD::SUB: Opcode = X86ISD::SUB; break;
22942 case ISD::XOR: Opcode = X86ISD::XOR; break;
22943 case ISD::AND: Opcode = X86ISD::AND; break;
22944 case ISD::OR: Opcode = X86ISD::OR; break;
22945 }
22946
22947 NumOperands = 2;
22948 break;
22949 case X86ISD::ADD:
22950 case X86ISD::SUB:
22951 case X86ISD::OR:
22952 case X86ISD::XOR:
22953 case X86ISD::AND:
22954 return SDValue(Op.getNode(), 1);
22955 case ISD::SSUBO:
22956 case ISD::USUBO: {
22957 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22958 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22959 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22960 Op->getOperand(1)).getValue(1);
22961 }
22962 default:
22963 break;
22964 }
22965
22966 if (Opcode == 0) {
22967 // Emit a CMP with 0, which is the TEST pattern.
22968 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22969 DAG.getConstant(0, dl, Op.getValueType()));
22970 }
22971 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22972 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22973
22974 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22975 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22976 return SDValue(New.getNode(), 1);
22977}
22978
22979/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22980/// equivalent.
22981static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22982 const SDLoc &dl, SelectionDAG &DAG,
22983 const X86Subtarget &Subtarget) {
22984 if (isNullConstant(Op1))
22985 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22986
22987 EVT CmpVT = Op0.getValueType();
22988
22989 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22990 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22991
22992 // Only promote the compare up to I32 if it is a 16 bit operation
22993 // with an immediate. 16 bit immediates are to be avoided.
22994 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22995 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22996 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22997 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22998 // Don't do this if the immediate can fit in 8-bits.
22999 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23000 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23001 unsigned ExtendOp =
23002 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23003 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23004 // For equality comparisons try to use SIGN_EXTEND if the input was
23005 // truncated from something with enough sign bits.
23006 if (Op0.getOpcode() == ISD::TRUNCATE) {
23007 SDValue In = Op0.getOperand(0);
23008 unsigned EffBits =
23009 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
23010 if (EffBits <= 16)
23011 ExtendOp = ISD::SIGN_EXTEND;
23012 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23013 SDValue In = Op1.getOperand(0);
23014 unsigned EffBits =
23015 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
23016 if (EffBits <= 16)
23017 ExtendOp = ISD::SIGN_EXTEND;
23018 }
23019 }
23020
23021 CmpVT = MVT::i32;
23022 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23023 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23024 }
23025 }
23026
23027 // Try to shrink i64 compares if the input has enough zero bits.
23028 // FIXME: Do this for non-constant compares for constant on LHS?
23029 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
23030 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23031 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
23032 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23033 CmpVT = MVT::i32;
23034 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23035 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23036 }
23037
23038 // 0-x == y --> x+y == 0
23039 // 0-x != y --> x+y != 0
23040 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23041 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23042 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23043 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23044 return Add.getValue(1);
23045 }
23046
23047 // x == 0-y --> x+y == 0
23048 // x != 0-y --> x+y != 0
23049 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23050 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23051 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23052 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23053 return Add.getValue(1);
23054 }
23055
23056 // Use SUB instead of CMP to enable CSE between SUB and CMP.
23057 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23058 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23059 return Sub.getValue(1);
23060}
23061
23062/// Check if replacement of SQRT with RSQRT should be disabled.
23063bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23064 EVT VT = Op.getValueType();
23065
23066 // We never want to use both SQRT and RSQRT instructions for the same input.
23067 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23068 return false;
23069
23070 if (VT.isVector())
23071 return Subtarget.hasFastVectorFSQRT();
23072 return Subtarget.hasFastScalarFSQRT();
23073}
23074
23075/// The minimum architected relative accuracy is 2^-12. We need one
23076/// Newton-Raphson step to have a good float result (24 bits of precision).
23077SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23078 SelectionDAG &DAG, int Enabled,
23079 int &RefinementSteps,
23080 bool &UseOneConstNR,
23081 bool Reciprocal) const {
23082 EVT VT = Op.getValueType();
23083
23084 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23085 // It is likely not profitable to do this for f64 because a double-precision
23086 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23087 // instructions: convert to single, rsqrtss, convert back to double, refine
23088 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23089 // along with FMA, this could be a throughput win.
23090 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23091 // after legalize types.
23092 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23093 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23094 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23095 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23096 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23097 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23098 RefinementSteps = 1;
23099
23100 UseOneConstNR = false;
23101 // There is no FRSQRT for 512-bits, but there is RSQRT14.
23102 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23103 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
23104 }
23105 return SDValue();
23106}
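
// For reference (an illustrative note): the one Newton-Raphson refinement step
// applied to the hardware estimate x0 ~= 1/sqrt(a) is
//   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
// which roughly doubles the number of correct bits, taking the ~12-bit
// estimate up to the ~24 bits a float mantissa needs.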
23107
23108/// The minimum architected relative accuracy is 2^-12. We need one
23109/// Newton-Raphson step to have a good float result (24 bits of precision).
23110SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23111 int Enabled,
23112 int &RefinementSteps) const {
23113 EVT VT = Op.getValueType();
23114
23115 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23116 // It is likely not profitable to do this for f64 because a double-precision
23117 // reciprocal estimate with refinement on x86 prior to FMA requires
23118 // 15 instructions: convert to single, rcpss, convert back to double, refine
23119 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23120 // along with FMA, this could be a throughput win.
23121
23122 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23123 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23124 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23125 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23126 // Enable estimate codegen with 1 refinement step for vector division.
23127 // Scalar division estimates are disabled because they break too much
23128 // real-world code. These defaults are intended to match GCC behavior.
23129 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23130 return SDValue();
23131
23132 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23133 RefinementSteps = 1;
23134
23135 // There is no FRCP for 512-bits, but there is RCP14.
23136 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23137 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
23138 }
23139 return SDValue();
23140}
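
// For reference (an illustrative note): the corresponding Newton-Raphson step
// for the reciprocal estimate x0 ~= 1/a is
//   x1 = x0 * (2 - a * x0)
// again roughly doubling the number of correct bits per refinement step.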
23141
23142/// If we have at least two divisions that use the same divisor, convert to
23143/// multiplication by a reciprocal. This may need to be adjusted for a given
23144/// CPU if a division's cost is not at least twice the cost of a multiplication.
23145/// This is because we still need one division to calculate the reciprocal and
23146/// then we need two multiplies by that reciprocal as replacements for the
23147/// original divisions.
23148unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23149 return 2;
23150}
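
// Illustrative example (hypothetical source, suitable FP math flags assumed):
// with this threshold of 2, code like
//   q1 = a / d;  q2 = b / d;
// is rewritten by the generic combine as
//   r = 1.0 / d;  q1 = a * r;  q2 = b * r;
// i.e. one divide plus two multiplies instead of two divides.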
23151
23152SDValue
23153X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23154 SelectionDAG &DAG,
23155 SmallVectorImpl<SDNode *> &Created) const {
23156 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23157 if (isIntDivCheap(N->getValueType(0), Attr))
23158 return SDValue(N,0); // Lower SDIV as SDIV
23159
23160 assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
23161 "Unexpected divisor!");
23162
23163 // Only perform this transform if CMOV is supported; otherwise the select
23164 // below will become a branch.
23165 if (!Subtarget.hasCMov())
23166 return SDValue();
23167
23168 // fold (sdiv X, pow2)
23169 EVT VT = N->getValueType(0);
23170 // FIXME: Support i8.
23171 if (VT != MVT::i16 && VT != MVT::i32 &&
23172 !(Subtarget.is64Bit() && VT == MVT::i64))
23173 return SDValue();
23174
23175 unsigned Lg2 = Divisor.countTrailingZeros();
23176
23177 // If the divisor is 2 or -2, the default expansion is better.
23178 if (Lg2 == 1)
23179 return SDValue();
23180
23181 SDLoc DL(N);
23182 SDValue N0 = N->getOperand(0);
23183 SDValue Zero = DAG.getConstant(0, DL, VT);
23184 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
23185 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
23186
23187 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
23188 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
23189 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
23190 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
23191
23192 Created.push_back(Cmp.getNode());
23193 Created.push_back(Add.getNode());
23194 Created.push_back(CMov.getNode());
23195
23196 // Divide by pow2.
23197 SDValue SRA =
23198 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
23199
23200 // If we're dividing by a positive value, we're done. Otherwise, we must
23201 // negate the result.
23202 if (Divisor.isNonNegative())
23203 return SRA;
23204
23205 Created.push_back(SRA.getNode());
23206 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
23207}
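
// Illustrative example (not spelled out in the code above): for a 32-bit
// "x sdiv 8" (Lg2 == 3) the sequence built here is roughly
//   t   = (x < 0) ? x + 7 : x   // CMP + ADD + CMOV
//   res = t >> 3                // arithmetic shift right (SRA)
// with an extra "res = 0 - res" appended when the divisor is negative (-8).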
23208
23209/// Result of 'and' is compared against zero. Change to a BT node if possible.
23210/// Returns the BT node and the condition code needed to use it.
23211static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
23212 const SDLoc &dl, SelectionDAG &DAG,
23213 SDValue &X86CC) {
23214 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23215 SDValue Op0 = And.getOperand(0);
23216 SDValue Op1 = And.getOperand(1);
23217 if (Op0.getOpcode() == ISD::TRUNCATE)
23218 Op0 = Op0.getOperand(0);
23219 if (Op1.getOpcode() == ISD::TRUNCATE)
23220 Op1 = Op1.getOperand(0);
23221
23222 SDValue Src, BitNo;
23223 if (Op1.getOpcode() == ISD::SHL)
23224 std::swap(Op0, Op1);
23225 if (Op0.getOpcode() == ISD::SHL) {
23226 if (isOneConstant(Op0.getOperand(0))) {
23227 // If we looked past a truncate, check that it's only truncating away
23228 // known zeros.
23229 unsigned BitWidth = Op0.getValueSizeInBits();
23230 unsigned AndBitWidth = And.getValueSizeInBits();
23231 if (BitWidth > AndBitWidth) {
23232 KnownBits Known = DAG.computeKnownBits(Op0);
23233 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23234 return SDValue();
23235 }
23236 Src = Op1;
23237 BitNo = Op0.getOperand(1);
23238 }
23239 } else if (Op1.getOpcode() == ISD::Constant) {
23240 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23241 uint64_t AndRHSVal = AndRHS->getZExtValue();
23242 SDValue AndLHS = Op0;
23243
23244 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23245 Src = AndLHS.getOperand(0);
23246 BitNo = AndLHS.getOperand(1);
23247 } else {
23248 // Use BT if the immediate can't be encoded in a TEST instruction or we
23249 // are optimizing for size and the immediate won't fit in a byte.
23250 bool OptForSize = DAG.shouldOptForSize();
23251 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23252 isPowerOf2_64(AndRHSVal)) {
23253 Src = AndLHS;
23254 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23255 Src.getValueType());
23256 }
23257 }
23258 }
23259
23260 // No patterns found, give up.
23261 if (!Src.getNode())
23262 return SDValue();
23263
23264 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
23265 // instruction. Since the shift amount is in-range-or-undefined, we know
23266 // that doing a bittest on the i32 value is ok. We extend to i32 because
23267 // the encoding for the i16 version is larger than the i32 version.
23268 // Also promote i16 to i32 for performance / code size reason.
23269 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
23270 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
23271
23272 // See if we can use the 32-bit instruction instead of the 64-bit one for a
23273 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
23274 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
23275 // known to be zero.
23276 if (Src.getValueType() == MVT::i64 &&
23277 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
23278 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
23279
23280 // If the operand types disagree, extend the shift amount to match. Since
23281 // BT ignores high bits (like shifts) we can use anyextend.
23282 if (Src.getValueType() != BitNo.getValueType())
23283 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
23284
23285 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
23286 dl, MVT::i8);
23287 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
23288}
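
// Illustrative example (editorial sketch): a pattern such as
//   ((x >> n) & 1) != 0
// is matched above into a single X86ISD::BT node, which selects to "bt x, n"
// and reads the tested bit out of CF via the COND_B / COND_AE codes chosen for
// the SETNE / SETEQ forms respectively.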
23289
23290/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23291/// CMPs.
23292static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23293 SDValue &Op1, bool &IsAlwaysSignaling) {
23294 unsigned SSECC;
23295 bool Swap = false;
23296
23297 // SSE Condition code mapping:
23298 // 0 - EQ
23299 // 1 - LT
23300 // 2 - LE
23301 // 3 - UNORD
23302 // 4 - NEQ
23303 // 5 - NLT
23304 // 6 - NLE
23305 // 7 - ORD
23306 switch (SetCCOpcode) {
23307 default: llvm_unreachable("Unexpected SETCC condition");
23308 case ISD::SETOEQ:
23309 case ISD::SETEQ: SSECC = 0; break;
23310 case ISD::SETOGT:
23311 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
23312 case ISD::SETLT:
23313 case ISD::SETOLT: SSECC = 1; break;
23314 case ISD::SETOGE:
23315 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
23316 case ISD::SETLE:
23317 case ISD::SETOLE: SSECC = 2; break;
23318 case ISD::SETUO: SSECC = 3; break;
23319 case ISD::SETUNE:
23320 case ISD::SETNE: SSECC = 4; break;
23321 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
23322 case ISD::SETUGE: SSECC = 5; break;
23323 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
23324 case ISD::SETUGT: SSECC = 6; break;
23325 case ISD::SETO: SSECC = 7; break;
23326 case ISD::SETUEQ: SSECC = 8; break;
23327 case ISD::SETONE: SSECC = 12; break;
23328 }
23329 if (Swap)
23330 std::swap(Op0, Op1);
23331
23332 switch (SetCCOpcode) {
23333 default:
23334 IsAlwaysSignaling = true;
23335 break;
23336 case ISD::SETEQ:
23337 case ISD::SETOEQ:
23338 case ISD::SETUEQ:
23339 case ISD::SETNE:
23340 case ISD::SETONE:
23341 case ISD::SETUNE:
23342 case ISD::SETO:
23343 case ISD::SETUO:
23344 IsAlwaysSignaling = false;
23345 break;
23346 }
23347
23348 return SSECC;
23349}
23350
23351 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23352 /// concatenate the result back.
23353static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23354 ISD::CondCode Cond, SelectionDAG &DAG,
23355 const SDLoc &dl) {
23356 assert(VT.isInteger() && VT == LHS.getValueType() &&
23357 VT == RHS.getValueType() && "Unsupported VTs!");
23358
23359 SDValue CC = DAG.getCondCode(Cond);
23360
23361 // Extract the LHS Lo/Hi vectors
23362 SDValue LHS1, LHS2;
23363 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23364
23365 // Extract the RHS Lo/Hi vectors
23366 SDValue RHS1, RHS2;
23367 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23368
23369 // Issue the operation on the smaller types and concatenate the result back
23370 EVT LoVT, HiVT;
23371 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23372 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23373 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23374 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23375}
23376
23377static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23378
23379 SDValue Op0 = Op.getOperand(0);
23380 SDValue Op1 = Op.getOperand(1);
23381 SDValue CC = Op.getOperand(2);
23382 MVT VT = Op.getSimpleValueType();
23383 SDLoc dl(Op);
23384
23385 assert(VT.getVectorElementType() == MVT::i1 &&
23386 "Cannot set masked compare for this operation");
23387
23388 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23389
23390 // Prefer SETGT over SETLT.
23391 if (SetCCOpcode == ISD::SETLT) {
23392 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23393 std::swap(Op0, Op1);
23394 }
23395
23396 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23397}
23398
23399/// Given a buildvector constant, return a new vector constant with each element
23400/// incremented or decremented. If incrementing or decrementing would result in
23401/// unsigned overflow or underflow or this is not a simple vector constant,
23402/// return an empty value.
23403static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
23404 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23405 if (!BV)
23406 return SDValue();
23407
23408 MVT VT = V.getSimpleValueType();
23409 MVT EltVT = VT.getVectorElementType();
23410 unsigned NumElts = VT.getVectorNumElements();
23411 SmallVector<SDValue, 8> NewVecC;
23412 SDLoc DL(V);
23413 for (unsigned i = 0; i < NumElts; ++i) {
23414 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23415 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23416 return SDValue();
23417
23418 // Avoid overflow/underflow.
23419 const APInt &EltC = Elt->getAPIntValue();
23420 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
23421 return SDValue();
23422
23423 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23424 }
23425
23426 return DAG.getBuildVector(VT, DL, NewVecC);
23427}
23428
23429/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23430/// Op0 u<= Op1:
23431/// t = psubus Op0, Op1
23432/// pcmpeq t, <0..0>
23433static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23434 ISD::CondCode Cond, const SDLoc &dl,
23435 const X86Subtarget &Subtarget,
23436 SelectionDAG &DAG) {
23437 if (!Subtarget.hasSSE2())
23438 return SDValue();
23439
23440 MVT VET = VT.getVectorElementType();
23441 if (VET != MVT::i8 && VET != MVT::i16)
23442 return SDValue();
23443
23444 switch (Cond) {
23445 default:
23446 return SDValue();
23447 case ISD::SETULT: {
23448 // If the comparison is against a constant we can turn this into a
23449 // setule. With psubus, setule does not require a swap. This is
23450 // beneficial because the constant in the register is no longer
23451 // clobbered as the destination, so it can be hoisted out of a loop.
23452 // Only do this pre-AVX since vpcmp* is no longer destructive.
23453 if (Subtarget.hasAVX())
23454 return SDValue();
23455 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
23456 if (!ULEOp1)
23457 return SDValue();
23458 Op1 = ULEOp1;
23459 break;
23460 }
23461 case ISD::SETUGT: {
23462 // If the comparison is against a constant, we can turn this into a setuge.
23463 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23464 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23465 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23466 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23467 if (!UGEOp1)
23468 return SDValue();
23469 Op1 = Op0;
23470 Op0 = UGEOp1;
23471 break;
23472 }
23473 // Psubus is better than flip-sign because it requires no inversion.
23474 case ISD::SETUGE:
23475 std::swap(Op0, Op1);
23476 break;
23477 case ISD::SETULE:
23478 break;
23479 }
23480
23481 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23482 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23483 DAG.getConstant(0, dl, VT));
23484}
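
// Illustrative example (editorial sketch): for v16i8 operands, "Op0 u<= Op1"
// lowers to
//   t = psubusb Op0, Op1   ; unsigned saturating sub, 0 exactly when Op0 <= Op1
//   pcmpeqb t, 0           ; all-ones lanes where the compare is true
// which is the USUBSAT + PCMPEQ pair built above.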
23485
23486static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23487 SelectionDAG &DAG) {
23488 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23489 Op.getOpcode() == ISD::STRICT_FSETCCS;
23490 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23491 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23492 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23493 MVT VT = Op->getSimpleValueType(0);
23494 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23495 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23496 SDLoc dl(Op);
23497
23498 if (isFP) {
23499#ifndef NDEBUG
23500 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23501 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23502#endif
23503
23504 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23505 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23506
23507 // If we have a strict compare with a vXi1 result and the input is 128/256
23508 // bits we can't use a masked compare unless we have VLX. If we use a wider
23509 // compare like we do for non-strict, we might trigger spurious exceptions
23510 // from the upper elements. Instead emit an AVX compare and convert to mask.
23511 unsigned Opc;
23512 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23513 (!IsStrict || Subtarget.hasVLX() ||
23514 Op0.getSimpleValueType().is512BitVector())) {
23515#ifndef NDEBUG
23516 unsigned Num = VT.getVectorNumElements();
23517 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23518#endif
23519 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23520 } else {
23521 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23522 // The SSE/AVX packed FP comparison nodes are defined with a
23523 // floating-point vector result that matches the operand type. This allows
23524 // them to work with an SSE1 target (integer vector types are not legal).
23525 VT = Op0.getSimpleValueType();
23526 }
23527
23528 SDValue Cmp;
23529 bool IsAlwaysSignaling;
23530 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23531 if (!Subtarget.hasAVX()) {
23532 // TODO: We could use the following steps to handle a quiet compare with
23533 // signaling encodings.
23534 // 1. Get ordered masks from a quiet ISD::SETO
23535 // 2. Use the masks to mask potential unordered elements in operands A, B
23536 // 3. Get the compare results of the masked A, B
23537 // 4. Calculate the final result using the mask and result from 3
23538 // But currently, we just fall back to scalar operations.
23539 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23540 return SDValue();
23541
23542 // Insert an extra signaling instruction to raise exception.
23543 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23544 SDValue SignalCmp = DAG.getNode(
23545 Opc, dl, {VT, MVT::Other},
23546 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23547 // FIXME: It seems we need to update the flags of all new strict nodes.
23548 // Otherwise, mayRaiseFPException in MI will return false due to
23549 // NoFPExcept = false by default. However, I didn't find it in other
23550 // patches.
23551 SignalCmp->setFlags(Op->getFlags());
23552 Chain = SignalCmp.getValue(1);
23553 }
23554
23555 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23556 // emit two comparisons and a logic op to tie them together.
23557 if (SSECC >= 8) {
23558 // LLVM predicate is SETUEQ or SETONE.
23559 unsigned CC0, CC1;
23560 unsigned CombineOpc;
23561 if (Cond == ISD::SETUEQ) {
23562 CC0 = 3; // UNORD
23563 CC1 = 0; // EQ
23564 CombineOpc = X86ISD::FOR;
23565 } else {
23566 assert(Cond == ISD::SETONE);
23567 CC0 = 7; // ORD
23568 CC1 = 4; // NEQ
23569 CombineOpc = X86ISD::FAND;
23570 }
23571
23572 SDValue Cmp0, Cmp1;
23573 if (IsStrict) {
23574 Cmp0 = DAG.getNode(
23575 Opc, dl, {VT, MVT::Other},
23576 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23577 Cmp1 = DAG.getNode(
23578 Opc, dl, {VT, MVT::Other},
23579 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23580 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23581 Cmp1.getValue(1));
23582 } else {
23583 Cmp0 = DAG.getNode(
23584 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23585 Cmp1 = DAG.getNode(
23586 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23587 }
23588 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23589 } else {
23590 if (IsStrict) {
23591 Cmp = DAG.getNode(
23592 Opc, dl, {VT, MVT::Other},
23593 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23594 Chain = Cmp.getValue(1);
23595 } else
23596 Cmp = DAG.getNode(
23597 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23598 }
23599 } else {
23600 // Handle all other FP comparisons here.
23601 if (IsStrict) {
23602 // Flip already-signaling CCs before setting bit 4 of the AVX CC.
23603 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23604 Cmp = DAG.getNode(
23605 Opc, dl, {VT, MVT::Other},
23606 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23607 Chain = Cmp.getValue(1);
23608 } else
23609 Cmp = DAG.getNode(
23610 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23611 }
23612
23613 if (VT.getFixedSizeInBits() >
23614 Op.getSimpleValueType().getFixedSizeInBits()) {
23615 // We emitted a compare with an XMM/YMM result. Finish converting to a
23616 // mask register using a vptestm.
23617 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23618 Cmp = DAG.getBitcast(CastVT, Cmp);
23619 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23620 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23621 } else {
23622 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23623 // the result type of SETCC. The bitcast is expected to be optimized
23624 // away during combining/isel.
23625 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23626 }
23627
23628 if (IsStrict)
23629 return DAG.getMergeValues({Cmp, Chain}, dl);
23630
23631 return Cmp;
23632 }
23633
23634 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23635
23636 MVT VTOp0 = Op0.getSimpleValueType();
23637 (void)VTOp0;
23638 assert(VTOp0 == Op1.getSimpleValueType() &&
23639 "Expected operands with same type!");
23640 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23641 "Invalid number of packed elements for source and destination!");
23642
23643 // The non-AVX512 code below works under the assumption that source and
23644 // destination types are the same.
23645 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23646 "Value types for source and destination must be the same!");
23647
23648 // The result is boolean, but operands are int/float
23649 if (VT.getVectorElementType() == MVT::i1) {
23650 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23651 // but there is no compare instruction for i8 and i16 elements in KNL.
23652 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23653 "Unexpected operand type");
23654 return LowerIntVSETCC_AVX512(Op, DAG);
23655 }
23656
23657 // Lower using XOP integer comparisons.
23658 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23659 // Translate compare code to XOP PCOM compare mode.
23660 unsigned CmpMode = 0;
23661 switch (Cond) {
23662 default: llvm_unreachable("Unexpected SETCC condition");
23663 case ISD::SETULT:
23664 case ISD::SETLT: CmpMode = 0x00; break;
23665 case ISD::SETULE:
23666 case ISD::SETLE: CmpMode = 0x01; break;
23667 case ISD::SETUGT:
23668 case ISD::SETGT: CmpMode = 0x02; break;
23669 case ISD::SETUGE:
23670 case ISD::SETGE: CmpMode = 0x03; break;
23671 case ISD::SETEQ: CmpMode = 0x04; break;
23672 case ISD::SETNE: CmpMode = 0x05; break;
23673 }
23674
23675 // Are we comparing unsigned or signed integers?
23676 unsigned Opc =
23677 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23678
23679 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23680 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23681 }
23682
23683 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23684 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23685 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23686 SDValue BC0 = peekThroughBitcasts(Op0);
23687 if (BC0.getOpcode() == ISD::AND) {
23688 APInt UndefElts;
23689 SmallVector<APInt, 64> EltBits;
23690 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23691 VT.getScalarSizeInBits(), UndefElts,
23692 EltBits, false, false)) {
23693 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23694 Cond = ISD::SETEQ;
23695 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23696 }
23697 }
23698 }
23699 }
23700
23701 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23702 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23703 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23704 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23705 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23706 unsigned BitWidth = VT.getScalarSizeInBits();
23707 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23708
23709 SDValue Result = Op0.getOperand(0);
23710 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23711 DAG.getConstant(ShiftAmt, dl, VT));
23712 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23713 DAG.getConstant(BitWidth - 1, dl, VT));
23714 return Result;
23715 }
23716 }
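
// Illustrative example (editorial sketch): for v4i32 and C == 4 (bit 2),
// "(x & 4) == 4" becomes
//   t = x << 29   ; ShiftAmt = 32 - log2(4) - 1, moves bit 2 into the sign bit
//   r = t >> 31   ; arithmetic shift broadcasts it across the lane
// giving an all-ones lane exactly where bit 2 was set.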
23717
23718 // Break 256-bit integer vector compare into smaller ones.
23719 if (VT.is256BitVector() && !Subtarget.hasInt256())
23720 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23721
23722 if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23723 assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23724 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23725 }
23726
23727 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23728 // not-of-PCMPEQ:
23729 // X != INT_MIN --> X >s INT_MIN
23730 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23731 // +X != 0 --> +X >s 0
23732 APInt ConstValue;
23733 if (Cond == ISD::SETNE &&
23734 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23735 if (ConstValue.isMinSignedValue())
23736 Cond = ISD::SETGT;
23737 else if (ConstValue.isMaxSignedValue())
23738 Cond = ISD::SETLT;
23739 else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23740 Cond = ISD::SETGT;
23741 }
23742
23743 // If both operands are known non-negative, then an unsigned compare is the
23744 // same as a signed compare and there's no need to flip signbits.
23745 // TODO: We could check for more general simplifications here since we're
23746 // computing known bits.
23747 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23748 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23749
23750 // Special case: Use min/max operations for unsigned compares.
23751 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23752 if (ISD::isUnsignedIntSetCC(Cond) &&
23753 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23754 TLI.isOperationLegal(ISD::UMIN, VT)) {
23755 // If we have a constant operand, increment/decrement it and change the
23756 // condition to avoid an invert.
23757 if (Cond == ISD::SETUGT) {
23758 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23759 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23760 Op1 = UGTOp1;
23761 Cond = ISD::SETUGE;
23762 }
23763 }
23764 if (Cond == ISD::SETULT) {
23765 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23766 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23767 Op1 = ULTOp1;
23768 Cond = ISD::SETULE;
23769 }
23770 }
23771 bool Invert = false;
23772 unsigned Opc;
23773 switch (Cond) {
23774 default: llvm_unreachable("Unexpected condition code");
23775 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23776 case ISD::SETULE: Opc = ISD::UMIN; break;
23777 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23778 case ISD::SETUGE: Opc = ISD::UMAX; break;
23779 }
23780
23781 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23782 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23783
23784 // If the logical-not of the result is required, perform that now.
23785 if (Invert)
23786 Result = DAG.getNOT(dl, Result, VT);
23787
23788 return Result;
23789 }
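
// Illustrative example (editorial sketch): with SSE4.1's pminud making
// ISD::UMIN legal for v4i32, "x u<= y" is lowered above as
//   t = pminud x, y
//   pcmpeqd x, t      ; x == umin(x, y)  <=>  x u<= y
// and the SETUGT / SETULT forms reuse this pattern with an inverted result.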
23790
23791 // Try to use SUBUS and PCMPEQ.
23792 if (FlipSigns)
23793 if (SDValue V =
23794 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23795 return V;
23796
23797 // We are handling one of the integer comparisons here. Since SSE only has
23798 // GT and EQ comparisons for integer, swapping operands and multiple
23799 // operations may be required for some comparisons.
23800 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23801 : X86ISD::PCMPGT;
23802 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23803 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23804 bool Invert = Cond == ISD::SETNE ||
23805 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23806
23807 if (Swap)
23808 std::swap(Op0, Op1);
23809
23810 // Check that the operation in question is available (most are plain SSE2,
23811 // but PCMPGTQ and PCMPEQQ have different requirements).
23812 if (VT == MVT::v2i64) {
23813 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23814 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23815
23816 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23817 // the odd elements over the even elements.
23818 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23819 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23820 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23821
23822 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23823 static const int MaskHi[] = { 1, 1, 3, 3 };
23824 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23825
23826 return DAG.getBitcast(VT, Result);
23827 }
23828
23829 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23830 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23831 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23832
23833 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23834 static const int MaskHi[] = { 1, 1, 3, 3 };
23835 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23836
23837 return DAG.getBitcast(VT, Result);
23838 }
23839
23840 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23841 // bits of the inputs before performing those operations. The lower
23842 // compare is always unsigned.
23843 SDValue SB;
23844 if (FlipSigns) {
23845 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23846 } else {
23847 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23848 }
23849 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23850 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23851
23852 // Cast everything to the right type.
23853 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23854 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23855
23856 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23857 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23858 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23859
23860 // Create masks for only the low parts/high parts of the 64 bit integers.
23861 static const int MaskHi[] = { 1, 1, 3, 3 };
23862 static const int MaskLo[] = { 0, 0, 2, 2 };
23863 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23864 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23865 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23866
23867 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23868 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23869
23870 if (Invert)
23871 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23872
23873 return DAG.getBitcast(VT, Result);
23874 }
23875
23876 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23877 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23878 // pcmpeqd + pshufd + pand.
23879 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23880
23881 // First cast everything to the right type.
23882 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23883 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23884
23885 // Do the compare.
23886 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23887
23888 // Make sure the lower and upper halves are both all-ones.
23889 static const int Mask[] = { 1, 0, 3, 2 };
23890 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23891 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23892
23893 if (Invert)
23894 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23895
23896 return DAG.getBitcast(VT, Result);
23897 }
23898 }
23899
23900 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23901 // bits of the inputs before performing those operations.
23902 if (FlipSigns) {
23903 MVT EltVT = VT.getVectorElementType();
23904 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23905 VT);
23906 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23907 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23908 }
23909
23910 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23911
23912 // If the logical-not of the result is required, perform that now.
23913 if (Invert)
23914 Result = DAG.getNOT(dl, Result, VT);
23915
23916 return Result;
23917}
23918
23919// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23920static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23921 const SDLoc &dl, SelectionDAG &DAG,
23922 const X86Subtarget &Subtarget,
23923 SDValue &X86CC) {
23924 // Only support equality comparisons.
23925 if (CC != ISD::SETEQ && CC != ISD::SETNE)
23926 return SDValue();
23927
23928 // Must be a bitcast from vXi1.
23929 if (Op0.getOpcode() != ISD::BITCAST)
23930 return SDValue();
23931
23932 Op0 = Op0.getOperand(0);
23933 MVT VT = Op0.getSimpleValueType();
23934 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23935 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23936 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23937 return SDValue();
23938
23939 X86::CondCode X86Cond;
23940 if (isNullConstant(Op1)) {
23941 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23942 } else if (isAllOnesConstant(Op1)) {
23943 // C flag is set for all ones.
23944 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23945 } else
23946 return SDValue();
23947
23948 // If the input is an AND, we can combine its operands into the KTEST.
23949 bool KTestable = false;
23950 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23951 KTestable = true;
23952 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23953 KTestable = true;
23954 if (!isNullConstant(Op1))
23955 KTestable = false;
23956 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23957 SDValue LHS = Op0.getOperand(0);
23958 SDValue RHS = Op0.getOperand(1);
23959 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23960 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23961 }
23962
23963 // If the input is an OR, we can combine its operands into the KORTEST.
23964 SDValue LHS = Op0;
23965 SDValue RHS = Op0;
23966 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23967 LHS = Op0.getOperand(0);
23968 RHS = Op0.getOperand(1);
23969 }
23970
23971 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23972 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23973}
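
// Illustrative example (editorial sketch): comparing a bitcast v16i1 mask
// against zero, e.g. "(bitcast v16i1 %k to i16) == 0", becomes a single
//   kortestw %k, %k
// with the caller reading ZF via COND_E / COND_NE; the all-ones comparison
// uses CF via COND_B / COND_AE instead.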
23974
23975/// Emit flags for the given setcc condition and operands. Also returns the
23976/// corresponding X86 condition code constant in X86CC.
23977SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23978 ISD::CondCode CC, const SDLoc &dl,
23979 SelectionDAG &DAG,
23980 SDValue &X86CC) const {
23981 // Optimize to BT if possible.
23982 // Lower (X & (1 << N)) == 0 to BT(X, N).
23983 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23984 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23985 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23986 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23987 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23988 return BT;
23989 }
23990
23991 // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
23992 // TODO: We could do AND tree with all 1s as well by using the C flag.
23993 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23994 if (SDValue CmpZ =
23995 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23996 return CmpZ;
23997
23998 // Try to lower using KORTEST or KTEST.
23999 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24000 return Test;
24001
24002 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
24003 // these.
24004 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
24005 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24006 // If the input is a setcc, then reuse the input setcc or use a new one with
24007 // the inverted condition.
24008 if (Op0.getOpcode() == X86ISD::SETCC) {
24009 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24010
24011 X86CC = Op0.getOperand(0);
24012 if (Invert) {
24013 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24014 CCode = X86::GetOppositeBranchCondition(CCode);
24015 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24016 }
24017
24018 return Op0.getOperand(1);
24019 }
24020 }
24021
24022 // Try to use the carry flag from the add in place of a separate CMP for:
24023 // (seteq (add X, -1), -1). Similar for setne.
24024 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24025 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24026 if (isProfitableToUseFlagOp(Op0)) {
24027 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24028
24029 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24030 Op0.getOperand(1));
24031 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24032 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24033 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24034 return SDValue(New.getNode(), 1);
24035 }
24036 }
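
// Illustrative note: the trick above works because "add X, -1" sets CF exactly
// when X != 0, so
//   (X + -1) == -1  <=>  X == 0  <=>  CF clear (COND_AE)
// and the setne form correspondingly maps to CF set (COND_B).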
24037
24038 X86::CondCode CondCode =
24039 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24040 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24041
24042 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24043 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24044 return EFLAGS;
24045}
24046
24047SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24048
24049 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24050 Op.getOpcode() == ISD::STRICT_FSETCCS;
24051 MVT VT = Op->getSimpleValueType(0);
24052
24053 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24054
24055 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24056 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24057 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24058 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24059 SDLoc dl(Op);
24060 ISD::CondCode CC =
24061 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24062
24063 // Handle f128 first, since one possible outcome is a normal integer
24064 // comparison which gets handled by emitFlagsForSetcc.
24065 if (Op0.getValueType() == MVT::f128) {
24066 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24067 Op.getOpcode() == ISD::STRICT_FSETCCS);
24068
24069 // If softenSetCCOperands returned a scalar, use it.
24070 if (!Op1.getNode()) {
24071 assert(Op0.getValueType() == Op.getValueType() &&
24072 "Unexpected setcc expansion!");
24073 if (IsStrict)
24074 return DAG.getMergeValues({Op0, Chain}, dl);
24075 return Op0;
24076 }
24077 }
24078
24079 if (Op0.getSimpleValueType().isInteger()) {
24080 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
24081 // reduces the number of EFLAGS bits read (the GE conditions don't read ZF);
24082 // this may translate to fewer uops depending on the uarch implementation. The
24083 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24084 // canonicalize to that CondCode.
24085 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24086 // encoding size - so it must either already be a i8 or i32 immediate, or it
24087 // shrinks down to that. We don't do this for any i64's to avoid additional
24088 // constant materializations.
24089 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24090 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24091 const APInt &Op1Val = Op1C->getAPIntValue();
24092 if (!Op1Val.isNullValue()) {
24093 // Ensure the constant+1 doesn't overflow.
24094 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24095 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24096 APInt Op1ValPlusOne = Op1Val + 1;
24097 if (Op1ValPlusOne.isSignedIntN(32) &&
24098 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24099 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24100 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24101 : ISD::CondCode::SETUGE;
24102 }
24103 }
24104 }
24105 }
24106
24107 SDValue X86CC;
24108 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24109 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24110 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24111 }
24112
24113 // Handle floating point.
24114 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24115 if (CondCode == X86::COND_INVALID)
24116 return SDValue();
24117
24118 SDValue EFLAGS;
24119 if (IsStrict) {
24120 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24121 EFLAGS =
24122 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24123 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24124 Chain = EFLAGS.getValue(1);
24125 } else {
24126 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24127 }
24128
24129 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24130 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24131 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24132}
24133
24134SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24135 SDValue LHS = Op.getOperand(0);
24136 SDValue RHS = Op.getOperand(1);
24137 SDValue Carry = Op.getOperand(2);
24138 SDValue Cond = Op.getOperand(3);
24139 SDLoc DL(Op);
24140
24141 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24142 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24143
24144 // Recreate the carry if needed.
24145 EVT CarryVT = Carry.getValueType();
24146 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24147 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24148
24149 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24150 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24151 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24152}
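
// Illustrative note: the "recreate the carry" step adds all-ones to the
// incoming 0/1 carry value, which sets CF exactly when that carry was 1; the
// SBB then computes LHS - RHS - CF and the final condition is read back from
// its EFLAGS result.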
24153
24154// This function returns three things: the arithmetic computation itself
24155// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24156// flag and the condition code define the case in which the arithmetic
24157// computation overflows.
24158static std::pair<SDValue, SDValue>
24159getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24160 assert(Op.getResNo() == 0 && "Unexpected result number!");
24161 SDValue Value, Overflow;
24162 SDValue LHS = Op.getOperand(0);
24163 SDValue RHS = Op.getOperand(1);
24164 unsigned BaseOp = 0;
24165 SDLoc DL(Op);
24166 switch (Op.getOpcode()) {
24167 default: llvm_unreachable("Unknown ovf instruction!");
24168 case ISD::SADDO:
24169 BaseOp = X86ISD::ADD;
24170 Cond = X86::COND_O;
24171 break;
24172 case ISD::UADDO:
24173 BaseOp = X86ISD::ADD;
24174 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24175 break;
24176 case ISD::SSUBO:
24177 BaseOp = X86ISD::SUB;
24178 Cond = X86::COND_O;
24179 break;
24180 case ISD::USUBO:
24181 BaseOp = X86ISD::SUB;
24182 Cond = X86::COND_B;
24183 break;
24184 case ISD::SMULO:
24185 BaseOp = X86ISD::SMUL;
24186 Cond = X86::COND_O;
24187 break;
24188 case ISD::UMULO:
24189 BaseOp = X86ISD::UMUL;
24190 Cond = X86::COND_O;
24191 break;
24192 }
24193
24194 if (BaseOp) {
24195 // Also sets EFLAGS.
24196 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24197 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24198 Overflow = Value.getValue(1);
24199 }
24200
24201 return std::make_pair(Value, Overflow);
24202}
24203
24204static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24205 // Lower the "add/sub/mul with overflow" instruction into a regular ALU op plus
24206 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24207 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24208 // has only one use.
24209 SDLoc DL(Op);
24210 X86::CondCode Cond;
24211 SDValue Value, Overflow;
24212 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24213
24214 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24215 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24216 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24217}
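
// Illustrative example (register choice below is hypothetical): a plain i32
// "uadd.with.overflow" lowers to an X86ISD::ADD that also defines EFLAGS plus
// a SETCC on COND_B, i.e. roughly
//   addl %esi, %edi
//   setb %al
// and the brcond lowering mentioned above can fold the setb into a jb.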
24218
24219/// Return true if opcode is a X86 logical comparison.
24220static bool isX86LogicalCmp(SDValue Op) {
24221 unsigned Opc = Op.getOpcode();
24222 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24223 Opc == X86ISD::FCMP)
24224 return true;
24225 if (Op.getResNo() == 1 &&
24226 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24227 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24228 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24229 return true;
24230
24231 return false;
24232}
24233
24234static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24235 if (V.getOpcode() != ISD::TRUNCATE)
24236 return false;
24237
24238 SDValue VOp0 = V.getOperand(0);
24239 unsigned InBits = VOp0.getValueSizeInBits();
24240 unsigned Bits = V.getValueSizeInBits();
24241 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24242}
24243
24244SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24245 bool AddTest = true;
24246 SDValue Cond = Op.getOperand(0);
24247 SDValue Op1 = Op.getOperand(1);
24248 SDValue Op2 = Op.getOperand(2);
24249 SDLoc DL(Op);
24250 MVT VT = Op1.getSimpleValueType();
24251 SDValue CC;
24252
24253 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24254 // are available or VBLENDV if AVX is available.
24255 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24256 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24257 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24258 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24259 bool IsAlwaysSignaling;
24260 unsigned SSECC =
24261 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24262 CondOp0, CondOp1, IsAlwaysSignaling);
24263
24264 if (Subtarget.hasAVX512()) {
24265 SDValue Cmp =
24266 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24267 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24268 assert(!VT.isVector() && "Not a scalar type?");
24269 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24270 }
24271
24272 if (SSECC < 8 || Subtarget.hasAVX()) {
24273 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24274 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24275
24276 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24277 // of 3 logic instructions for size savings and potentially speed.
24278 // Unfortunately, there is no scalar form of VBLENDV.
24279
24280 // If either operand is a +0.0 constant, don't try this. We can expect to
24281 // optimize away at least one of the logic instructions later in that
24282 // case, so that sequence would be faster than a variable blend.
24283
24284 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24285 // uses XMM0 as the selection register. That may need just as many
24286 // instructions as the AND/ANDN/OR sequence due to register moves, so
24287 // don't bother.
24288 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24289 !isNullFPConstant(Op2)) {
24290 // Convert to vectors, do a VSELECT, and convert back to scalar.
24291 // All of the conversions should be optimized away.
24292 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24293 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24294 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24295 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24296
24297 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24298 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24299
24300 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24301
24302 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24303 VSel, DAG.getIntPtrConstant(0, DL));
24304 }
24305 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24306 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24307 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24308 }
24309 }
24310
24311 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24312 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24313 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24314 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24315 }
24316
24317 if (Cond.getOpcode() == ISD::SETCC) {
24318 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24319 Cond = NewCond;
24320 // If the condition was updated, it's possible that the operands of the
24321 // select were also updated (for example, EmitTest has a RAUW). Refresh
24322 // the local references to the select operands in case they got stale.
24323 Op1 = Op.getOperand(1);
24324 Op2 = Op.getOperand(2);
24325 }
24326 }
24327
24328 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24329 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24330 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24331 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24332 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24333 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24334 if (Cond.getOpcode() == X86ISD::SETCC &&
24335 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24336 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24337 SDValue Cmp = Cond.getOperand(1);
24338 SDValue CmpOp0 = Cmp.getOperand(0);
24339 unsigned CondCode = Cond.getConstantOperandVal(0);
24340
24341 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24342 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24343 // handling to keep the CMP with 0. This should be removed by
24344 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24345 // cttz_zero_undef.
24346 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24347 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24348 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24349 };
24350 if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
24351 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24352 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24353 // Keep Cmp.
24354 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24355 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24356 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24357
24358 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24359 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24360
24361 // Apply further optimizations for special cases
24362 // (select (x != 0), -1, 0) -> neg & sbb
24363 // (select (x == 0), 0, -1) -> neg & sbb
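 // NEG sets the carry flag exactly when x != 0, so a subsequent SBB of
 // zero from zero materializes 0 or all-ones directly.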
24364 if (isNullConstant(Y) &&
24365 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
24366 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24367 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24368 Zero = DAG.getConstant(0, DL, Op.getValueType());
24369 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
24370 }
24371
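 // Subtracting 1 from CmpOp0 borrows (sets CF) exactly when CmpOp0 == 0,
 // so the SBB below produces all-ones for x == 0 and zero otherwise.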
24372 Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
24373 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
24374
24375 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
24376 SDValue Res = // Res = 0 or -1.
24377 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
24378
24379 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
24380 Res = DAG.getNOT(DL, Res, Res.getValueType());
24381
24382 return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
24383 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
24384 Cmp.getOperand(0).getOpcode() == ISD::AND &&
24385 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
24386 SDValue Src1, Src2;
24387 // Returns true if Op2 is an XOR or OR operation and one of its operands
24388 // is equal to Op1, i.e. the pattern is
24389 // (a, a op b) || (b, a op b)
24390 auto isOrXorPattern = [&]() {
24391 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24392 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24393 Src1 =
24394 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24395 Src2 = Op1;
24396 return true;
24397 }
24398 return false;
24399 };
24400
24401 if (isOrXorPattern()) {
24402 SDValue Neg;
24403 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24404 // We need a mask of all zeros or all ones with the same size as the
24405 // other operands.
24406 if (CmpSz > VT.getSizeInBits())
24407 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24408 else if (CmpSz < VT.getSizeInBits())
24409 Neg = DAG.getNode(ISD::AND, DL, VT,
24410 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24411 DAG.getConstant(1, DL, VT));
24412 else
24413 Neg = CmpOp0;
24414 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
24415 Neg); // -(and (x, 0x1))
24416 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24417 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24418 }
24419 }
24420 }
24421
24422 // Look past (and (setcc_carry (cmp ...)), 1).
24423 if (Cond.getOpcode() == ISD::AND &&
24424 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24425 isOneConstant(Cond.getOperand(1)))
24426 Cond = Cond.getOperand(0);
24427
24428 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24429 // setting operand in place of the X86ISD::SETCC.
24430 unsigned CondOpcode = Cond.getOpcode();
24431 if (CondOpcode == X86ISD::SETCC ||
24432 CondOpcode == X86ISD::SETCC_CARRY) {
24433 CC = Cond.getOperand(0);
24434
24435 SDValue Cmp = Cond.getOperand(1);
24436 bool IllegalFPCMov = false;
24437 if (VT.isFloatingPoint() && !VT.isVector() &&
24438 !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
24439 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24440
24441 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24442 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24443 Cond = Cmp;
24444 AddTest = false;
24445 }
24446 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24447 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24448 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24449 SDValue Value;
24450 X86::CondCode X86Cond;
24451 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24452
24453 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24454 AddTest = false;
24455 }
24456
24457 if (AddTest) {
24458 // Look past the truncate if the high bits are known zero.
24459 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24460 Cond = Cond.getOperand(0);
24461
24462 // We know the result of AND is compared against zero. Try to match
24463 // it to BT.
24464 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24465 SDValue BTCC;
24466 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
24467 CC = BTCC;
24468 Cond = BT;
24469 AddTest = false;
24470 }
24471 }
24472 }
24473
24474 if (AddTest) {
24475 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24476 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24477 }
24478
24479 // a < b ? -1 : 0 -> RES = ~setcc_carry
24480 // a < b ? 0 : -1 -> RES = setcc_carry
24481 // a >= b ? -1 : 0 -> RES = setcc_carry
24482 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24483 if (Cond.getOpcode() == X86ISD::SUB) {
24484 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24485
24486 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24487 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24488 (isNullConstant(Op1) || isNullConstant(Op2))) {
24489 SDValue Res =
24490 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24491 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24492 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24493 return DAG.getNOT(DL, Res, Res.getValueType());
24494 return Res;
24495 }
24496 }
24497
24498 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24499 // widen the cmov and push the truncate through. This avoids introducing a new
24500 // branch during isel and doesn't add any extensions.
24501 if (Op.getValueType() == MVT::i8 &&
24502 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24503 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24504 if (T1.getValueType() == T2.getValueType() &&
24505 // Exclude CopyFromReg to avoid partial register stalls.
24506 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24507 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24508 CC, Cond);
24509 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24510 }
24511 }
24512
24513 // Or finally, promote i8 cmovs if we have CMOV,
24514 // or i16 cmovs if it won't prevent folding a load.
24515 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24516 // legal, but EmitLoweredSelect() cannot deal with these extensions
24517 // being inserted between two CMOVs (the i16 case has the same problem).
24518 // https://bugs.llvm.org/show_bug.cgi?id=40974
24519 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
24520 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
24521 !MayFoldLoad(Op2))) {
24522 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24523 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24524 SDValue Ops[] = { Op2, Op1, CC, Cond };
24525 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24526 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24527 }
24528
24529 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24530 // condition is true.
24531 SDValue Ops[] = { Op2, Op1, CC, Cond };
24532 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24533}
24534
24535static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24536 const X86Subtarget &Subtarget,
24537 SelectionDAG &DAG) {
24538 MVT VT = Op->getSimpleValueType(0);
24539 SDValue In = Op->getOperand(0);
24540 MVT InVT = In.getSimpleValueType();
24541 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24542 MVT VTElt = VT.getVectorElementType();
24543 SDLoc dl(Op);
24544
24545 unsigned NumElts = VT.getVectorNumElements();
24546
24547 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24548 MVT ExtVT = VT;
24549 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24550 // If v16i32 is to be avoided, we'll need to split and concatenate.
24551 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24552 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24553
24554 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24555 }
24556
24557 // Widen to 512-bits if VLX is not supported.
24558 MVT WideVT = ExtVT;
24559 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24560 NumElts *= 512 / ExtVT.getSizeInBits();
24561 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24562 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24563 In, DAG.getIntPtrConstant(0, dl));
24564 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24565 }
24566
24567 SDValue V;
24568 MVT WideEltVT = WideVT.getVectorElementType();
24569 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24570 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24571 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24572 } else {
24573 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24574 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24575 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24576 }
24577
24578 // Truncate if we had to extend i16/i8 above.
24579 if (VT != ExtVT) {
24580 WideVT = MVT::getVectorVT(VTElt, NumElts);
24581 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24582 }
24583
24584 // Extract back to 128/256-bit if we widened.
24585 if (WideVT != VT)
24586 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24587 DAG.getIntPtrConstant(0, dl));
24588
24589 return V;
24590}
24591
24592static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24593 SelectionDAG &DAG) {
24594 SDValue In = Op->getOperand(0);
24595 MVT InVT = In.getSimpleValueType();
24596
24597 if (InVT.getVectorElementType() == MVT::i1)
24598 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24599
24600 assert(Subtarget.hasAVX() && "Expected AVX support");
24601 return LowerAVXExtend(Op, DAG, Subtarget);
24602}
24603
24604// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24605// For sign extend this needs to handle all vector sizes and SSE4.1 and
24606// non-SSE4.1 targets. For zero extend this should only handle inputs of
24607// MVT::v64i8 when BWI is not supported, but AVX512 is.
24608static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24609 const X86Subtarget &Subtarget,
24610 SelectionDAG &DAG) {
24611 SDValue In = Op->getOperand(0);
24612 MVT VT = Op->getSimpleValueType(0);
24613 MVT InVT = In.getSimpleValueType();
24614
24615 MVT SVT = VT.getVectorElementType();
24616 MVT InSVT = InVT.getVectorElementType();
24617 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24618
24619 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24620 return SDValue();
24621 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24622 return SDValue();
24623 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24624 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24625 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24626 return SDValue();
24627
24628 SDLoc dl(Op);
24629 unsigned Opc = Op.getOpcode();
24630 unsigned NumElts = VT.getVectorNumElements();
24631
24632 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24633 // For 512-bit vectors, we need 128-bits or 256-bits.
24634 if (InVT.getSizeInBits() > 128) {
24635 // Input needs to be at least the same number of elements as output, and
24636 // at least 128-bits.
24637 int InSize = InSVT.getSizeInBits() * NumElts;
24638 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24639 InVT = In.getSimpleValueType();
24640 }
24641
24642 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
24643 // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
24644 // instructions still need to be handled here for 256/512-bit results.
24645 if (Subtarget.hasInt256()) {
24646 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24647
24648 if (InVT.getVectorNumElements() != NumElts)
24649 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24650
24651 // FIXME: Apparently we create inreg operations that could be regular
24652 // extends.
24653 unsigned ExtOpc =
24654 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24655 : ISD::ZERO_EXTEND;
24656 return DAG.getNode(ExtOpc, dl, VT, In);
24657 }
24658
24659 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24660 if (Subtarget.hasAVX()) {
24661 assert(VT.is256BitVector() && "256-bit vector expected");
24662 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24663 int HalfNumElts = HalfVT.getVectorNumElements();
24664
24665 unsigned NumSrcElts = InVT.getVectorNumElements();
24666 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24667 for (int i = 0; i != HalfNumElts; ++i)
24668 HiMask[i] = HalfNumElts + i;
24669
24670 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24671 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24672 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24673 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24674 }
24675
24676 // We should only get here for sign extend.
24677 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24678 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24679
24680 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24681 SDValue Curr = In;
24682 SDValue SignExt = Curr;
24683
24684 // As SRAI is only available on i16/i32 types, we expand only up to i32
24685 // and handle i64 separately.
24686 if (InVT != MVT::v4i32) {
24687 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24688
24689 unsigned DestWidth = DestVT.getScalarSizeInBits();
24690 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24691
24692 unsigned InNumElts = InVT.getVectorNumElements();
24693 unsigned DestElts = DestVT.getVectorNumElements();
24694
24695 // Build a shuffle mask that takes each input element and places it in the
24696 // MSBs of the new element size.
24697 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24698 for (unsigned i = 0; i != DestElts; ++i)
24699 Mask[i * Scale + (Scale - 1)] = i;
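 // For example, extending v8i16 -> v4i32 gives Scale == 2 and
 // Mask = {-1, 0, -1, 1, -1, 2, -1, 3}, placing each i16 element in the
 // high half of its i32 lane so the arithmetic shift below sign-fills it.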
24700
24701 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24702 Curr = DAG.getBitcast(DestVT, Curr);
24703
24704 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24705 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24706 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24707 }
24708
24709 if (VT == MVT::v2i64) {
24710 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
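 // Compute the sign of each 32-bit element with a (0 > Curr) compare and
 // interleave it above the low words to form the sign-extended i64 results.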
24711 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24712 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24713 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24714 SignExt = DAG.getBitcast(VT, SignExt);
24715 }
24716
24717 return SignExt;
24718}
24719
24720static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24721 SelectionDAG &DAG) {
24722 MVT VT = Op->getSimpleValueType(0);
24723 SDValue In = Op->getOperand(0);
24724 MVT InVT = In.getSimpleValueType();
24725 SDLoc dl(Op);
24726
24727 if (InVT.getVectorElementType() == MVT::i1)
24728 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24729
24730 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24731 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24732 "Expected same number of elements");
24733 assert((VT.getVectorElementType() == MVT::i16 ||
24734 VT.getVectorElementType() == MVT::i32 ||
24735 VT.getVectorElementType() == MVT::i64) &&
24736 "Unexpected element type");
24737 assert((InVT.getVectorElementType() == MVT::i8 ||
24738 InVT.getVectorElementType() == MVT::i16 ||
24739 InVT.getVectorElementType() == MVT::i32) &&
24740 "Unexpected element type");
24741
24742 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24743 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24744 return splitVectorIntUnary(Op, DAG);
24745 }
24746
24747 if (Subtarget.hasInt256())
24748 return Op;
24749
24750 // Optimize vectors in AVX mode:
24751 // sign extend v8i16 to v8i32 and
24752 // v4i32 to v4i64.
24753 //
24754 // Divide the input vector into two parts;
24755 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
24756 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
24757 // then concat the vectors back to the original VT.
24758 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24759 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24760
24761 unsigned NumElems = InVT.getVectorNumElements();
24762 SmallVector<int,8> ShufMask(NumElems, -1);
24763 for (unsigned i = 0; i != NumElems/2; ++i)
24764 ShufMask[i] = i + NumElems/2;
24765
24766 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24767 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24768
24769 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24770}
24771
24772/// Change a vector store into a pair of half-size vector stores.
24773static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24774 SDValue StoredVal = Store->getValue();
24775 assert((StoredVal.getValueType().is256BitVector() ||
24776 StoredVal.getValueType().is512BitVector()) &&
24777 "Expecting 256/512-bit op");
24778
24779 // Splitting volatile memory ops is not allowed unless the operation was not
24780 // legal to begin with. Assume the input store is legal (this transform is
24781 // only used for targets with AVX). Note: It is possible that we have an
24782 // illegal type like v2i128, and so we could allow splitting a volatile store
24783 // in that case if that is important.
24784 if (!Store->isSimple())
24785 return SDValue();
24786
24787 SDLoc DL(Store);
24788 SDValue Value0, Value1;
24789 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24790 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24791 SDValue Ptr0 = Store->getBasePtr();
24792 SDValue Ptr1 =
24793 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24794 SDValue Ch0 =
24795 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24796 Store->getOriginalAlign(),
24797 Store->getMemOperand()->getFlags());
24798 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24799 Store->getPointerInfo().getWithOffset(HalfOffset),
24800 Store->getOriginalAlign(),
24801 Store->getMemOperand()->getFlags());
24802 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24803}
24804
24805/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24806/// type.
24807static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24808 SelectionDAG &DAG) {
24809 SDValue StoredVal = Store->getValue();
24810 assert(StoreVT.is128BitVector() &&
24811 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24812 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24813
24814 // Splitting volatile memory ops is not allowed unless the operation was not
24815 // legal to begin with. We are assuming the input op is legal (this transform
24816 // is only used for targets with AVX).
24817 if (!Store->isSimple())
24818 return SDValue();
24819
24820 MVT StoreSVT = StoreVT.getScalarType();
24821 unsigned NumElems = StoreVT.getVectorNumElements();
24822 unsigned ScalarSize = StoreSVT.getStoreSize();
24823
24824 SDLoc DL(Store);
24825 SmallVector<SDValue, 4> Stores;
24826 for (unsigned i = 0; i != NumElems; ++i) {
24827 unsigned Offset = i * ScalarSize;
24828 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24829 TypeSize::Fixed(Offset), DL);
24830 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24831 DAG.getIntPtrConstant(i, DL));
24832 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24833 Store->getPointerInfo().getWithOffset(Offset),
24834 Store->getOriginalAlign(),
24835 Store->getMemOperand()->getFlags());
24836 Stores.push_back(Ch);
24837 }
24838 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24839}
24840
24841static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24842 SelectionDAG &DAG) {
24843 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24844 SDLoc dl(St);
24845 SDValue StoredVal = St->getValue();
24846
24847 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24848 if (StoredVal.getValueType().isVector() &&
24849 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24850 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24851 assert(NumElts <= 8 && "Unexpected VT");
24852 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24853 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24854 "Expected AVX512F without AVX512DQI");
24855
24856 // We must pad with zeros to ensure we store zeroes to any unused bits.
24857 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24858 DAG.getUNDEF(MVT::v16i1), StoredVal,
24859 DAG.getIntPtrConstant(0, dl));
24860 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24861 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24862 // Make sure we store zeros in the extra bits.
24863 if (NumElts < 8)
24864 StoredVal = DAG.getZeroExtendInReg(
24865 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24866
24867 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24868 St->getPointerInfo(), St->getOriginalAlign(),
24869 St->getMemOperand()->getFlags());
24870 }
24871
24872 if (St->isTruncatingStore())
24873 return SDValue();
24874
24875 // If this is a 256-bit store of concatenated ops, we are better off splitting
24876 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24877 // and each half can execute independently. Some cores would split the op into
24878 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24879 MVT StoreVT = StoredVal.getSimpleValueType();
24880 if (StoreVT.is256BitVector() ||
24881 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24882 !Subtarget.hasBWI())) {
24883 SmallVector<SDValue, 4> CatOps;
24884 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24885 return splitVectorStore(St, DAG);
24886 return SDValue();
24887 }
24888
24889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24890 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24891 "Unexpected VT");
24892 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24893 TargetLowering::TypeWidenVector && "Unexpected type action!");
24894
24895 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24896 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24897 DAG.getUNDEF(StoreVT));
24898
24899 if (Subtarget.hasSSE2()) {
24900 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24901 // and store it.
24902 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24903 MVT CastVT = MVT::getVectorVT(StVT, 2);
24904 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24905 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24906 DAG.getIntPtrConstant(0, dl));
24907
24908 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24909 St->getPointerInfo(), St->getOriginalAlign(),
24910 St->getMemOperand()->getFlags());
24911 }
24912 assert(Subtarget.hasSSE1() && "Expected SSE");
24913 SDVTList Tys = DAG.getVTList(MVT::Other);
24914 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24915 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24916 St->getMemOperand());
24917}
24918
24919// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24920// may emit an illegal shuffle but the expansion is still better than scalar
24921// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24922 // we'll emit a shuffle and an arithmetic shift.
24923// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24924// TODO: It is possible to support ZExt by zeroing the undef values during
24925// the shuffle phase or after the shuffle.
24926static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24927 SelectionDAG &DAG) {
24928 MVT RegVT = Op.getSimpleValueType();
24929 assert(RegVT.isVector() && "We only custom lower vector loads.");
24930 assert(RegVT.isInteger() &&
24931 "We only custom lower integer vector loads.");
24932
24933 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24934 SDLoc dl(Ld);
24935
24936 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24937 if (RegVT.getVectorElementType() == MVT::i1) {
24938 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24939 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24940 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24941 "Expected AVX512F without AVX512DQI");
24942
24943 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24944 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24945 Ld->getMemOperand()->getFlags());
24946
24947 // Replace chain users with the new chain.
24948 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24949
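 // Widen the loaded i8 to i16, reinterpret it as v16i1, and extract the
 // low elements to recover the original i1 vector value.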
24950 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24951 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24952 DAG.getBitcast(MVT::v16i1, Val),
24953 DAG.getIntPtrConstant(0, dl));
24954 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24955 }
24956
24957 return SDValue();
24958}
24959
24960/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24961/// each of which has no other use apart from the AND / OR.
24962static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24963 Opc = Op.getOpcode();
24964 if (Opc != ISD::OR && Opc != ISD::AND)
24965 return false;
24966 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24967 Op.getOperand(0).hasOneUse() &&
24968 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24969 Op.getOperand(1).hasOneUse());
24970}
24971
24972SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24973 SDValue Chain = Op.getOperand(0);
24974 SDValue Cond = Op.getOperand(1);
24975 SDValue Dest = Op.getOperand(2);
24976 SDLoc dl(Op);
24977
24978 if (Cond.getOpcode() == ISD::SETCC &&
24979 Cond.getOperand(0).getValueType() != MVT::f128) {
24980 SDValue LHS = Cond.getOperand(0);
24981 SDValue RHS = Cond.getOperand(1);
24982 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24983
24984 // Special case for
24985 // setcc([su]{add,sub,mul}o == 0)
24986 // setcc([su]{add,sub,mul}o != 1)
24987 if (ISD::isOverflowIntrOpRes(LHS) &&
24988 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24989 (isNullConstant(RHS) || isOneConstant(RHS))) {
24990 SDValue Value, Overflow;
24991 X86::CondCode X86Cond;
24992 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24993
24994 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24995 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24996
24997 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24998 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24999 Overflow);
25000 }
25001
25002 if (LHS.getSimpleValueType().isInteger()) {
25003 SDValue CCVal;
25004 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25005 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25006 EFLAGS);
25007 }
25008
25009 if (CC == ISD::SETOEQ) {
25010 // For FCMP_OEQ, we can emit
25011 // two branches instead of an explicit AND instruction with a
25012 // separate test. However, we only do this if this block doesn't
25013 // have a fall-through edge, because this requires an explicit
25014 // jmp when the condition is false.
25015 if (Op.getNode()->hasOneUse()) {
25016 SDNode *User = *Op.getNode()->use_begin();
25017 // Look for an unconditional branch following this conditional branch.
25018 // We need this because we need to reverse the successors in order
25019 // to implement FCMP_OEQ.
25020 if (User->getOpcode() == ISD::BR) {
25021 SDValue FalseBB = User->getOperand(1);
25022 SDNode *NewBR =
25023 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25024 assert(NewBR == User);
25025 (void)NewBR;
25026 Dest = FalseBB;
25027
25028 SDValue Cmp =
25029 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25030 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25031 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25032 CCVal, Cmp);
25033 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25034 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25035 Cmp);
25036 }
25037 }
25038 } else if (CC == ISD::SETUNE) {
25039 // For FCMP_UNE, we can emit
25040 // two branches instead of an explicit OR instruction with a
25041 // separate test.
25042 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25043 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25044 Chain =
25045 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
25046 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25047 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25048 Cmp);
25049 } else {
25050 X86::CondCode X86Cond =
25051 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25052 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25053 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25054 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25055 Cmp);
25056 }
25057 }
25058
25059 if (ISD::isOverflowIntrOpRes(Cond)) {
25060 SDValue Value, Overflow;
25061 X86::CondCode X86Cond;
25062 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25063
25064 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25065 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25066 Overflow);
25067 }
25068
25069 // Look past the truncate if the high bits are known zero.
25070 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25071 Cond = Cond.getOperand(0);
25072
25073 EVT CondVT = Cond.getValueType();
25074
25075 // Add an AND with 1 if we don't already have one.
25076 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25077 Cond =
25078 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25079
25080 SDValue LHS = Cond;
25081 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25082
25083 SDValue CCVal;
25084 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25085 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25086 EFLAGS);
25087}
25088
25089// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25090// Calls to _alloca are needed to probe the stack when allocating more than 4k
25091// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25092// that the guard pages used by the OS virtual memory manager are allocated in
25093// correct sequence.
25094SDValue
25095X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25096 SelectionDAG &DAG) const {
25097 MachineFunction &MF = DAG.getMachineFunction();
25098 bool SplitStack = MF.shouldSplitStack();
25099 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25100 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25101 SplitStack || EmitStackProbeCall;
25102 SDLoc dl(Op);
25103
25104 // Get the inputs.
25105 SDNode *Node = Op.getNode();
25106 SDValue Chain = Op.getOperand(0);
25107 SDValue Size = Op.getOperand(1);
25108 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25109 EVT VT = Node->getValueType(0);
25110
25111 // Chain the dynamic stack allocation so that it doesn't modify the stack
25112 // pointer when other instructions are using the stack.
25113 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25114
25115 bool Is64Bit = Subtarget.is64Bit();
25116 MVT SPTy = getPointerTy(DAG.getDataLayout());
25117
25118 SDValue Result;
25119 if (!Lower) {
25120 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25121 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25122 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25123 " not tell us which reg is the stack pointer!");
25124
25125 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25126 const Align StackAlign = TFI.getStackAlign();
25127 if (hasInlineStackProbe(MF)) {
25128 MachineRegisterInfo &MRI = MF.getRegInfo();
25129
25130 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25131 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25132 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25133 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
25134 DAG.getRegister(Vreg, SPTy));
25135 } else {
25136 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25137 Chain = SP.getValue(1);
25138 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25139 }
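 // Round the allocation down to the requested alignment by masking off
 // the low bits of the new stack pointer value.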
25140 if (Alignment && *Alignment > StackAlign)
25141 Result =
25142 DAG.getNode(ISD::AND, dl, VT, Result,
25143 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25144 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25145 } else if (SplitStack) {
25146 MachineRegisterInfo &MRI = MF.getRegInfo();
25147
25148 if (Is64Bit) {
25149 // The 64 bit implementation of segmented stacks needs to clobber both r10
25150 // and r11. This makes it impossible to use it along with nested parameters.
25151 const Function &F = MF.getFunction();
25152 for (const auto &A : F.args()) {
25153 if (A.hasNestAttr())
25154 report_fatal_error("Cannot use segmented stacks with functions that "
25155 "have nested arguments.");
25156 }
25157 }
25158
25159 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25160 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25161 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25162 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
25163 DAG.getRegister(Vreg, SPTy));
25164 } else {
25165 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25166 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
25167 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
25168
25169 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25170 Register SPReg = RegInfo->getStackRegister();
25171 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25172 Chain = SP.getValue(1);
25173
25174 if (Alignment) {
25175 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25176 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25177 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25178 }
25179
25180 Result = SP;
25181 }
25182
25183 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
25184 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
25185
25186 SDValue Ops[2] = {Result, Chain};
25187 return DAG.getMergeValues(Ops, dl);
25188}
25189
25190SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25191 MachineFunction &MF = DAG.getMachineFunction();
25192 auto PtrVT = getPointerTy(MF.getDataLayout());
25193 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25194
25195 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25196 SDLoc DL(Op);
25197
25198 if (!Subtarget.is64Bit() ||
25199 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25200 // vastart just stores the address of the VarArgsFrameIndex slot into the
25201 // memory location argument.
25202 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25203 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25204 MachinePointerInfo(SV));
25205 }
25206
25207 // __va_list_tag:
25208 // gp_offset (0 - 6 * 8)
25209 // fp_offset (48 - 48 + 8 * 16)
25210 // overflow_arg_area (point to parameters coming in memory).
25211 // reg_save_area
25212 SmallVector<SDValue, 8> MemOps;
25213 SDValue FIN = Op.getOperand(1);
25214 // Store gp_offset
25215 SDValue Store = DAG.getStore(
25216 Op.getOperand(0), DL,
25217 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25218 MachinePointerInfo(SV));
25219 MemOps.push_back(Store);
25220
25221 // Store fp_offset
25222 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
25223 Store = DAG.getStore(
25224 Op.getOperand(0), DL,
25225 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25226 MachinePointerInfo(SV, 4));
25227 MemOps.push_back(Store);
25228
25229 // Store ptr to overflow_arg_area
25230 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25231 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25232 Store =
25233 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25234 MemOps.push_back(Store);
25235
25236 // Store ptr to reg_save_area.
25237 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25238 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25239 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25240 Store = DAG.getStore(
25241 Op.getOperand(0), DL, RSFIN, FIN,
25242 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25243 MemOps.push_back(Store);
25244 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25245}
25246
25247SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25248 assert(Subtarget.is64Bit() &&
25249 "LowerVAARG only handles 64-bit va_arg!");
25250 assert(Op.getNumOperands() == 4);
25251
25252 MachineFunction &MF = DAG.getMachineFunction();
25253 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25254 // The Win64 ABI uses char* instead of a structure.
25255 return DAG.expandVAArg(Op.getNode());
25256
25257 SDValue Chain = Op.getOperand(0);
25258 SDValue SrcPtr = Op.getOperand(1);
25259 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25260 unsigned Align = Op.getConstantOperandVal(3);
25261 SDLoc dl(Op);
25262
25263 EVT ArgVT = Op.getNode()->getValueType(0);
25264 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25265 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25266 uint8_t ArgMode;
25267
25268 // Decide which area this value should be read from.
25269 // TODO: Implement the AMD64 ABI in its entirety. This simple
25270 // selection mechanism works only for the basic types.
25271 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25272 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25273 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25274 } else {
25275 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25276 "Unhandled argument type in LowerVAARG");
25277 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25278 }
25279
25280 if (ArgMode == 2) {
25281 // Sanity Check: Make sure using fp_offset makes sense.
25282 assert(!Subtarget.useSoftFloat() &&
25283 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25284 Subtarget.hasSSE1());
25285 }
25286
25287 // Insert VAARG node into the DAG
25288 // VAARG returns two values: Variable Argument Address, Chain
25289 SDValue InstOps[] = {Chain, SrcPtr,
25290 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25291 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25292 DAG.getTargetConstant(Align, dl, MVT::i32)};
25293 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25294 SDValue VAARG = DAG.getMemIntrinsicNode(
25295 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25296 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25297 /*Alignment=*/None,
25298 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25299 Chain = VAARG.getValue(1);
25300
25301 // Load the next argument and return it
25302 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25303}
25304
25305static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25306 SelectionDAG &DAG) {
25307 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25308 // where a va_list is still an i8*.
25309 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25310 if (Subtarget.isCallingConvWin64(
25311 DAG.getMachineFunction().getFunction().getCallingConv()))
25312 // Probably a Win64 va_copy.
25313 return DAG.expandVACopy(Op.getNode());
25314
25315 SDValue Chain = Op.getOperand(0);
25316 SDValue DstPtr = Op.getOperand(1);
25317 SDValue SrcPtr = Op.getOperand(2);
25318 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25319 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25320 SDLoc DL(Op);
25321
25322 return DAG.getMemcpy(
25323 Chain, DL, DstPtr, SrcPtr,
25324 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25325 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25326 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25327}
25328
25329// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25330static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25331 switch (Opc) {
25332 case ISD::SHL:
25333 case X86ISD::VSHL:
25334 case X86ISD::VSHLI:
25335 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25336 case ISD::SRL:
25337 case X86ISD::VSRL:
25338 case X86ISD::VSRLI:
25339 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25340 case ISD::SRA:
25341 case X86ISD::VSRA:
25342 case X86ISD::VSRAI:
25343 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25344 }
25345 llvm_unreachable("Unknown target vector shift node");
25346}
25347
25348/// Handle vector element shifts where the shift amount is a constant.
25349/// Takes immediate version of shift as input.
25350static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25351 SDValue SrcOp, uint64_t ShiftAmt,
25352 SelectionDAG &DAG) {
25353 MVT ElementType = VT.getVectorElementType();
25354
25355 // Bitcast the source vector to the output type, this is mainly necessary for
25356 // vXi8/vXi64 shifts.
25357 if (VT != SrcOp.getSimpleValueType())
25358 SrcOp = DAG.getBitcast(VT, SrcOp);
25359
25360 // Fold this packed shift into its first operand if ShiftAmt is 0.
25361 if (ShiftAmt == 0)
25362 return SrcOp;
25363
25364 // Check for ShiftAmt >= element width
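 // Logical shifts of at least the element width produce zero, while
 // arithmetic shifts saturate at width - 1 so every bit becomes the sign bit.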
25365 if (ShiftAmt >= ElementType.getSizeInBits()) {
25366 if (Opc == X86ISD::VSRAI)
25367 ShiftAmt = ElementType.getSizeInBits() - 1;
25368 else
25369 return DAG.getConstant(0, dl, VT);
25370 }
25371
25372 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25373 && "Unknown target vector shift-by-constant node");
25374
25375 // Fold this packed vector shift into a build vector if SrcOp is a
25376 // vector of Constants or UNDEFs.
25377 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25378 SmallVector<SDValue, 8> Elts;
25379 unsigned NumElts = SrcOp->getNumOperands();
25380
25381 switch (Opc) {
25382 default: llvm_unreachable("Unknown opcode!");
25383 case X86ISD::VSHLI:
25384 for (unsigned i = 0; i != NumElts; ++i) {
25385 SDValue CurrentOp = SrcOp->getOperand(i);
25386 if (CurrentOp->isUndef()) {
25387 // Must produce 0s in the correct bits.
25388 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25389 continue;
25390 }
25391 auto *ND = cast<ConstantSDNode>(CurrentOp);
25392 const APInt &C = ND->getAPIntValue();
25393 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
25394 }
25395 break;
25396 case X86ISD::VSRLI:
25397 for (unsigned i = 0; i != NumElts; ++i) {
25398 SDValue CurrentOp = SrcOp->getOperand(i);
25399 if (CurrentOp->isUndef()) {
25400 // Must produce 0s in the correct bits.
25401 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25402 continue;
25403 }
25404 auto *ND = cast<ConstantSDNode>(CurrentOp);
25405 const APInt &C = ND->getAPIntValue();
25406 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
25407 }
25408 break;
25409 case X86ISD::VSRAI:
25410 for (unsigned i = 0; i != NumElts; ++i) {
25411 SDValue CurrentOp = SrcOp->getOperand(i);
25412 if (CurrentOp->isUndef()) {
25413 // All shifted in bits must be the same so use 0.
25414 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25415 continue;
25416 }
25417 auto *ND = cast<ConstantSDNode>(CurrentOp);
25418 const APInt &C = ND->getAPIntValue();
25419 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
25420 }
25421 break;
25422 }
25423
25424 return DAG.getBuildVector(VT, dl, Elts);
25425 }
25426
25427 return DAG.getNode(Opc, dl, VT, SrcOp,
25428 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25429}
25430
25431/// Handle vector element shifts where the shift amount may or may not be a
25432/// constant. Takes immediate version of shift as input.
25433static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25434 SDValue SrcOp, SDValue ShAmt,
25435 const X86Subtarget &Subtarget,
25436 SelectionDAG &DAG) {
25437 MVT SVT = ShAmt.getSimpleValueType();
25438 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
25439
25440 // Catch shift-by-constant.
25441 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25442 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
25443 CShAmt->getZExtValue(), DAG);
25444
25445 // Change opcode to non-immediate version.
25446 Opc = getTargetVShiftUniformOpcode(Opc, true);
25447
25448 // Need to build a vector containing shift amount.
25449 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
25450 // +====================+============+=======================================+
25451 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
25452 // +====================+============+=======================================+
25453 // | i64 | Yes, No | Use ShAmt as lowest elt |
25454 // | i32 | Yes | zero-extend in-reg |
25455 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
25456 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
25457 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
25458 // +====================+============+=======================================+
25459
25460 if (SVT == MVT::i64)
25461 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25462 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25463 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25464 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25465 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
25466 ShAmt = ShAmt.getOperand(0);
25467 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25468 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25469 if (Subtarget.hasSSE41())
25470 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25471 MVT::v2i64, ShAmt);
25472 else {
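 // Without SSE4.1 zero-extension, shift the amount element up to the top
 // of the 128-bit register and back down so the upper bytes are cleared.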
25473 SDValue ByteShift = DAG.getTargetConstant(
25474 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25475 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25476 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25477 ByteShift);
25478 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25479 ByteShift);
25480 }
25481 } else if (Subtarget.hasSSE41() &&
25482 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25483 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25484 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25485 MVT::v2i64, ShAmt);
25486 } else {
25487 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25488 DAG.getUNDEF(SVT)};
25489 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25490 }
25491
25492 // The return type has to be a 128-bit type with the same element
25493 // type as the input type.
25494 MVT EltVT = VT.getVectorElementType();
25495 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25496
25497 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25498 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25499}
25500
25501/// Return Mask with the necessary casting or extending
25502/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25503static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25504 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25505 const SDLoc &dl) {
25506
25507 if (isAllOnesConstant(Mask))
25508 return DAG.getConstant(1, dl, MaskVT);
25509 if (X86::isZeroNode(Mask))
25510 return DAG.getConstant(0, dl, MaskVT);
25511
25512 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25513
25514 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25515 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25516 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25517 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25518 SDValue Lo, Hi;
25519 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25520 DAG.getConstant(0, dl, MVT::i32));
25521 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25522 DAG.getConstant(1, dl, MVT::i32));
25523
25524 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25525 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25526
25527 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25528 } else {
25529 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25530 Mask.getSimpleValueType().getSizeInBits());
25531 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25532 // are extracted by EXTRACT_SUBVECTOR.
25533 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25534 DAG.getBitcast(BitcastVT, Mask),
25535 DAG.getIntPtrConstant(0, dl));
25536 }
25537}
25538
25539/// Return (and \p Op, \p Mask) for compare instructions or
25540/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25541/// necessary casting or extending for \p Mask when lowering masking intrinsics
25542static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25543 SDValue PreservedSrc,
25544 const X86Subtarget &Subtarget,
25545 SelectionDAG &DAG) {
25546 MVT VT = Op.getSimpleValueType();
25547 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25548 unsigned OpcodeSelect = ISD::VSELECT;
25549 SDLoc dl(Op);
25550
25551 if (isAllOnesConstant(Mask))
25552 return Op;
25553
25554 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25555
25556 if (PreservedSrc.isUndef())
25557 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25558 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25559}
25560
25561/// Creates an SDNode for a predicated scalar operation.
25562/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25563 /// The mask comes in as MVT::i8 and should be transformed
25564 /// to MVT::v1i1 while lowering masking intrinsics.
25565/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25566/// "X86select" instead of "vselect". We just can't create the "vselect" node
25567/// for a scalar instruction.
25568static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25569 SDValue PreservedSrc,
25570 const X86Subtarget &Subtarget,
25571 SelectionDAG &DAG) {
25572
25573 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25574 if (MaskConst->getZExtValue() & 0x1)
25575 return Op;
25576
25577 MVT VT = Op.getSimpleValueType();
25578 SDLoc dl(Op);
25579
25580 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
25581 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25582 DAG.getBitcast(MVT::v8i1, Mask),
25583 DAG.getIntPtrConstant(0, dl));
25584 if (Op.getOpcode() == X86ISD::FSETCCM ||
25585 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25586 Op.getOpcode() == X86ISD::VFPCLASSS)
25587 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25588
25589 if (PreservedSrc.isUndef())
25590 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25591 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25592}
25593
25594static int getSEHRegistrationNodeSize(const Function *Fn) {
25595 if (!Fn->hasPersonalityFn())
25596 report_fatal_error(
25597 "querying registration node size for function without personality");
25598 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25599 // WinEHStatePass for the full struct definition.
25600 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25601 case EHPersonality::MSVC_X86SEH: return 24;
25602 case EHPersonality::MSVC_CXX: return 16;
25603 default: break;
25604 }
25605 report_fatal_error(
25606 "can only recover FP for 32-bit MSVC EH personality functions");
25607}
25608
25609/// When the MSVC runtime transfers control to us, either to an outlined
25610/// function or when returning to a parent frame after catching an exception, we
25611/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25612/// Here's the math:
25613/// RegNodeBase = EntryEBP - RegNodeSize
25614/// ParentFP = RegNodeBase - ParentFrameOffset
25615/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25616/// subtracting the offset (negative on x86) takes us back to the parent FP.
25617static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25618 SDValue EntryEBP) {
25619 MachineFunction &MF = DAG.getMachineFunction();
25620 SDLoc dl;
25621
25622 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25623 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25624
25625 // It's possible that the parent function no longer has a personality function
25626 // if the exceptional code was optimized away, in which case we just return
25627 // the incoming EBP.
25628 if (!Fn->hasPersonalityFn())
25629 return EntryEBP;
25630
25631 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25632 // registration, or the .set_setframe offset.
25633 MCSymbol *OffsetSym =
25634 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25635 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25636 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25637 SDValue ParentFrameOffset =
25638 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25639
25640 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25641 // prologue to RBP in the parent function.
25642 const X86Subtarget &Subtarget =
25643 static_cast<const X86Subtarget &>(DAG.getSubtarget());
25644 if (Subtarget.is64Bit())
25645 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25646
25647 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25648 // RegNodeBase = EntryEBP - RegNodeSize
25649 // ParentFP = RegNodeBase - ParentFrameOffset
25650 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25651 DAG.getConstant(RegNodeSize, dl, PtrVT));
25652 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25653}
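
// Illustrative sketch (not part of the original file): a worked instance of
// the 32-bit recovery math documented above, using a hypothetical MSVC SEH
// frame (RegNodeSize = 24) and a hypothetical ParentFrameOffset of -16.
static inline unsigned recoverParentFPExample() {
  unsigned EntryEBP = 0x0019FF40u;      // incoming EBP (hypothetical value)
  unsigned RegNodeSize = 24;            // 6 x 32-bit words for x86 SEH
  int ParentFrameOffset = -16;          // negative on x86 (hypothetical value)
  unsigned RegNodeBase = EntryEBP - RegNodeSize;  // 0x0019FF28
  return RegNodeBase - ParentFrameOffset;         // 0x0019FF38 == the parent FP
}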
25654
25655SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25656 SelectionDAG &DAG) const {
25657 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25658 auto isRoundModeCurDirection = [](SDValue Rnd) {
25659 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25660 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25661
25662 return false;
25663 };
25664 auto isRoundModeSAE = [](SDValue Rnd) {
25665 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25666 unsigned RC = C->getZExtValue();
25667 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25668 // Clear the NO_EXC bit and check remaining bits.
25669 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25670 // As a convenience we allow either no other bits to be set or the
25671 // explicit current-direction value.
25672 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25673 }
25674 }
25675
25676 return false;
25677 };
25678 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25679 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25680 RC = C->getZExtValue();
25681 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25682 // Clear the NO_EXC bit and check remaining bits.
25683 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25684 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25685 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25686 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25687 RC == X86::STATIC_ROUNDING::TO_ZERO;
25688 }
25689 }
25690
25691 return false;
25692 };
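  // Illustration (not part of the original file): assuming the
  // X86::STATIC_ROUNDING encoding TO_NEAREST_INT=0, TO_NEG_INF=1, TO_POS_INF=2,
  // TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8, a rounding operand of 11
  // (NO_EXC|TO_ZERO) satisfies isRoundModeSAEToX with RC == TO_ZERO, 12
  // (NO_EXC|CUR_DIRECTION) satisfies isRoundModeSAE but not isRoundModeSAEToX,
  // and a bare 4 (CUR_DIRECTION) only satisfies isRoundModeCurDirection.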
25693
25694 SDLoc dl(Op);
25695 unsigned IntNo = Op.getConstantOperandVal(0);
25696 MVT VT = Op.getSimpleValueType();
25697 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25698
25699 // Propagate flags from original node to transformed node(s).
25700 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25701
25702 if (IntrData) {
25703 switch(IntrData->Type) {
25704 case INTR_TYPE_1OP: {
25705 // We specify 2 possible opcodes for intrinsics with rounding modes.
25706 // First, we check if the intrinsic may have non-default rounding mode,
25707 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25708 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25709 if (IntrWithRoundingModeOpcode != 0) {
25710 SDValue Rnd = Op.getOperand(2);
25711 unsigned RC = 0;
25712 if (isRoundModeSAEToX(Rnd, RC))
25713 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25714 Op.getOperand(1),
25715 DAG.getTargetConstant(RC, dl, MVT::i32));
25716 if (!isRoundModeCurDirection(Rnd))
25717 return SDValue();
25718 }
25719 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25720 Op.getOperand(1));
25721 }
25722 case INTR_TYPE_1OP_SAE: {
25723 SDValue Sae = Op.getOperand(2);
25724
25725 unsigned Opc;
25726 if (isRoundModeCurDirection(Sae))
25727 Opc = IntrData->Opc0;
25728 else if (isRoundModeSAE(Sae))
25729 Opc = IntrData->Opc1;
25730 else
25731 return SDValue();
25732
25733 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25734 }
25735 case INTR_TYPE_2OP: {
25736 SDValue Src2 = Op.getOperand(2);
25737
25738 // We specify 2 possible opcodes for intrinsics with rounding modes.
25739 // First, we check if the intrinsic may have non-default rounding mode,
25740 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25741 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25742 if (IntrWithRoundingModeOpcode != 0) {
25743 SDValue Rnd = Op.getOperand(3);
25744 unsigned RC = 0;
25745 if (isRoundModeSAEToX(Rnd, RC))
25746 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25747 Op.getOperand(1), Src2,
25748 DAG.getTargetConstant(RC, dl, MVT::i32));
25749 if (!isRoundModeCurDirection(Rnd))
25750 return SDValue();
25751 }
25752
25753 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25754 Op.getOperand(1), Src2);
25755 }
25756 case INTR_TYPE_2OP_SAE: {
25757 SDValue Sae = Op.getOperand(3);
25758
25759 unsigned Opc;
25760 if (isRoundModeCurDirection(Sae))
25761 Opc = IntrData->Opc0;
25762 else if (isRoundModeSAE(Sae))
25763 Opc = IntrData->Opc1;
25764 else
25765 return SDValue();
25766
25767 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25768 Op.getOperand(2));
25769 }
25770 case INTR_TYPE_3OP:
25771 case INTR_TYPE_3OP_IMM8: {
25772 SDValue Src1 = Op.getOperand(1);
25773 SDValue Src2 = Op.getOperand(2);
25774 SDValue Src3 = Op.getOperand(3);
25775
25776 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25777 Src3.getValueType() != MVT::i8) {
25778 Src3 = DAG.getTargetConstant(
25779 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25780 }
25781
25782 // We specify 2 possible opcodes for intrinsics with rounding modes.
25783 // First, we check if the intrinsic may have non-default rounding mode,
25784 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25785 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25786 if (IntrWithRoundingModeOpcode != 0) {
25787 SDValue Rnd = Op.getOperand(4);
25788 unsigned RC = 0;
25789 if (isRoundModeSAEToX(Rnd, RC))
25790 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25791 Src1, Src2, Src3,
25792 DAG.getTargetConstant(RC, dl, MVT::i32));
25793 if (!isRoundModeCurDirection(Rnd))
25794 return SDValue();
25795 }
25796
25797 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25798 {Src1, Src2, Src3});
25799 }
25800 case INTR_TYPE_4OP_IMM8: {
25801      assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25802 SDValue Src4 = Op.getOperand(4);
25803 if (Src4.getValueType() != MVT::i8) {
25804 Src4 = DAG.getTargetConstant(
25805 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25806 }
25807
25808 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25809 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25810 Src4);
25811 }
25812 case INTR_TYPE_1OP_MASK: {
25813 SDValue Src = Op.getOperand(1);
25814 SDValue PassThru = Op.getOperand(2);
25815 SDValue Mask = Op.getOperand(3);
25816 // We add rounding mode to the Node when
25817 // - RC Opcode is specified and
25818 // - RC is not "current direction".
25819 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25820 if (IntrWithRoundingModeOpcode != 0) {
25821 SDValue Rnd = Op.getOperand(4);
25822 unsigned RC = 0;
25823 if (isRoundModeSAEToX(Rnd, RC))
25824 return getVectorMaskingNode(
25825 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25826 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25827 Mask, PassThru, Subtarget, DAG);
25828 if (!isRoundModeCurDirection(Rnd))
25829 return SDValue();
25830 }
25831 return getVectorMaskingNode(
25832 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25833 Subtarget, DAG);
25834 }
25835 case INTR_TYPE_1OP_MASK_SAE: {
25836 SDValue Src = Op.getOperand(1);
25837 SDValue PassThru = Op.getOperand(2);
25838 SDValue Mask = Op.getOperand(3);
25839 SDValue Rnd = Op.getOperand(4);
25840
25841 unsigned Opc;
25842 if (isRoundModeCurDirection(Rnd))
25843 Opc = IntrData->Opc0;
25844 else if (isRoundModeSAE(Rnd))
25845 Opc = IntrData->Opc1;
25846 else
25847 return SDValue();
25848
25849 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25850 Subtarget, DAG);
25851 }
25852 case INTR_TYPE_SCALAR_MASK: {
25853 SDValue Src1 = Op.getOperand(1);
25854 SDValue Src2 = Op.getOperand(2);
25855 SDValue passThru = Op.getOperand(3);
25856 SDValue Mask = Op.getOperand(4);
25857 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25858 // There are 2 kinds of intrinsics in this group:
25859 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25860 // (2) With rounding mode and sae - 7 operands.
25861 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25862 if (Op.getNumOperands() == (5U + HasRounding)) {
25863 if (HasRounding) {
25864 SDValue Rnd = Op.getOperand(5);
25865 unsigned RC = 0;
25866 if (isRoundModeSAEToX(Rnd, RC))
25867 return getScalarMaskingNode(
25868 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25869 DAG.getTargetConstant(RC, dl, MVT::i32)),
25870 Mask, passThru, Subtarget, DAG);
25871 if (!isRoundModeCurDirection(Rnd))
25872 return SDValue();
25873 }
25874 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25875 Src2),
25876 Mask, passThru, Subtarget, DAG);
25877 }
25878
25879     assert(Op.getNumOperands() == (6U + HasRounding) &&
25880            "Unexpected intrinsic form");
25881 SDValue RoundingMode = Op.getOperand(5);
25882 unsigned Opc = IntrData->Opc0;
25883 if (HasRounding) {
25884 SDValue Sae = Op.getOperand(6);
25885 if (isRoundModeSAE(Sae))
25886 Opc = IntrWithRoundingModeOpcode;
25887 else if (!isRoundModeCurDirection(Sae))
25888 return SDValue();
25889 }
25890 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25891 Src2, RoundingMode),
25892 Mask, passThru, Subtarget, DAG);
25893 }
25894 case INTR_TYPE_SCALAR_MASK_RND: {
25895 SDValue Src1 = Op.getOperand(1);
25896 SDValue Src2 = Op.getOperand(2);
25897 SDValue passThru = Op.getOperand(3);
25898 SDValue Mask = Op.getOperand(4);
25899 SDValue Rnd = Op.getOperand(5);
25900
25901 SDValue NewOp;
25902 unsigned RC = 0;
25903 if (isRoundModeCurDirection(Rnd))
25904 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25905 else if (isRoundModeSAEToX(Rnd, RC))
25906 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25907 DAG.getTargetConstant(RC, dl, MVT::i32));
25908 else
25909 return SDValue();
25910
25911 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25912 }
25913 case INTR_TYPE_SCALAR_MASK_SAE: {
25914 SDValue Src1 = Op.getOperand(1);
25915 SDValue Src2 = Op.getOperand(2);
25916 SDValue passThru = Op.getOperand(3);
25917 SDValue Mask = Op.getOperand(4);
25918 SDValue Sae = Op.getOperand(5);
25919 unsigned Opc;
25920 if (isRoundModeCurDirection(Sae))
25921 Opc = IntrData->Opc0;
25922 else if (isRoundModeSAE(Sae))
25923 Opc = IntrData->Opc1;
25924 else
25925 return SDValue();
25926
25927 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25928 Mask, passThru, Subtarget, DAG);
25929 }
25930 case INTR_TYPE_2OP_MASK: {
25931 SDValue Src1 = Op.getOperand(1);
25932 SDValue Src2 = Op.getOperand(2);
25933 SDValue PassThru = Op.getOperand(3);
25934 SDValue Mask = Op.getOperand(4);
25935 SDValue NewOp;
25936 if (IntrData->Opc1 != 0) {
25937 SDValue Rnd = Op.getOperand(5);
25938 unsigned RC = 0;
25939 if (isRoundModeSAEToX(Rnd, RC))
25940 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25941 DAG.getTargetConstant(RC, dl, MVT::i32));
25942 else if (!isRoundModeCurDirection(Rnd))
25943 return SDValue();
25944 }
25945 if (!NewOp)
25946 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25947 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25948 }
25949 case INTR_TYPE_2OP_MASK_SAE: {
25950 SDValue Src1 = Op.getOperand(1);
25951 SDValue Src2 = Op.getOperand(2);
25952 SDValue PassThru = Op.getOperand(3);
25953 SDValue Mask = Op.getOperand(4);
25954
25955 unsigned Opc = IntrData->Opc0;
25956 if (IntrData->Opc1 != 0) {
25957 SDValue Sae = Op.getOperand(5);
25958 if (isRoundModeSAE(Sae))
25959 Opc = IntrData->Opc1;
25960 else if (!isRoundModeCurDirection(Sae))
25961 return SDValue();
25962 }
25963
25964 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25965 Mask, PassThru, Subtarget, DAG);
25966 }
25967 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25968 SDValue Src1 = Op.getOperand(1);
25969 SDValue Src2 = Op.getOperand(2);
25970 SDValue Src3 = Op.getOperand(3);
25971 SDValue PassThru = Op.getOperand(4);
25972 SDValue Mask = Op.getOperand(5);
25973 SDValue Sae = Op.getOperand(6);
25974 unsigned Opc;
25975 if (isRoundModeCurDirection(Sae))
25976 Opc = IntrData->Opc0;
25977 else if (isRoundModeSAE(Sae))
25978 Opc = IntrData->Opc1;
25979 else
25980 return SDValue();
25981
25982 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25983 Mask, PassThru, Subtarget, DAG);
25984 }
25985 case INTR_TYPE_3OP_MASK_SAE: {
25986 SDValue Src1 = Op.getOperand(1);
25987 SDValue Src2 = Op.getOperand(2);
25988 SDValue Src3 = Op.getOperand(3);
25989 SDValue PassThru = Op.getOperand(4);
25990 SDValue Mask = Op.getOperand(5);
25991
25992 unsigned Opc = IntrData->Opc0;
25993 if (IntrData->Opc1 != 0) {
25994 SDValue Sae = Op.getOperand(6);
25995 if (isRoundModeSAE(Sae))
25996 Opc = IntrData->Opc1;
25997 else if (!isRoundModeCurDirection(Sae))
25998 return SDValue();
25999 }
26000 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26001 Mask, PassThru, Subtarget, DAG);
26002 }
26003 case BLENDV: {
26004 SDValue Src1 = Op.getOperand(1);
26005 SDValue Src2 = Op.getOperand(2);
26006 SDValue Src3 = Op.getOperand(3);
26007
26008 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26009 Src3 = DAG.getBitcast(MaskVT, Src3);
26010
26011 // Reverse the operands to match VSELECT order.
26012 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26013 }
26014 case VPERM_2OP : {
26015 SDValue Src1 = Op.getOperand(1);
26016 SDValue Src2 = Op.getOperand(2);
26017
26018     // Swap Src1 and Src2 in the node creation.
26019     return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
26020 }
26021 case FMA_OP_MASKZ:
26022 case FMA_OP_MASK: {
26023 SDValue Src1 = Op.getOperand(1);
26024 SDValue Src2 = Op.getOperand(2);
26025 SDValue Src3 = Op.getOperand(3);
26026 SDValue Mask = Op.getOperand(4);
26027 MVT VT = Op.getSimpleValueType();
26028
26029 SDValue PassThru = Src1;
26030 if (IntrData->Type == FMA_OP_MASKZ)
26031 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26032
26033 // We add rounding mode to the Node when
26034 // - RC Opcode is specified and
26035 // - RC is not "current direction".
26036 SDValue NewOp;
26037 if (IntrData->Opc1 != 0) {
26038 SDValue Rnd = Op.getOperand(5);
26039 unsigned RC = 0;
26040 if (isRoundModeSAEToX(Rnd, RC))
26041 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26042 DAG.getTargetConstant(RC, dl, MVT::i32));
26043 else if (!isRoundModeCurDirection(Rnd))
26044 return SDValue();
26045 }
26046 if (!NewOp)
26047 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26048 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26049 }
26050 case IFMA_OP:
26051 // NOTE: We need to swizzle the operands to pass the multiply operands
26052 // first.
26053 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26054 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26055 case FPCLASSS: {
26056 SDValue Src1 = Op.getOperand(1);
26057 SDValue Imm = Op.getOperand(2);
26058 SDValue Mask = Op.getOperand(3);
26059 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26060 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26061 Subtarget, DAG);
26062 // Need to fill with zeros to ensure the bitcast will produce zeroes
26063 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26064 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26065 DAG.getConstant(0, dl, MVT::v8i1),
26066 FPclassMask, DAG.getIntPtrConstant(0, dl));
26067 return DAG.getBitcast(MVT::i8, Ins);
26068 }
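  // Illustration (not part of the original file): because FPclassMask is
  // inserted into an all-zero v8i1 above, a single result bit b yields the
  // vector <b,0,0,0,0,0,0,0>, so the i8 bitcast is exactly b (0 or 1); a plain
  // element extract would leave bits 1-7 of the i8 unspecified.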
26069
26070 case CMP_MASK_CC: {
26071 MVT MaskVT = Op.getSimpleValueType();
26072 SDValue CC = Op.getOperand(3);
26073 SDValue Mask = Op.getOperand(4);
26074 // We specify 2 possible opcodes for intrinsics with rounding modes.
26075 // First, we check if the intrinsic may have non-default rounding mode,
26076 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26077 if (IntrData->Opc1 != 0) {
26078 SDValue Sae = Op.getOperand(5);
26079 if (isRoundModeSAE(Sae))
26080 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26081 Op.getOperand(2), CC, Mask, Sae);
26082 if (!isRoundModeCurDirection(Sae))
26083 return SDValue();
26084 }
26085 // Default rounding mode.
26086 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26087 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26088 }
26089 case CMP_MASK_SCALAR_CC: {
26090 SDValue Src1 = Op.getOperand(1);
26091 SDValue Src2 = Op.getOperand(2);
26092 SDValue CC = Op.getOperand(3);
26093 SDValue Mask = Op.getOperand(4);
26094
26095 SDValue Cmp;
26096 if (IntrData->Opc1 != 0) {
26097 SDValue Sae = Op.getOperand(5);
26098 if (isRoundModeSAE(Sae))
26099 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26100 else if (!isRoundModeCurDirection(Sae))
26101 return SDValue();
26102 }
26103 // Default rounding mode.
26104 if (!Cmp.getNode())
26105 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26106
26107 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26108 Subtarget, DAG);
26109 // Need to fill with zeros to ensure the bitcast will produce zeroes
26110 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26111 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26112 DAG.getConstant(0, dl, MVT::v8i1),
26113 CmpMask, DAG.getIntPtrConstant(0, dl));
26114 return DAG.getBitcast(MVT::i8, Ins);
26115 }
26116 case COMI: { // Comparison intrinsics
26117 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26118 SDValue LHS = Op.getOperand(1);
26119 SDValue RHS = Op.getOperand(2);
26120 // Some conditions require the operands to be swapped.
26121 if (CC == ISD::SETLT || CC == ISD::SETLE)
26122 std::swap(LHS, RHS);
26123
26124 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
26125 SDValue SetCC;
26126 switch (CC) {
26127 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
26128 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26129 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26130 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26131 break;
26132 }
26133 case ISD::SETNE: { // (ZF = 1 or PF = 1)
26134 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26135 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26136 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26137 break;
26138 }
26139 case ISD::SETGT: // (CF = 0 and ZF = 0)
26140 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26141 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26142 break;
26143 }
26144 case ISD::SETGE: // CF = 0
26145 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26146 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26147 break;
26148 default:
26149       llvm_unreachable("Unexpected illegal condition!");
26150 }
26151 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26152 }
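  // Illustration (not part of the original file): (U)COMIS* sets ZF,PF,CF to
  // 1,1,1 for unordered, 0,0,0 for greater, 0,0,1 for less and 1,0,0 for
  // equal operands, so the extra PF checks above make SETEQ false and SETNE
  // true whenever either operand is a NaN, matching IEEE comparison semantics.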
26153 case COMI_RM: { // Comparison intrinsics with Sae
26154 SDValue LHS = Op.getOperand(1);
26155 SDValue RHS = Op.getOperand(2);
26156 unsigned CondVal = Op.getConstantOperandVal(3);
26157 SDValue Sae = Op.getOperand(4);
26158
26159 SDValue FCmp;
26160 if (isRoundModeCurDirection(Sae))
26161 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26162 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26163 else if (isRoundModeSAE(Sae))
26164 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26165 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26166 else
26167 return SDValue();
26168 // Need to fill with zeros to ensure the bitcast will produce zeroes
26169 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26170 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26171 DAG.getConstant(0, dl, MVT::v16i1),
26172 FCmp, DAG.getIntPtrConstant(0, dl));
26173 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26174 DAG.getBitcast(MVT::i16, Ins));
26175 }
26176 case VSHIFT:
26177 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26178 Op.getOperand(1), Op.getOperand(2), Subtarget,
26179 DAG);
26180 case COMPRESS_EXPAND_IN_REG: {
26181 SDValue Mask = Op.getOperand(3);
26182 SDValue DataToCompress = Op.getOperand(1);
26183 SDValue PassThru = Op.getOperand(2);
26184 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26185 return Op.getOperand(1);
26186
26187 // Avoid false dependency.
26188 if (PassThru.isUndef())
26189 PassThru = DAG.getConstant(0, dl, VT);
26190
26191 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26192 Mask);
26193 }
26194 case FIXUPIMM:
26195 case FIXUPIMM_MASKZ: {
26196 SDValue Src1 = Op.getOperand(1);
26197 SDValue Src2 = Op.getOperand(2);
26198 SDValue Src3 = Op.getOperand(3);
26199 SDValue Imm = Op.getOperand(4);
26200 SDValue Mask = Op.getOperand(5);
26201 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26202 ? Src1
26203 : getZeroVector(VT, Subtarget, DAG, dl);
26204
26205 unsigned Opc = IntrData->Opc0;
26206 if (IntrData->Opc1 != 0) {
26207 SDValue Sae = Op.getOperand(6);
26208 if (isRoundModeSAE(Sae))
26209 Opc = IntrData->Opc1;
26210 else if (!isRoundModeCurDirection(Sae))
26211 return SDValue();
26212 }
26213
26214 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26215
26216 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26217 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26218
26219 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26220 }
26221 case ROUNDP: {
26222     assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26223 // Clear the upper bits of the rounding immediate so that the legacy
26224 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26225 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
26226 SDValue RoundingMode =
26227 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26228 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26229 Op.getOperand(1), RoundingMode);
26230 }
26231 case ROUNDS: {
26232     assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26233 // Clear the upper bits of the rounding immediate so that the legacy
26234 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26235 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
26236 SDValue RoundingMode =
26237 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26238 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26239 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26240 }
26241 case BEXTRI: {
26242     assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26243
26244 uint64_t Imm = Op.getConstantOperandVal(2);
26245 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26246 Op.getValueType());
26247 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26248 Op.getOperand(1), Control);
26249 }
26250 // ADC/ADCX/SBB
26251 case ADX: {
26252 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26253 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26254
26255 SDValue Res;
26256 // If the carry in is zero, then we should just use ADD/SUB instead of
26257 // ADC/SBB.
26258 if (isNullConstant(Op.getOperand(1))) {
26259 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26260 Op.getOperand(3));
26261 } else {
26262 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26263 DAG.getConstant(-1, dl, MVT::i8));
26264 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26265 Op.getOperand(3), GenCF.getValue(1));
26266 }
26267 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26268 SDValue Results[] = { SetCC, Res };
26269 return DAG.getMergeValues(Results, dl);
26270 }
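  // Illustration (not part of the original file): the "add -1" above
  // regenerates CF from a boolean carry-in; in 8-bit arithmetic a carry-in of
  // 1 gives 1 + 0xFF = 0x100, overflowing and setting CF, while a carry-in of
  // 0 gives 0xFF with CF clear, and the ADC/SBB then consumes that flag.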
26271 case CVTPD2PS_MASK:
26272 case CVTPD2DQ_MASK:
26273 case CVTQQ2PS_MASK:
26274 case TRUNCATE_TO_REG: {
26275 SDValue Src = Op.getOperand(1);
26276 SDValue PassThru = Op.getOperand(2);
26277 SDValue Mask = Op.getOperand(3);
26278
26279 if (isAllOnesConstant(Mask))
26280 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26281
26282 MVT SrcVT = Src.getSimpleValueType();
26283 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26284 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26285 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26286 {Src, PassThru, Mask});
26287 }
26288 case CVTPS2PH_MASK: {
26289 SDValue Src = Op.getOperand(1);
26290 SDValue Rnd = Op.getOperand(2);
26291 SDValue PassThru = Op.getOperand(3);
26292 SDValue Mask = Op.getOperand(4);
26293
26294 if (isAllOnesConstant(Mask))
26295 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
26296
26297 MVT SrcVT = Src.getSimpleValueType();
26298 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26299 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26300 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
26301 PassThru, Mask);
26302
26303 }
26304 case CVTNEPS2BF16_MASK: {
26305 SDValue Src = Op.getOperand(1);
26306 SDValue PassThru = Op.getOperand(2);
26307 SDValue Mask = Op.getOperand(3);
26308
26309 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26310 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26311
26312 // Break false dependency.
26313 if (PassThru.isUndef())
26314 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26315
26316 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26317 Mask);
26318 }
26319 default:
26320 break;
26321 }
26322 }
26323
26324 switch (IntNo) {
26325 default: return SDValue(); // Don't custom lower most intrinsics.
26326
26327 // ptest and testp intrinsics. The intrinsics these come from are designed to
26328 // return an integer value, not just an instruction, so lower them to the ptest
26329 // or testp pattern and a setcc for the result.
26330 case Intrinsic::x86_avx512_ktestc_b:
26331 case Intrinsic::x86_avx512_ktestc_w:
26332 case Intrinsic::x86_avx512_ktestc_d:
26333 case Intrinsic::x86_avx512_ktestc_q:
26334 case Intrinsic::x86_avx512_ktestz_b:
26335 case Intrinsic::x86_avx512_ktestz_w:
26336 case Intrinsic::x86_avx512_ktestz_d:
26337 case Intrinsic::x86_avx512_ktestz_q:
26338 case Intrinsic::x86_sse41_ptestz:
26339 case Intrinsic::x86_sse41_ptestc:
26340 case Intrinsic::x86_sse41_ptestnzc:
26341 case Intrinsic::x86_avx_ptestz_256:
26342 case Intrinsic::x86_avx_ptestc_256:
26343 case Intrinsic::x86_avx_ptestnzc_256:
26344 case Intrinsic::x86_avx_vtestz_ps:
26345 case Intrinsic::x86_avx_vtestc_ps:
26346 case Intrinsic::x86_avx_vtestnzc_ps:
26347 case Intrinsic::x86_avx_vtestz_pd:
26348 case Intrinsic::x86_avx_vtestc_pd:
26349 case Intrinsic::x86_avx_vtestnzc_pd:
26350 case Intrinsic::x86_avx_vtestz_ps_256:
26351 case Intrinsic::x86_avx_vtestc_ps_256:
26352 case Intrinsic::x86_avx_vtestnzc_ps_256:
26353 case Intrinsic::x86_avx_vtestz_pd_256:
26354 case Intrinsic::x86_avx_vtestc_pd_256:
26355 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26356 unsigned TestOpc = X86ISD::PTEST;
26357 X86::CondCode X86CC;
26358 switch (IntNo) {
26359     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26360 case Intrinsic::x86_avx512_ktestc_b:
26361 case Intrinsic::x86_avx512_ktestc_w:
26362 case Intrinsic::x86_avx512_ktestc_d:
26363 case Intrinsic::x86_avx512_ktestc_q:
26364 // CF = 1
26365 TestOpc = X86ISD::KTEST;
26366 X86CC = X86::COND_B;
26367 break;
26368 case Intrinsic::x86_avx512_ktestz_b:
26369 case Intrinsic::x86_avx512_ktestz_w:
26370 case Intrinsic::x86_avx512_ktestz_d:
26371 case Intrinsic::x86_avx512_ktestz_q:
26372 TestOpc = X86ISD::KTEST;
26373 X86CC = X86::COND_E;
26374 break;
26375 case Intrinsic::x86_avx_vtestz_ps:
26376 case Intrinsic::x86_avx_vtestz_pd:
26377 case Intrinsic::x86_avx_vtestz_ps_256:
26378 case Intrinsic::x86_avx_vtestz_pd_256:
26379 TestOpc = X86ISD::TESTP;
26380       LLVM_FALLTHROUGH;
26381 case Intrinsic::x86_sse41_ptestz:
26382 case Intrinsic::x86_avx_ptestz_256:
26383 // ZF = 1
26384 X86CC = X86::COND_E;
26385 break;
26386 case Intrinsic::x86_avx_vtestc_ps:
26387 case Intrinsic::x86_avx_vtestc_pd:
26388 case Intrinsic::x86_avx_vtestc_ps_256:
26389 case Intrinsic::x86_avx_vtestc_pd_256:
26390 TestOpc = X86ISD::TESTP;
26391       LLVM_FALLTHROUGH;
26392 case Intrinsic::x86_sse41_ptestc:
26393 case Intrinsic::x86_avx_ptestc_256:
26394 // CF = 1
26395 X86CC = X86::COND_B;
26396 break;
26397 case Intrinsic::x86_avx_vtestnzc_ps:
26398 case Intrinsic::x86_avx_vtestnzc_pd:
26399 case Intrinsic::x86_avx_vtestnzc_ps_256:
26400 case Intrinsic::x86_avx_vtestnzc_pd_256:
26401 TestOpc = X86ISD::TESTP;
26402       LLVM_FALLTHROUGH;
26403 case Intrinsic::x86_sse41_ptestnzc:
26404 case Intrinsic::x86_avx_ptestnzc_256:
26405 // ZF and CF = 0
26406 X86CC = X86::COND_A;
26407 break;
26408 }
26409
26410 SDValue LHS = Op.getOperand(1);
26411 SDValue RHS = Op.getOperand(2);
26412 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26413 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26414 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26415 }
26416
26417 case Intrinsic::x86_sse42_pcmpistria128:
26418 case Intrinsic::x86_sse42_pcmpestria128:
26419 case Intrinsic::x86_sse42_pcmpistric128:
26420 case Intrinsic::x86_sse42_pcmpestric128:
26421 case Intrinsic::x86_sse42_pcmpistrio128:
26422 case Intrinsic::x86_sse42_pcmpestrio128:
26423 case Intrinsic::x86_sse42_pcmpistris128:
26424 case Intrinsic::x86_sse42_pcmpestris128:
26425 case Intrinsic::x86_sse42_pcmpistriz128:
26426 case Intrinsic::x86_sse42_pcmpestriz128: {
26427 unsigned Opcode;
26428 X86::CondCode X86CC;
26429 switch (IntNo) {
26430     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26431 case Intrinsic::x86_sse42_pcmpistria128:
26432 Opcode = X86ISD::PCMPISTR;
26433 X86CC = X86::COND_A;
26434 break;
26435 case Intrinsic::x86_sse42_pcmpestria128:
26436 Opcode = X86ISD::PCMPESTR;
26437 X86CC = X86::COND_A;
26438 break;
26439 case Intrinsic::x86_sse42_pcmpistric128:
26440 Opcode = X86ISD::PCMPISTR;
26441 X86CC = X86::COND_B;
26442 break;
26443 case Intrinsic::x86_sse42_pcmpestric128:
26444 Opcode = X86ISD::PCMPESTR;
26445 X86CC = X86::COND_B;
26446 break;
26447 case Intrinsic::x86_sse42_pcmpistrio128:
26448 Opcode = X86ISD::PCMPISTR;
26449 X86CC = X86::COND_O;
26450 break;
26451 case Intrinsic::x86_sse42_pcmpestrio128:
26452 Opcode = X86ISD::PCMPESTR;
26453 X86CC = X86::COND_O;
26454 break;
26455 case Intrinsic::x86_sse42_pcmpistris128:
26456 Opcode = X86ISD::PCMPISTR;
26457 X86CC = X86::COND_S;
26458 break;
26459 case Intrinsic::x86_sse42_pcmpestris128:
26460 Opcode = X86ISD::PCMPESTR;
26461 X86CC = X86::COND_S;
26462 break;
26463 case Intrinsic::x86_sse42_pcmpistriz128:
26464 Opcode = X86ISD::PCMPISTR;
26465 X86CC = X86::COND_E;
26466 break;
26467 case Intrinsic::x86_sse42_pcmpestriz128:
26468 Opcode = X86ISD::PCMPESTR;
26469 X86CC = X86::COND_E;
26470 break;
26471 }
26472 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26473 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26474 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26475 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26476 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26477 }
26478
26479 case Intrinsic::x86_sse42_pcmpistri128:
26480 case Intrinsic::x86_sse42_pcmpestri128: {
26481 unsigned Opcode;
26482 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26483 Opcode = X86ISD::PCMPISTR;
26484 else
26485 Opcode = X86ISD::PCMPESTR;
26486
26487 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26488 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26489 return DAG.getNode(Opcode, dl, VTs, NewOps);
26490 }
26491
26492 case Intrinsic::x86_sse42_pcmpistrm128:
26493 case Intrinsic::x86_sse42_pcmpestrm128: {
26494 unsigned Opcode;
26495 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26496 Opcode = X86ISD::PCMPISTR;
26497 else
26498 Opcode = X86ISD::PCMPESTR;
26499
26500 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26501 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26502 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26503 }
26504
26505 case Intrinsic::eh_sjlj_lsda: {
26506 MachineFunction &MF = DAG.getMachineFunction();
26507 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26508 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26509 auto &Context = MF.getMMI().getContext();
26510 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26511 Twine(MF.getFunctionNumber()));
26512 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26513 DAG.getMCSymbol(S, PtrVT));
26514 }
26515
26516 case Intrinsic::x86_seh_lsda: {
26517 // Compute the symbol for the LSDA. We know it'll get emitted later.
26518 MachineFunction &MF = DAG.getMachineFunction();
26519 SDValue Op1 = Op.getOperand(1);
26520 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26521 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26522 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26523
26524 // Generate a simple absolute symbol reference. This intrinsic is only
26525 // supported on 32-bit Windows, which isn't PIC.
26526 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26527 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26528 }
26529
26530 case Intrinsic::eh_recoverfp: {
26531 SDValue FnOp = Op.getOperand(1);
26532 SDValue IncomingFPOp = Op.getOperand(2);
26533 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26534 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26535 if (!Fn)
26536 report_fatal_error(
26537 "llvm.eh.recoverfp must take a function as the first argument");
26538 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26539 }
26540
26541 case Intrinsic::localaddress: {
26542 // Returns one of the stack, base, or frame pointer registers, depending on
26543 // which is used to reference local variables.
26544 MachineFunction &MF = DAG.getMachineFunction();
26545 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26546 unsigned Reg;
26547 if (RegInfo->hasBasePointer(MF))
26548 Reg = RegInfo->getBaseRegister();
26549 else { // Handles the SP or FP case.
26550 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26551 if (CantUseFP)
26552 Reg = RegInfo->getPtrSizedStackRegister(MF);
26553 else
26554 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26555 }
26556 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26557 }
26558 case Intrinsic::swift_async_context_addr: {
26559 auto &MF = DAG.getMachineFunction();
26560 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26561 if (Subtarget.is64Bit()) {
26562 MF.getFrameInfo().setFrameAddressIsTaken(true);
26563 X86FI->setHasSwiftAsyncContext(true);
26564 return SDValue(
26565 DAG.getMachineNode(
26566 X86::SUB64ri8, dl, MVT::i64,
26567 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26568 DAG.getTargetConstant(8, dl, MVT::i32)),
26569 0);
26570 } else {
26571       // 32-bit, so there is no special extended frame; create or reuse an
26572       // existing stack slot.
26573 if (!X86FI->getSwiftAsyncContextFrameIdx())
26574 X86FI->setSwiftAsyncContextFrameIdx(
26575 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26576 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26577 }
26578 }
26579 case Intrinsic::x86_avx512_vp2intersect_q_512:
26580 case Intrinsic::x86_avx512_vp2intersect_q_256:
26581 case Intrinsic::x86_avx512_vp2intersect_q_128:
26582 case Intrinsic::x86_avx512_vp2intersect_d_512:
26583 case Intrinsic::x86_avx512_vp2intersect_d_256:
26584 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26585 MVT MaskVT = Op.getSimpleValueType();
26586
26587 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26588 SDLoc DL(Op);
26589
26590 SDValue Operation =
26591 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26592 Op->getOperand(1), Op->getOperand(2));
26593
26594 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26595 MaskVT, Operation);
26596 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26597 MaskVT, Operation);
26598 return DAG.getMergeValues({Result0, Result1}, DL);
26599 }
26600 case Intrinsic::x86_mmx_pslli_w:
26601 case Intrinsic::x86_mmx_pslli_d:
26602 case Intrinsic::x86_mmx_pslli_q:
26603 case Intrinsic::x86_mmx_psrli_w:
26604 case Intrinsic::x86_mmx_psrli_d:
26605 case Intrinsic::x86_mmx_psrli_q:
26606 case Intrinsic::x86_mmx_psrai_w:
26607 case Intrinsic::x86_mmx_psrai_d: {
26608 SDLoc DL(Op);
26609 SDValue ShAmt = Op.getOperand(2);
26610 // If the argument is a constant, convert it to a target constant.
26611 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26612       // Clamp out-of-bounds shift amounts since they will otherwise be masked
26613       // to 8 bits, which may make them no longer out of bounds.
26614 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26615 if (ShiftAmount == 0)
26616 return Op.getOperand(1);
26617
26618 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26619 Op.getOperand(0), Op.getOperand(1),
26620 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26621 }
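    // Illustration (not part of the original file): per the comment above, a
    // 64-bit MMX shift by 256 would become a shift by 0 (a wrong no-op) once
    // masked to 8 bits, whereas the clamp to 255 keeps the amount out of range
    // so the instruction still produces the expected out-of-bounds result.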
26622
26623 unsigned NewIntrinsic;
26624 switch (IntNo) {
26625     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26626 case Intrinsic::x86_mmx_pslli_w:
26627 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26628 break;
26629 case Intrinsic::x86_mmx_pslli_d:
26630 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26631 break;
26632 case Intrinsic::x86_mmx_pslli_q:
26633 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26634 break;
26635 case Intrinsic::x86_mmx_psrli_w:
26636 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26637 break;
26638 case Intrinsic::x86_mmx_psrli_d:
26639 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26640 break;
26641 case Intrinsic::x86_mmx_psrli_q:
26642 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26643 break;
26644 case Intrinsic::x86_mmx_psrai_w:
26645 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26646 break;
26647 case Intrinsic::x86_mmx_psrai_d:
26648 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26649 break;
26650 }
26651
26652     // The vector shift intrinsics with scalars use 32-bit shift amounts, but
26653     // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26654     // MMX register.
26655 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26656 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26657 DAG.getTargetConstant(NewIntrinsic, DL,
26658 getPointerTy(DAG.getDataLayout())),
26659 Op.getOperand(1), ShAmt);
26660 }
26661 }
26662}
26663
26664static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26665 SDValue Src, SDValue Mask, SDValue Base,
26666 SDValue Index, SDValue ScaleOp, SDValue Chain,
26667 const X86Subtarget &Subtarget) {
26668 SDLoc dl(Op);
26669 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26670 // Scale must be constant.
26671 if (!C)
26672 return SDValue();
26673 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26674 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26675 TLI.getPointerTy(DAG.getDataLayout()));
26676 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26677 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26678 // If source is undef or we know it won't be used, use a zero vector
26679 // to break register dependency.
26680 // TODO: use undef instead and let BreakFalseDeps deal with it?
26681 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26682 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26683
26684 // Cast mask to an integer type.
26685 Mask = DAG.getBitcast(MaskVT, Mask);
26686
26687 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26688
26689 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26690 SDValue Res =
26691 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26692 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26693 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26694}
26695
26696static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26697 SDValue Src, SDValue Mask, SDValue Base,
26698 SDValue Index, SDValue ScaleOp, SDValue Chain,
26699 const X86Subtarget &Subtarget) {
26700 MVT VT = Op.getSimpleValueType();
26701 SDLoc dl(Op);
26702 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26703 // Scale must be constant.
26704 if (!C)
26705 return SDValue();
26706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26707 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26708 TLI.getPointerTy(DAG.getDataLayout()));
26709 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26710 VT.getVectorNumElements());
26711 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26712
26713 // We support two versions of the gather intrinsics. One with scalar mask and
26714 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26715 if (Mask.getValueType() != MaskVT)
26716 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26717
26718 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26719 // If source is undef or we know it won't be used, use a zero vector
26720 // to break register dependency.
26721 // TODO: use undef instead and let BreakFalseDeps deal with it?
26722 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26723 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26724
26725 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26726
26727 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26728 SDValue Res =
26729 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26730 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26731 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26732}
26733
26734static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26735 SDValue Src, SDValue Mask, SDValue Base,
26736 SDValue Index, SDValue ScaleOp, SDValue Chain,
26737 const X86Subtarget &Subtarget) {
26738 SDLoc dl(Op);
26739 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26740 // Scale must be constant.
26741 if (!C)
26742 return SDValue();
26743 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26744 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26745 TLI.getPointerTy(DAG.getDataLayout()));
26746 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26747 Src.getSimpleValueType().getVectorNumElements());
26748 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26749
26750 // We support two versions of the scatter intrinsics. One with scalar mask and
26751 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26752 if (Mask.getValueType() != MaskVT)
26753 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26754
26755 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26756
26757 SDVTList VTs = DAG.getVTList(MVT::Other);
26758 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26759 SDValue Res =
26760 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26761 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26762 return Res;
26763}
26764
26765static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26766 SDValue Mask, SDValue Base, SDValue Index,
26767 SDValue ScaleOp, SDValue Chain,
26768 const X86Subtarget &Subtarget) {
26769 SDLoc dl(Op);
26770 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26771 // Scale must be constant.
26772 if (!C)
26773 return SDValue();
26774 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26775 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26776 TLI.getPointerTy(DAG.getDataLayout()));
26777 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26778 SDValue Segment = DAG.getRegister(0, MVT::i32);
26779 MVT MaskVT =
26780 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26781 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26782 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26783 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26784 return SDValue(Res, 0);
26785}
26786
26787/// Handles the lowering of builtin intrinsics with chain that return their
26788/// value into registers EDX:EAX.
26789/// If operand SrcReg is a valid register identifier, then operand 2 of N is
26790/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26791/// TargetOpcode.
26792/// Returns a Glue value which can be used to add extra copy-from-reg if the
26793/// expanded intrinsic implicitly defines extra registers (i.e. not just
26794/// EDX:EAX).
26795static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26796 SelectionDAG &DAG,
26797 unsigned TargetOpcode,
26798 unsigned SrcReg,
26799 const X86Subtarget &Subtarget,
26800 SmallVectorImpl<SDValue> &Results) {
26801 SDValue Chain = N->getOperand(0);
26802 SDValue Glue;
26803
26804 if (SrcReg) {
26805     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26806 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26807 Glue = Chain.getValue(1);
26808 }
26809
26810 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26811 SDValue N1Ops[] = {Chain, Glue};
26812 SDNode *N1 = DAG.getMachineNode(
26813 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26814 Chain = SDValue(N1, 0);
26815
26816   // Read the instruction's 64-bit result back from EDX:EAX (RAX/RDX on
26816   // 64-bit targets).
26817 SDValue LO, HI;
26818 if (Subtarget.is64Bit()) {
26819 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26820 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26821 LO.getValue(2));
26822 } else {
26823 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26824 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26825 LO.getValue(2));
26826 }
26827 Chain = HI.getValue(1);
26828 Glue = HI.getValue(2);
26829
26830 if (Subtarget.is64Bit()) {
26831 // Merge the two 32-bit values into a 64-bit one.
26832 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26833 DAG.getConstant(32, DL, MVT::i8));
26834 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26835 Results.push_back(Chain);
26836 return Glue;
26837 }
26838
26839 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26840 SDValue Ops[] = { LO, HI };
26841 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26842 Results.push_back(Pair);
26843 Results.push_back(Chain);
26844 return Glue;
26845}
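
// Illustrative sketch (not part of the original file): a minimal scalar model
// of the EDX:EAX merge performed above, with Lo and Hi standing in for the
// 32-bit register contents.
static inline unsigned long long mergeEDXEAX(unsigned Lo, unsigned Hi) {
  // e.g. Hi = 0x00000002, Lo = 0x9ABCDEF0 gives 0x000000029ABCDEF0.
  return ((unsigned long long)Hi << 32) | Lo;
}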
26846
26847/// Handles the lowering of builtin intrinsics that read the time stamp counter
26848/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26849/// READCYCLECOUNTER nodes.
26850static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26851 SelectionDAG &DAG,
26852 const X86Subtarget &Subtarget,
26853 SmallVectorImpl<SDValue> &Results) {
26854 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26855 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26856 // and the EAX register is loaded with the low-order 32 bits.
26857 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26858 /* NoRegister */0, Subtarget,
26859 Results);
26860 if (Opcode != X86::RDTSCP)
26861 return;
26862
26863 SDValue Chain = Results[1];
26864   // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26865 // the ECX register. Add 'ecx' explicitly to the chain.
26866 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26867 Results[1] = ecx;
26868 Results.push_back(ecx.getValue(1));
26869}
26870
26871static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26872 SelectionDAG &DAG) {
26873 SmallVector<SDValue, 3> Results;
26874 SDLoc DL(Op);
26875 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26876 Results);
26877 return DAG.getMergeValues(Results, DL);
26878}
26879
26880static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26881 MachineFunction &MF = DAG.getMachineFunction();
26882 SDValue Chain = Op.getOperand(0);
26883 SDValue RegNode = Op.getOperand(2);
26884 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26885 if (!EHInfo)
26886 report_fatal_error("EH registrations only live in functions using WinEH");
26887
26888 // Cast the operand to an alloca, and remember the frame index.
26889 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26890 if (!FINode)
26891 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26892 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26893
26894 // Return the chain operand without making any DAG nodes.
26895 return Chain;
26896}
26897
26898static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26899 MachineFunction &MF = DAG.getMachineFunction();
26900 SDValue Chain = Op.getOperand(0);
26901 SDValue EHGuard = Op.getOperand(2);
26902 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26903 if (!EHInfo)
26904     report_fatal_error("EHGuard only lives in functions using WinEH");
26905
26906 // Cast the operand to an alloca, and remember the frame index.
26907 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26908 if (!FINode)
26909 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26910 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26911
26912 // Return the chain operand without making any DAG nodes.
26913 return Chain;
26914}
26915
26916/// Emit Truncating Store with signed or unsigned saturation.
26917static SDValue
26918EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26919 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26920 SelectionDAG &DAG) {
26921 SDVTList VTs = DAG.getVTList(MVT::Other);
26922 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26923 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26924 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26925 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26926}
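
// Illustrative sketch (not part of the original file): a rough element-wise
// model of the saturation these truncating stores imply, narrowing a 32-bit
// lane to 16 bits; the DAG nodes above only package the operands into a memory
// intrinsic, the clamping itself is done by the target instruction.
static inline unsigned short truncSatS16(int V) {      // signed saturation
  return (unsigned short)(V < -32768 ? -32768 : V > 32767 ? 32767 : V);
}
static inline unsigned short truncSatU16(unsigned V) { // unsigned saturation
  return (unsigned short)(V > 65535u ? 65535u : V);
}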
26927
26928/// Emit Masked Truncating Store with signed or unsigned saturation.
26929static SDValue
26930EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26931 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26932 MachineMemOperand *MMO, SelectionDAG &DAG) {
26933 SDVTList VTs = DAG.getVTList(MVT::Other);
26934 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26935 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26936 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26937}
26938
26939static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26940 SelectionDAG &DAG) {
26941 unsigned IntNo = Op.getConstantOperandVal(1);
26942 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26943 if (!IntrData) {
26944 switch (IntNo) {
26945 case llvm::Intrinsic::x86_seh_ehregnode:
26946 return MarkEHRegistrationNode(Op, DAG);
26947 case llvm::Intrinsic::x86_seh_ehguard:
26948 return MarkEHGuard(Op, DAG);
26949 case llvm::Intrinsic::x86_rdpkru: {
26950 SDLoc dl(Op);
26951 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26952 // Create a RDPKRU node and pass 0 to the ECX parameter.
26953 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26954 DAG.getConstant(0, dl, MVT::i32));
26955 }
26956 case llvm::Intrinsic::x86_wrpkru: {
26957 SDLoc dl(Op);
26958 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26959 // to the EDX and ECX parameters.
26960 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26961 Op.getOperand(0), Op.getOperand(2),
26962 DAG.getConstant(0, dl, MVT::i32),
26963 DAG.getConstant(0, dl, MVT::i32));
26964 }
26965 case llvm::Intrinsic::x86_flags_read_u32:
26966 case llvm::Intrinsic::x86_flags_read_u64:
26967 case llvm::Intrinsic::x86_flags_write_u32:
26968 case llvm::Intrinsic::x86_flags_write_u64: {
26969 // We need a frame pointer because this will get lowered to a PUSH/POP
26970 // sequence.
26971 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26972 MFI.setHasCopyImplyingStackAdjustment(true);
26973 // Don't do anything here, we will expand these intrinsics out later
26974 // during FinalizeISel in EmitInstrWithCustomInserter.
26975 return Op;
26976 }
26977 case Intrinsic::x86_lwpins32:
26978 case Intrinsic::x86_lwpins64:
26979 case Intrinsic::x86_umwait:
26980 case Intrinsic::x86_tpause: {
26981 SDLoc dl(Op);
26982 SDValue Chain = Op->getOperand(0);
26983 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26984 unsigned Opcode;
26985
26986 switch (IntNo) {
26987       default: llvm_unreachable("Impossible intrinsic");
26988 case Intrinsic::x86_umwait:
26989 Opcode = X86ISD::UMWAIT;
26990 break;
26991 case Intrinsic::x86_tpause:
26992 Opcode = X86ISD::TPAUSE;
26993 break;
26994 case Intrinsic::x86_lwpins32:
26995 case Intrinsic::x86_lwpins64:
26996 Opcode = X86ISD::LWPINS;
26997 break;
26998 }
26999
27000 SDValue Operation =
27001 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27002 Op->getOperand(3), Op->getOperand(4));
27003 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27004 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27005 Operation.getValue(1));
27006 }
27007 case Intrinsic::x86_enqcmd:
27008 case Intrinsic::x86_enqcmds: {
27009 SDLoc dl(Op);
27010 SDValue Chain = Op.getOperand(0);
27011 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27012 unsigned Opcode;
27013 switch (IntNo) {
27014       default: llvm_unreachable("Impossible intrinsic!");
27015 case Intrinsic::x86_enqcmd:
27016 Opcode = X86ISD::ENQCMD;
27017 break;
27018 case Intrinsic::x86_enqcmds:
27019 Opcode = X86ISD::ENQCMDS;
27020 break;
27021 }
27022 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27023 Op.getOperand(3));
27024 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27025 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27026 Operation.getValue(1));
27027 }
27028 case Intrinsic::x86_aesenc128kl:
27029 case Intrinsic::x86_aesdec128kl:
27030 case Intrinsic::x86_aesenc256kl:
27031 case Intrinsic::x86_aesdec256kl: {
27032 SDLoc DL(Op);
27033 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27034 SDValue Chain = Op.getOperand(0);
27035 unsigned Opcode;
27036
27037 switch (IntNo) {
27038       default: llvm_unreachable("Impossible intrinsic");
27039 case Intrinsic::x86_aesenc128kl:
27040 Opcode = X86ISD::AESENC128KL;
27041 break;
27042 case Intrinsic::x86_aesdec128kl:
27043 Opcode = X86ISD::AESDEC128KL;
27044 break;
27045 case Intrinsic::x86_aesenc256kl:
27046 Opcode = X86ISD::AESENC256KL;
27047 break;
27048 case Intrinsic::x86_aesdec256kl:
27049 Opcode = X86ISD::AESDEC256KL;
27050 break;
27051 }
27052
27053 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27054 MachineMemOperand *MMO = MemIntr->getMemOperand();
27055 EVT MemVT = MemIntr->getMemoryVT();
27056 SDValue Operation = DAG.getMemIntrinsicNode(
27057 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27058 MMO);
27059 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27060
27061 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27062 {ZF, Operation.getValue(0), Operation.getValue(2)});
27063 }
27064 case Intrinsic::x86_aesencwide128kl:
27065 case Intrinsic::x86_aesdecwide128kl:
27066 case Intrinsic::x86_aesencwide256kl:
27067 case Intrinsic::x86_aesdecwide256kl: {
27068 SDLoc DL(Op);
27069 SDVTList VTs = DAG.getVTList(
27070 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27071 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27072 SDValue Chain = Op.getOperand(0);
27073 unsigned Opcode;
27074
27075 switch (IntNo) {
27076       default: llvm_unreachable("Impossible intrinsic");
27077 case Intrinsic::x86_aesencwide128kl:
27078 Opcode = X86ISD::AESENCWIDE128KL;
27079 break;
27080 case Intrinsic::x86_aesdecwide128kl:
27081 Opcode = X86ISD::AESDECWIDE128KL;
27082 break;
27083 case Intrinsic::x86_aesencwide256kl:
27084 Opcode = X86ISD::AESENCWIDE256KL;
27085 break;
27086 case Intrinsic::x86_aesdecwide256kl:
27087 Opcode = X86ISD::AESDECWIDE256KL;
27088 break;
27089 }
27090
27091 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27092 MachineMemOperand *MMO = MemIntr->getMemOperand();
27093 EVT MemVT = MemIntr->getMemoryVT();
27094 SDValue Operation = DAG.getMemIntrinsicNode(
27095 Opcode, DL, VTs,
27096 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27097 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27098 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27099 MemVT, MMO);
27100 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27101
27102 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27103 {ZF, Operation.getValue(1), Operation.getValue(2),
27104 Operation.getValue(3), Operation.getValue(4),
27105 Operation.getValue(5), Operation.getValue(6),
27106 Operation.getValue(7), Operation.getValue(8),
27107 Operation.getValue(9)});
27108 }
27109 case Intrinsic::x86_testui: {
27110 SDLoc dl(Op);
27111 SDValue Chain = Op.getOperand(0);
27112 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27113 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27114 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27115 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27116 Operation.getValue(1));
27117 }
27118 }
27119 return SDValue();
27120 }
27121
27122 SDLoc dl(Op);
27123 switch(IntrData->Type) {
27124 default: llvm_unreachable("Unknown Intrinsic Type");
27125 case RDSEED:
27126 case RDRAND: {
27127 // Emit the node with the right value type.
27128 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27129 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27130
27131 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27132 // Otherwise return the value from Rand, which is always 0, cast to i32.
27133 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27134 DAG.getConstant(1, dl, Op->getValueType(1)),
27135 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27136 SDValue(Result.getNode(), 1)};
27137 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27138
27139 // Return { result, isValid, chain }.
27140 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27141 SDValue(Result.getNode(), 2));
27142 }
27143 case GATHER_AVX2: {
27144 SDValue Chain = Op.getOperand(0);
27145 SDValue Src = Op.getOperand(2);
27146 SDValue Base = Op.getOperand(3);
27147 SDValue Index = Op.getOperand(4);
27148 SDValue Mask = Op.getOperand(5);
27149 SDValue Scale = Op.getOperand(6);
27150 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27151 Scale, Chain, Subtarget);
27152 }
27153 case GATHER: {
27154 //gather(v1, mask, index, base, scale);
27155 SDValue Chain = Op.getOperand(0);
27156 SDValue Src = Op.getOperand(2);
27157 SDValue Base = Op.getOperand(3);
27158 SDValue Index = Op.getOperand(4);
27159 SDValue Mask = Op.getOperand(5);
27160 SDValue Scale = Op.getOperand(6);
27161 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27162 Chain, Subtarget);
27163 }
27164 case SCATTER: {
27165 //scatter(base, mask, index, v1, scale);
27166 SDValue Chain = Op.getOperand(0);
27167 SDValue Base = Op.getOperand(2);
27168 SDValue Mask = Op.getOperand(3);
27169 SDValue Index = Op.getOperand(4);
27170 SDValue Src = Op.getOperand(5);
27171 SDValue Scale = Op.getOperand(6);
27172 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27173 Scale, Chain, Subtarget);
27174 }
27175 case PREFETCH: {
27176 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27177 assert((HintVal == 2 || HintVal == 3) &&
27178 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27179 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27180 SDValue Chain = Op.getOperand(0);
27181 SDValue Mask = Op.getOperand(2);
27182 SDValue Index = Op.getOperand(3);
27183 SDValue Base = Op.getOperand(4);
27184 SDValue Scale = Op.getOperand(5);
27185 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27186 Subtarget);
27187 }
27188 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27189 case RDTSC: {
27190 SmallVector<SDValue, 2> Results;
27191 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27192 Results);
27193 return DAG.getMergeValues(Results, dl);
27194 }
27195 // Read Performance Monitoring Counters.
27196 case RDPMC:
27197 // Get Extended Control Register.
27198 case XGETBV: {
27199 SmallVector<SDValue, 2> Results;
27200
27201 // RDPMC uses ECX to select the index of the performance counter to read.
27202 // XGETBV uses ECX to select the index of the XCR register to return.
27203 // The result is stored into registers EDX:EAX.
27204 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27205 Subtarget, Results);
27206 return DAG.getMergeValues(Results, dl);
27207 }
27208 // XTEST intrinsics.
27209 case XTEST: {
27210 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27211 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27212
27213 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27214 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27215 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27216 Ret, SDValue(InTrans.getNode(), 1));
27217 }
27218 case TRUNCATE_TO_MEM_VI8:
27219 case TRUNCATE_TO_MEM_VI16:
27220 case TRUNCATE_TO_MEM_VI32: {
27221 SDValue Mask = Op.getOperand(4);
27222 SDValue DataToTruncate = Op.getOperand(3);
27223 SDValue Addr = Op.getOperand(2);
27224 SDValue Chain = Op.getOperand(0);
27225
27226 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27227 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27228
27229 EVT MemVT = MemIntr->getMemoryVT();
27230
27231 uint16_t TruncationOp = IntrData->Opc0;
27232 switch (TruncationOp) {
27233 case X86ISD::VTRUNC: {
27234 if (isAllOnesConstant(Mask)) // return just a truncate store
27235 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27236 MemIntr->getMemOperand());
27237
27238 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27239 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27240 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27241
27242 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27243 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27244 true /* truncating */);
27245 }
27246 case X86ISD::VTRUNCUS:
27247 case X86ISD::VTRUNCS: {
27248 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27249 if (isAllOnesConstant(Mask))
27250 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27251 MemIntr->getMemOperand(), DAG);
27252
27253 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27254 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27255
27256 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27257 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27258 }
27259 default:
27260 llvm_unreachable("Unsupported truncstore intrinsic");
27261 }
27262 }
27263 }
27264}
27265
27266SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27267 SelectionDAG &DAG) const {
27268 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27269 MFI.setReturnAddressIsTaken(true);
27270
27271 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27272 return SDValue();
27273
27274 unsigned Depth = Op.getConstantOperandVal(0);
27275 SDLoc dl(Op);
27276 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27277
27278 if (Depth > 0) {
27279 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27280 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27281 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27282 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27283 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27284 MachinePointerInfo());
27285 }
27286
27287 // Just load the return address.
27288 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27289 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27290 MachinePointerInfo());
27291}
27292
27293SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27294 SelectionDAG &DAG) const {
27295 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27296 return getReturnAddressFrameIndex(DAG);
27297}
27298
27299SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27300 MachineFunction &MF = DAG.getMachineFunction();
27301 MachineFrameInfo &MFI = MF.getFrameInfo();
27302 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27303 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27304 EVT VT = Op.getValueType();
27305
27306 MFI.setFrameAddressIsTaken(true);
27307
27308 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27309 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27310 // is not possible to crawl up the stack without looking at the unwind codes
27311 // simultaneously.
27312 int FrameAddrIndex = FuncInfo->getFAIndex();
27313 if (!FrameAddrIndex) {
27314 // Set up a frame object for the return address.
27315 unsigned SlotSize = RegInfo->getSlotSize();
27316 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27317 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27318 FuncInfo->setFAIndex(FrameAddrIndex);
27319 }
27320 return DAG.getFrameIndex(FrameAddrIndex, VT);
27321 }
27322
27323 unsigned FrameReg =
27324 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27325 SDLoc dl(Op); // FIXME probably not meaningful
27326 unsigned Depth = Op.getConstantOperandVal(0);
27327 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27328 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27329 "Invalid Frame Register!");
27330 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27331 while (Depth--)
27332 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27333 MachinePointerInfo());
27334 return FrameAddr;
27335}
27336
27337// FIXME? Maybe this could be a TableGen attribute on some registers and
27338// this table could be generated automatically from RegInfo.
27339Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27340 const MachineFunction &MF) const {
27341 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27342
27343 Register Reg = StringSwitch<unsigned>(RegName)
27344 .Case("esp", X86::ESP)
27345 .Case("rsp", X86::RSP)
27346 .Case("ebp", X86::EBP)
27347 .Case("rbp", X86::RBP)
27348 .Default(0);
27349
27350 if (Reg == X86::EBP || Reg == X86::RBP) {
27351 if (!TFI.hasFP(MF))
27352 report_fatal_error("register " + StringRef(RegName) +
27353 " is allocatable: function has no frame pointer");
27354#ifndef NDEBUG
27355 else {
27356 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27357 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27358 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27359 "Invalid Frame Register!");
27360 }
27361#endif
27362 }
27363
27364 if (Reg)
27365 return Reg;
27366
27367 report_fatal_error("Invalid register name global variable");
27368}
27369
27370SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27371 SelectionDAG &DAG) const {
27372 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27373 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27374}
27375
27376Register X86TargetLowering::getExceptionPointerRegister(
27377 const Constant *PersonalityFn) const {
27378 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27379 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27380
27381 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27382}
27383
27384Register X86TargetLowering::getExceptionSelectorRegister(
27385 const Constant *PersonalityFn) const {
27386 // Funclet personalities don't use selectors (the runtime does the selection).
27387 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27388 return X86::NoRegister;
27389 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27390}
27391
27392bool X86TargetLowering::needsFixedCatchObjects() const {
27393 return Subtarget.isTargetWin64();
27394}
27395
27396SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27397 SDValue Chain = Op.getOperand(0);
27398 SDValue Offset = Op.getOperand(1);
27399 SDValue Handler = Op.getOperand(2);
27400 SDLoc dl (Op);
27401
27402 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27403 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27404 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27405 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27406 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27407 "Invalid Frame Register!");
27408 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27409 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27410
27411 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27412 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27413 dl));
27414 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27415 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27416 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27417
27418 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27419 DAG.getRegister(StoreAddrReg, PtrVT));
27420}
27421
27422SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27423 SelectionDAG &DAG) const {
27424 SDLoc DL(Op);
27425 // If the subtarget is not 64-bit, we may need the global base reg
27426 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
27427 // Therefore, ask for the GlobalBaseReg now, so that the pass
27428 // inserts the code for us in case we need it.
27429 // Otherwise, we will end up in a situation where we will
27430 // reference a virtual register that is not defined!
27431 if (!Subtarget.is64Bit()) {
27432 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27433 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27434 }
27435 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27436 DAG.getVTList(MVT::i32, MVT::Other),
27437 Op.getOperand(0), Op.getOperand(1));
27438}
27439
27440SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27441 SelectionDAG &DAG) const {
27442 SDLoc DL(Op);
27443 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27444 Op.getOperand(0), Op.getOperand(1));
27445}
27446
27447SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27448 SelectionDAG &DAG) const {
27449 SDLoc DL(Op);
27450 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27451 Op.getOperand(0));
27452}
27453
27454static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27455 return Op.getOperand(0);
27456}
27457
27458SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27459 SelectionDAG &DAG) const {
27460 SDValue Root = Op.getOperand(0);
27461 SDValue Trmp = Op.getOperand(1); // trampoline
27462 SDValue FPtr = Op.getOperand(2); // nested function
27463 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27464 SDLoc dl (Op);
27465
27466 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27467 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27468
27469 if (Subtarget.is64Bit()) {
27470 SDValue OutChains[6];
27471
27472 // Large code-model.
27473 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27474 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27475
27476 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27477 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27478
27479 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27480
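// The stores below assemble (as little-endian i16 opcode pairs) roughly the
// following 23 bytes at Trmp:
//   +0:  49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
//   +10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
//   +20: 49 FF E3                jmpq   *%r11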
27481 // Load the pointer to the nested function into R11.
27482 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27483 SDValue Addr = Trmp;
27484 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27485 Addr, MachinePointerInfo(TrmpAddr));
27486
27487 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27488 DAG.getConstant(2, dl, MVT::i64));
27489 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27490 MachinePointerInfo(TrmpAddr, 2), Align(2));
27491
27492 // Load the 'nest' parameter value into R10.
27493 // R10 is specified in X86CallingConv.td
27494 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27495 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27496 DAG.getConstant(10, dl, MVT::i64));
27497 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27498 Addr, MachinePointerInfo(TrmpAddr, 10));
27499
27500 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27501 DAG.getConstant(12, dl, MVT::i64));
27502 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27503 MachinePointerInfo(TrmpAddr, 12), Align(2));
27504
27505 // Jump to the nested function.
27506 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27507 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27508 DAG.getConstant(20, dl, MVT::i64));
27509 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27510 Addr, MachinePointerInfo(TrmpAddr, 20));
27511
27512 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27513 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27514 DAG.getConstant(22, dl, MVT::i64));
27515 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27516 Addr, MachinePointerInfo(TrmpAddr, 22));
27517
27518 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27519 } else {
27520 const Function *Func =
27521 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27522 CallingConv::ID CC = Func->getCallingConv();
27523 unsigned NestReg;
27524
27525 switch (CC) {
27526 default:
27527 llvm_unreachable("Unsupported calling convention");
27528 case CallingConv::C:
27529 case CallingConv::X86_StdCall: {
27530 // Pass 'nest' parameter in ECX.
27531 // Must be kept in sync with X86CallingConv.td
27532 NestReg = X86::ECX;
27533
27534 // Check that ECX wasn't needed by an 'inreg' parameter.
27535 FunctionType *FTy = Func->getFunctionType();
27536 const AttributeList &Attrs = Func->getAttributes();
27537
27538 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27539 unsigned InRegCount = 0;
27540 unsigned Idx = 0;
27541
27542 for (FunctionType::param_iterator I = FTy->param_begin(),
27543 E = FTy->param_end(); I != E; ++I, ++Idx)
27544 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27545 const DataLayout &DL = DAG.getDataLayout();
27546 // FIXME: should only count parameters that are lowered to integers.
27547 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27548 }
27549
27550 if (InRegCount > 2) {
27551 report_fatal_error("Nest register in use - reduce number of inreg"
27552 " parameters!");
27553 }
27554 }
27555 break;
27556 }
27557 case CallingConv::X86_FastCall:
27558 case CallingConv::X86_ThisCall:
27559 case CallingConv::Fast:
27560 case CallingConv::Tail:
27561 case CallingConv::SwiftTail:
27562 // Pass 'nest' parameter in EAX.
27563 // Must be kept in sync with X86CallingConv.td
27564 NestReg = X86::EAX;
27565 break;
27566 }
27567
27568 SDValue OutChains[4];
27569 SDValue Addr, Disp;
27570
27571 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27572 DAG.getConstant(10, dl, MVT::i32));
27573 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27574
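// The 10-byte trampoline written below is, roughly:
//   +0: B8+reg <Nest, 4 bytes>   movl $Nest, %NestReg
//   +5: E9     <Disp, 4 bytes>   jmp  FPtr
// Disp is relative to the end of the trampoline (Trmp + 10), which is why it
// is computed against Trmp + 10 above.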
27575 // This is storing the opcode for MOV32ri.
27576 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27577 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27578 OutChains[0] =
27579 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27580 Trmp, MachinePointerInfo(TrmpAddr));
27581
27582 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27583 DAG.getConstant(1, dl, MVT::i32));
27584 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27585 MachinePointerInfo(TrmpAddr, 1), Align(1));
27586
27587 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27588 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27589 DAG.getConstant(5, dl, MVT::i32));
27590 OutChains[2] =
27591 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27592 MachinePointerInfo(TrmpAddr, 5), Align(1));
27593
27594 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27595 DAG.getConstant(6, dl, MVT::i32));
27596 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27597 MachinePointerInfo(TrmpAddr, 6), Align(1));
27598
27599 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27600 }
27601}
27602
27603SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
27604 SelectionDAG &DAG) const {
27605 /*
27606 The rounding mode is in bits 11:10 of FPSR, and has the following
27607 settings:
27608 00 Round to nearest
27609 01 Round to -inf
27610 10 Round to +inf
27611 11 Round to 0
27612
27613 FLT_ROUNDS, on the other hand, expects the following:
27614 -1 Undefined
27615 0 Round to 0
27616 1 Round to nearest
27617 2 Round to +inf
27618 3 Round to -inf
27619
27620 To perform the conversion, we use a packed lookup table of the four 2-bit
27621 values that we can index by FPSR[11:10]
27622 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27623
27624 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27625 */
27626
27627 MachineFunction &MF = DAG.getMachineFunction();
27628 MVT VT = Op.getSimpleValueType();
27629 SDLoc DL(Op);
27630
27631 // Save FP Control Word to stack slot
27632 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27633 SDValue StackSlot =
27634 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27635
27636 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27637
27638 SDValue Chain = Op.getOperand(0);
27639 SDValue Ops[] = {Chain, StackSlot};
27640 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27641 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27642 Align(2), MachineMemOperand::MOStore);
27643
27644 // Load FP Control Word from stack slot
27645 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27646 Chain = CWD.getValue(1);
27647
27648 // Mask and turn the control bits into a shift for the lookup table.
27649 SDValue Shift =
27650 DAG.getNode(ISD::SRL, DL, MVT::i16,
27651 DAG.getNode(ISD::AND, DL, MVT::i16,
27652 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27653 DAG.getConstant(9, DL, MVT::i8));
27654 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27655
27656 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27657 SDValue RetVal =
27658 DAG.getNode(ISD::AND, DL, MVT::i32,
27659 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27660 DAG.getConstant(3, DL, MVT::i32));
27661
27662 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27663
27664 return DAG.getMergeValues({RetVal, Chain}, DL);
27665}
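// A minimal, self-contained sketch (not part of this file) checking the packed
// lookup-table conversion used by LowerFLT_ROUNDS_ above; the helper name is
// made up for illustration only.
namespace {
constexpr int FltRoundsFromX87CW(unsigned CWD) {
  return (0x2d >> ((CWD & 0xc00) >> 9)) & 3;
}
static_assert(FltRoundsFromX87CW(0x000) == 1, "RM=00 (nearest) -> 1");
static_assert(FltRoundsFromX87CW(0x400) == 3, "RM=01 (-inf)    -> 3");
static_assert(FltRoundsFromX87CW(0x800) == 2, "RM=10 (+inf)    -> 2");
static_assert(FltRoundsFromX87CW(0xc00) == 0, "RM=11 (zero)    -> 0");
} // namespace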
27666
27667SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27668 SelectionDAG &DAG) const {
27669 MachineFunction &MF = DAG.getMachineFunction();
27670 SDLoc DL(Op);
27671 SDValue Chain = Op.getNode()->getOperand(0);
27672
27673 // The FP control word can only be set from data in memory, so we need to
27674 // allocate stack space to save/load the FP control word.
27675 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27676 SDValue StackSlot =
27677 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27678 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27679 MachineMemOperand *MMO =
27680 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27681
27682 // Store FP control word into memory.
27683 SDValue Ops[] = {Chain, StackSlot};
27684 Chain = DAG.getMemIntrinsicNode(
27685 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27686
27687 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27688 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27689 Chain = CWD.getValue(1);
27690 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27691 DAG.getConstant(0xf3ff, DL, MVT::i16));
27692
27693 // Calculate new rounding mode.
27694 SDValue NewRM = Op.getNode()->getOperand(1);
27695 SDValue RMBits;
27696 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27697 uint64_t RM = CVal->getZExtValue();
27698 int FieldVal;
27699 switch (static_cast<RoundingMode>(RM)) {
27700 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27701 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27702 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27703 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27704 default:
27705 llvm_unreachable("rounding mode is not supported by X86 hardware");
27706 }
27707 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27708 } else {
27709 // Need to convert argument into bits of control word:
27710 // 0 Round to 0 -> 11
27711 // 1 Round to nearest -> 00
27712 // 2 Round to +inf -> 10
27713 // 3 Round to -inf -> 01
27714 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27715 // To make the conversion, put all these values into a value 0xc9 and shift
27716 // it left depending on the rounding mode:
27717 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27718 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27719 // ...
27720 // (0xc9 << (2 * NewRM + 4)) & 0xc00
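// For example (using the x87 encoding listed above, RM in bits 11:10):
//   NewRM = 0 (toward zero): (0xc9 <<  4) & 0xc00 = 0xc00  -> RM = 11
//   NewRM = 1 (nearest):     (0xc9 <<  6) & 0xc00 = 0x000  -> RM = 00
//   NewRM = 2 (+inf):        (0xc9 <<  8) & 0xc00 = 0x800  -> RM = 10
//   NewRM = 3 (-inf):        (0xc9 << 10) & 0xc00 = 0x400  -> RM = 01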
27721 SDValue ShiftValue =
27722 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27723 DAG.getNode(ISD::ADD, DL, MVT::i32,
27724 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27725 DAG.getConstant(1, DL, MVT::i8)),
27726 DAG.getConstant(4, DL, MVT::i32)));
27727 SDValue Shifted =
27728 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27729 ShiftValue);
27730 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27731 DAG.getConstant(0xc00, DL, MVT::i16));
27732 }
27733
27734 // Update rounding mode bits and store the new FP Control Word into stack.
27735 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27736 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27737
27738 // Load FP control word from the slot.
27739 SDValue OpsLD[] = {Chain, StackSlot};
27740 MachineMemOperand *MMOL =
27741 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27742 Chain = DAG.getMemIntrinsicNode(
27743 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27744
27745 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27746 // same way but in bits 14:13.
27747 if (Subtarget.hasSSE1()) {
27748 // Store MXCSR into memory.
27749 Chain = DAG.getNode(
27750 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27751 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27752 StackSlot);
27753
27754 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27755 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27756 Chain = CWD.getValue(1);
27757 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27758 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27759
27760 // Shift X87 RM bits from 11:10 to 14:13.
27761 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27762 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27763 DAG.getConstant(3, DL, MVT::i8));
27764
27765 // Update rounding mode bits and store the new FP Control Word into stack.
27766 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27767 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27768
27769 // Load MXCSR from the slot.
27770 Chain = DAG.getNode(
27771 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27772 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27773 StackSlot);
27774 }
27775
27776 return Chain;
27777}
27778
27779 /// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
27780//
27781// i8/i16 vector implemented using dword LZCNT vector instruction
27782// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27783 // split the vector, perform the operation on its Lo and Hi parts and
27784// concatenate the results.
27785static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27786 const X86Subtarget &Subtarget) {
27787 assert(Op.getOpcode() == ISD::CTLZ);
27788 SDLoc dl(Op);
27789 MVT VT = Op.getSimpleValueType();
27790 MVT EltVT = VT.getVectorElementType();
27791 unsigned NumElems = VT.getVectorNumElements();
27792
27793 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27794 "Unsupported element type");
27795
27796 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27797 if (NumElems > 16 ||
27798 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27799 return splitVectorIntUnary(Op, DAG);
27800
27801 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27802 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27803 "Unsupported value type for operation");
27804
27805 // Use the natively supported vector instruction vplzcntd.
27806 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27807 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27808 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27809 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
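// ctlz of the zero-extended value also counts the 32 - EltSize leading zero
// bits introduced by the extension, so subtract that delta to get the count
// for the original element width.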
27810
27811 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27812}
27813
27814// Lower CTLZ using a PSHUFB lookup table implementation.
27815static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27816 const X86Subtarget &Subtarget,
27817 SelectionDAG &DAG) {
27818 MVT VT = Op.getSimpleValueType();
27819 int NumElts = VT.getVectorNumElements();
27820 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27821 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27822
27823 // Per-nibble leading zero PSHUFB lookup table.
27824 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27825 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27826 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27827 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27828
27829 SmallVector<SDValue, 64> LUTVec;
27830 for (int i = 0; i < NumBytes; ++i)
27831 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27832 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27833
27834 // Begin by bitcasting the input to a byte vector, then split those bytes
27835 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27836 // If the hi input nibble is zero then we add both results together, otherwise
27837 // we just take the hi result (by masking the lo result to zero before the
27838 // add).
27839 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27840 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27841
27842 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27843 SDValue Lo = Op0;
27844 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27845 SDValue HiZ;
27846 if (CurrVT.is512BitVector()) {
27847 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27848 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27849 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27850 } else {
27851 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27852 }
27853
27854 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27855 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27856 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27857 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27858
27859 // Merge the result from vXi8 back to VT, working on the lo/hi halves
27860 // of the current vector width in the same way we did for the nibbles.
27861 // If the upper half of the input element is zero then add the halves'
27862 // leading zero counts together, otherwise just use the upper half's.
27863 // Double the width of the result until we are at target width.
27864 while (CurrVT != VT) {
27865 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27866 int CurrNumElts = CurrVT.getVectorNumElements();
27867 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27868 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27869 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27870
27871 // Check if the upper half of the input element is zero.
27872 if (CurrVT.is512BitVector()) {
27873 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27874 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27875 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27876 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27877 } else {
27878 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27879 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27880 }
27881 HiZ = DAG.getBitcast(NextVT, HiZ);
27882
27883 // Move the upper/lower halves to the lower bits as we'll be extending to
27884 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27885 // together.
27886 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27887 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27888 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27889 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27890 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27891 CurrVT = NextVT;
27892 }
27893
27894 return Res;
27895}
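// A minimal scalar model (not part of this file) of the nibble-LUT scheme
// above, for a single byte; the vector code applies the same idea lane-wise
// with PSHUFB. The helper name is made up for illustration only.
static inline unsigned Ctlz8ViaNibbleLUT(unsigned char X) {
  static const unsigned char LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                        0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Hi = LUT[X >> 4];
  unsigned Lo = LUT[X & 0xf];
  // If the high nibble is zero, its count (4) plus the low nibble's count is
  // the answer; otherwise the high nibble's count alone is.
  return (X >> 4) ? Hi : Hi + Lo;
}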
27896
27897static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27898 const X86Subtarget &Subtarget,
27899 SelectionDAG &DAG) {
27900 MVT VT = Op.getSimpleValueType();
27901
27902 if (Subtarget.hasCDI() &&
27903 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27904 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27905 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27906
27907 // Decompose 256-bit ops into smaller 128-bit ops.
27908 if (VT.is256BitVector() && !Subtarget.hasInt256())
27909 return splitVectorIntUnary(Op, DAG);
27910
27911 // Decompose 512-bit ops into smaller 256-bit ops.
27912 if (VT.is512BitVector() && !Subtarget.hasBWI())
27913 return splitVectorIntUnary(Op, DAG);
27914
27915 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27916 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27917}
27918
27919static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27920 SelectionDAG &DAG) {
27921 MVT VT = Op.getSimpleValueType();
27922 MVT OpVT = VT;
27923 unsigned NumBits = VT.getSizeInBits();
27924 SDLoc dl(Op);
27925 unsigned Opc = Op.getOpcode();
27926
27927 if (VT.isVector())
27928 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27929
27930 Op = Op.getOperand(0);
27931 if (VT == MVT::i8) {
27932 // Zero extend to i32 since there is not an i8 bsr.
27933 OpVT = MVT::i32;
27934 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27935 }
27936
27937 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27938 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27939 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27940
27941 if (Opc == ISD::CTLZ) {
27942 // If src is zero (i.e. bsr sets ZF), returns NumBits.
27943 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27944 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27945 Op.getValue(1)};
27946 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27947 }
27948
27949 // Finally xor with NumBits-1.
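// bsr leaves the index of the most significant set bit. For a power-of-two
// bit width, (NumBits - 1) - Idx == (NumBits - 1) ^ Idx, which is the leading
// zero count; the 2 * NumBits - 1 constant selected above likewise becomes
// NumBits after this xor, giving CTLZ(0) == NumBits.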
27950 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27951 DAG.getConstant(NumBits - 1, dl, OpVT));
27952
27953 if (VT == MVT::i8)
27954 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27955 return Op;
27956}
27957
27958static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27959 SelectionDAG &DAG) {
27960 MVT VT = Op.getSimpleValueType();
27961 unsigned NumBits = VT.getScalarSizeInBits();
27962 SDValue N0 = Op.getOperand(0);
27963 SDLoc dl(Op);
27964
27965 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27966 "Only scalar CTTZ requires custom lowering");
27967
27968 // Issue a bsf (scan bits forward) which also sets EFLAGS.
27969 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27970 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27971
27972 // If src is zero (i.e. bsf sets ZF), returns NumBits.
27973 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27974 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27975 Op.getValue(1)};
27976 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27977}
27978
27979static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27980 const X86Subtarget &Subtarget) {
27981 MVT VT = Op.getSimpleValueType();
27982 if (VT == MVT::i16 || VT == MVT::i32)
27983 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27984
27985 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27986 return splitVectorIntBinary(Op, DAG);
27987
27988 assert(Op.getSimpleValueType().is256BitVector() &&
27989 Op.getSimpleValueType().isInteger() &&
27990 "Only handle AVX 256-bit vector integer operation");
27991 return splitVectorIntBinary(Op, DAG);
27992}
27993
27994static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27995 const X86Subtarget &Subtarget) {
27996 MVT VT = Op.getSimpleValueType();
27997 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27998 unsigned Opcode = Op.getOpcode();
27999 SDLoc DL(Op);
28000
28001 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28002 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28003 assert(Op.getSimpleValueType().isInteger() &&
28004 "Only handle AVX vector integer operation");
28005 return splitVectorIntBinary(Op, DAG);
28006 }
28007
28008 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28009 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28010 EVT SetCCResultType =
28011 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28012
28013 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
28014 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28015 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28016 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28017 // TODO: Move this to DAGCombiner?
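// If every lane of Cmp is known to be all-ones or all-zeros (all sign bits),
// selecting between Sub and 0 is the same as AND'ing Cmp with Sub.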
28018 if (SetCCResultType == VT &&
28019 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28020 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28021 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28022 }
28023
28024 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28025 (!VT.isVector() || VT == MVT::v2i64)) {
28026 unsigned BitWidth = VT.getScalarSizeInBits();
28027 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28028 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28029 SDValue Zero = DAG.getConstant(0, DL, VT);
28030 SDValue Result =
28031 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28032 DAG.getVTList(VT, SetCCResultType), X, Y);
28033 SDValue SumDiff = Result.getValue(0);
28034 SDValue Overflow = Result.getValue(1);
28035 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28036 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
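// On overflow the wrapped SumDiff has the opposite sign of the mathematically
// correct result, so a negative SumDiff means saturating to the maximum and a
// non-negative one means saturating to the minimum.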
28037 SDValue SumNeg =
28038 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28039 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28040 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28041 }
28042
28043 // Use default expansion.
28044 return SDValue();
28045}
28046
28047static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28048 SelectionDAG &DAG) {
28049 MVT VT = Op.getSimpleValueType();
28050 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28051 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28052 // 8-bit integer abs to NEG and CMOV.
28053 SDLoc DL(Op);
28054 SDValue N0 = Op.getOperand(0);
28055 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28056 DAG.getConstant(0, DL, VT), N0);
28057 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
28058 SDValue(Neg.getNode(), 1)};
28059 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28060 }
28061
28062 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28063 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28064 SDLoc DL(Op);
28065 SDValue Src = Op.getOperand(0);
28066 SDValue Sub =
28067 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
28068 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
28069 }
28070
28071 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28072 assert(VT.isInteger() &&
28073 "Only handle AVX 256-bit vector integer operation");
28074 return splitVectorIntUnary(Op, DAG);
28075 }
28076
28077 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28078 return splitVectorIntUnary(Op, DAG);
28079
28080 // Default to expand.
28081 return SDValue();
28082}
28083
28084static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
28085 MVT VT = Op.getSimpleValueType();
28086
28087 // For AVX1 cases, split to use legal ops (everything but v4i64).
28088 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
28089 return splitVectorIntBinary(Op, DAG);
28090
28091 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28092 return splitVectorIntBinary(Op, DAG);
28093
28094 // Default to expand.
28095 return SDValue();
28096}
28097
28098static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28099 SelectionDAG &DAG) {
28100 SDLoc dl(Op);
28101 MVT VT = Op.getSimpleValueType();
28102
28103 // Decompose 256-bit ops into 128-bit ops.
28104 if (VT.is256BitVector() && !Subtarget.hasInt256())
28105 return splitVectorIntBinary(Op, DAG);
28106
28107 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28108 return splitVectorIntBinary(Op, DAG);
28109
28110 SDValue A = Op.getOperand(0);
28111 SDValue B = Op.getOperand(1);
28112
28113 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28114 // vector pairs, multiply and truncate.
28115 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28116 unsigned NumElts = VT.getVectorNumElements();
28117
28118 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28119 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28120 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28121 return DAG.getNode(
28122 ISD::TRUNCATE, dl, VT,
28123 DAG.getNode(ISD::MUL, dl, ExVT,
28124 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28125 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28126 }
28127
28128 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28129
28130 // Extract the lo/hi parts and any-extend them to i16.
28131 // We're going to mask the pmullw results down to the low byte of each
28132 // 16-bit element, so it doesn't matter what's in the high byte of each
28133 // extended element.
28134 SDValue Undef = DAG.getUNDEF(VT);
28135 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28136 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28137
28138 SDValue BLo, BHi;
28139 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28140 // If the RHS is a constant, manually unpackl/unpackh.
28141 SmallVector<SDValue, 16> LoOps, HiOps;
28142 for (unsigned i = 0; i != NumElts; i += 16) {
28143 for (unsigned j = 0; j != 8; ++j) {
28144 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28145 MVT::i16));
28146 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28147 MVT::i16));
28148 }
28149 }
28150
28151 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28152 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28153 } else {
28154 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28155 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28156 }
28157
28158 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28159 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28160 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28161 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
28162 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
28163 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28164 }
28165
28166 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28167 if (VT == MVT::v4i32) {
28168 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28169 "Should not custom lower when pmulld is available!");
28170
28171 // Extract the odd parts.
28172 static const int UnpackMask[] = { 1, -1, 3, -1 };
28173 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28174 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28175
28176 // Multiply the even parts.
28177 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28178 DAG.getBitcast(MVT::v2i64, A),
28179 DAG.getBitcast(MVT::v2i64, B));
28180 // Now multiply odd parts.
28181 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28182 DAG.getBitcast(MVT::v2i64, Aodds),
28183 DAG.getBitcast(MVT::v2i64, Bodds));
28184
28185 Evens = DAG.getBitcast(VT, Evens);
28186 Odds = DAG.getBitcast(VT, Odds);
28187
28188 // Merge the two vectors back together with a shuffle. This expands into 2
28189 // shuffles.
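// E.g. for A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3>:
//   Evens = <lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2)>
//   Odds  = <lo(a1*b1), hi(a1*b1), lo(a3*b3), hi(a3*b3)>
// and the <0,4,2,6> mask below picks out the four low halves, i.e. the i32
// products <a0*b0, a1*b1, a2*b2, a3*b3>.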
28190 static const int ShufMask[] = { 0, 4, 2, 6 };
28191 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28192 }
28193
28194 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28195 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28196 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28197
28198 // Ahi = psrlqi(a, 32);
28199 // Bhi = psrlqi(b, 32);
28200 //
28201 // AloBlo = pmuludq(a, b);
28202 // AloBhi = pmuludq(a, Bhi);
28203 // AhiBlo = pmuludq(Ahi, b);
28204 //
28205 // Hi = psllqi(AloBhi + AhiBlo, 32);
28206 // return AloBlo + Hi;
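// Writing A = 2^32 * Ahi + Alo and B = 2^32 * Bhi + Blo gives
//   A * B = Alo*Blo + 2^32 * (Alo*Bhi + Ahi*Blo) + 2^64 * Ahi*Bhi,
// and the last term vanishes modulo 2^64, which is why Ahi*Bhi is never
// computed.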
28207 KnownBits AKnown = DAG.computeKnownBits(A);
28208 KnownBits BKnown = DAG.computeKnownBits(B);
28209
28210 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28211 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28212 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28213
28214 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28215 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28216 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28217
28218 SDValue Zero = DAG.getConstant(0, dl, VT);
28219
28220 // Only multiply lo/hi halves that aren't known to be zero.
28221 SDValue AloBlo = Zero;
28222 if (!ALoIsZero && !BLoIsZero)
28223 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28224
28225 SDValue AloBhi = Zero;
28226 if (!ALoIsZero && !BHiIsZero) {
28227 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28228 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28229 }
28230
28231 SDValue AhiBlo = Zero;
28232 if (!AHiIsZero && !BLoIsZero) {
28233 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28234 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28235 }
28236
28237 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28238 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28239
28240 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28241}
28242
28243static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28244 MVT VT, bool IsSigned,
28245 const X86Subtarget &Subtarget,
28246 SelectionDAG &DAG,
28247 SDValue *Low = nullptr) {
28248 unsigned NumElts = VT.getVectorNumElements();
28249
28250 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28251 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28252 // lane results back together.
28253
28254 // We'll take different approaches for signed and unsigned.
28255 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28256 // and use pmullw to calculate the full 16-bit product.
28257 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28258 // shift them left into the upper byte of each word. This allows us to use
28259 // pmulhw to calculate the full 16-bit product. This trick means we don't
28260 // need to sign extend the bytes to use pmullw.
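// Concretely, (x * 2^8) * (y * 2^8) == (x * y) * 2^16, so pmulhw on the
// byte-in-high-byte words returns exactly the signed 16-bit product x * y.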
28261
28262 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28263 SDValue Zero = DAG.getConstant(0, dl, VT);
28264
28265 SDValue ALo, AHi;
28266 if (IsSigned) {
28267 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28268 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28269 } else {
28270 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28271 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28272 }
28273
28274 SDValue BLo, BHi;
28275 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28276 // If the RHS is a constant, manually unpackl/unpackh and extend.
28277 SmallVector<SDValue, 16> LoOps, HiOps;
28278 for (unsigned i = 0; i != NumElts; i += 16) {
28279 for (unsigned j = 0; j != 8; ++j) {
28280 SDValue LoOp = B.getOperand(i + j);
28281 SDValue HiOp = B.getOperand(i + j + 8);
28282
28283 if (IsSigned) {
28284 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28285 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28286 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28287 DAG.getConstant(8, dl, MVT::i16));
28288 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28289 DAG.getConstant(8, dl, MVT::i16));
28290 } else {
28291 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28292 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28293 }
28294
28295 LoOps.push_back(LoOp);
28296 HiOps.push_back(HiOp);
28297 }
28298 }
28299
28300 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28301 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28302 } else if (IsSigned) {
28303 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28304 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28305 } else {
28306 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28307 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28308 }
28309
28310 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28311 // pack back to vXi8.
28312 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28313 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28314 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28315
28316 if (Low) {
28317 // Mask the lower bits and pack the results to rejoin the halves.
28318 SDValue Mask = DAG.getConstant(255, dl, ExVT);
28319 SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
28320 SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
28321 *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
28322 }
28323
28324 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
28325 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
28326
28327 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
28328 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28329}
28330
28331static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28332 SelectionDAG &DAG) {
28333 SDLoc dl(Op);
28334 MVT VT = Op.getSimpleValueType();
28335 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28336 unsigned NumElts = VT.getVectorNumElements();
28337 SDValue A = Op.getOperand(0);
28338 SDValue B = Op.getOperand(1);
28339
28340 // Decompose 256-bit ops into 128-bit ops.
28341 if (VT.is256BitVector() && !Subtarget.hasInt256())
28342 return splitVectorIntBinary(Op, DAG);
28343
28344 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28345 return splitVectorIntBinary(Op, DAG);
28346
28347 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28348 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28349 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28350 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28351
28352 // PMULxD operations multiply each even value (starting at 0) of LHS with
28353 // the corresponding value of RHS and produce a widened result.
28354 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28355 // => <2 x i64> <ae|cg>
28356 //
28357 // In other words, to have all the results, we need to perform two PMULxD:
28358 // 1. one with the even values.
28359 // 2. one with the odd values.
28360 // To achieve #2, we need to place the odd values at an even position.
28361 //
28362 // Place the odd value at an even position (basically, shift all values 1
28363 // step to the left):
28364 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28365 9, -1, 11, -1, 13, -1, 15, -1};
28366 // <a|b|c|d> => <b|undef|d|undef>
28367 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
28368 makeArrayRef(&Mask[0], NumElts));
28369 // <e|f|g|h> => <f|undef|h|undef>
28370 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
28371 makeArrayRef(&Mask[0], NumElts));
28372
28373 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28374 // ints.
28375 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28376 unsigned Opcode =
28377 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28378 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28379 // => <2 x i64> <ae|cg>
28380 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28381 DAG.getBitcast(MulVT, A),
28382 DAG.getBitcast(MulVT, B)));
28383 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28384 // => <2 x i64> <bf|dh>
28385 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28386 DAG.getBitcast(MulVT, Odd0),
28387 DAG.getBitcast(MulVT, Odd1)));
28388
28389 // Shuffle it back into the right order.
28390 SmallVector<int, 16> ShufMask(NumElts);
28391 for (int i = 0; i != (int)NumElts; ++i)
28392 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
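// E.g. for v4i32 this produces the mask <1,5,3,7>, which picks the high
// halves <hi(a0*b0), hi(a1*b1), hi(a2*b2), hi(a3*b3)> from Mul1/Mul2.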
28393
28394 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28395
28396 // If we have a signed multiply but no PMULDQ, fix up the result of an
28397 // unsigned multiply.
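// This uses the identity
//   mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
// taken modulo 2^32 per element.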
28398 if (IsSigned && !Subtarget.hasSSE41()) {
28399 SDValue Zero = DAG.getConstant(0, dl, VT);
28400 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28401 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28402 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28403 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28404
28405 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28406 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28407 }
28408
28409 return Res;
28410 }
28411
28412 // Only i8 vectors should need custom lowering after this.
28413 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28414 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28415 "Unsupported vector type");
28416
28417 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28418 // logical shift down the upper half and pack back to i8.
28419
28420 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28421 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28422
28423 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28424 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28425 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28426 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28427 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28428 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28429 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28430 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28431 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28432 }
28433
28434 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28435}
28436
28437// Custom lowering for SMULO/UMULO.
28438static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28439 SelectionDAG &DAG) {
28440 MVT VT = Op.getSimpleValueType();
28441
28442 // Scalars defer to LowerXALUO.
28443 if (!VT.isVector())
28444 return LowerXALUO(Op, DAG);
28445
28446 SDLoc dl(Op);
28447 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28448 SDValue A = Op.getOperand(0);
28449 SDValue B = Op.getOperand(1);
28450 EVT OvfVT = Op->getValueType(1);
28451
28452 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28453 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28454 // Extract the LHS Lo/Hi vectors
28455 SDValue LHSLo, LHSHi;
28456 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28457
28458 // Extract the RHS Lo/Hi vectors
28459 SDValue RHSLo, RHSHi;
28460 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28461
28462 EVT LoOvfVT, HiOvfVT;
28463 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28464 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28465 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28466
28467 // Issue the split operations.
28468 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28469 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28470
28471 // Join the separate data results and the overflow results.
28472 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28473 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28474 Hi.getValue(1));
28475
28476 return DAG.getMergeValues({Res, Ovf}, dl);
28477 }
28478
28479 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28480 EVT SetccVT =
28481 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28482
28483 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28484 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28485 unsigned NumElts = VT.getVectorNumElements();
28486 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28487 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28488 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28489 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28490 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28491
28492 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28493
28494 SDValue Ovf;
28495 if (IsSigned) {
28496 SDValue High, LowSign;
28497 if (OvfVT.getVectorElementType() == MVT::i1 &&
28498 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28499 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28500 // Shift the high down filling with sign bits.
28501 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28502 // Fill all 16 bits with the sign bit from the low.
28503 LowSign =
28504 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28505 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28506 15, DAG);
28507 SetccVT = OvfVT;
28508 if (!Subtarget.hasBWI()) {
28509 // We can't do a vXi16 compare so sign extend to v16i32.
28510 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28511 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28512 }
28513 } else {
28514 // Otherwise do the compare at vXi8.
28515 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28516 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28517 LowSign =
28518 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28519 }
28520
28521 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28522 } else {
28523 SDValue High =
28524 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28525 if (OvfVT.getVectorElementType() == MVT::i1 &&
28526 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28527 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28528 SetccVT = OvfVT;
28529 if (!Subtarget.hasBWI()) {
28530 // We can't do a vXi16 compare so sign extend to v16i32.
28531 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28532 }
28533 } else {
28534 // Otherwise do the compare at vXi8.
28535 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28536 }
28537
28538 Ovf =
28539 DAG.getSetCC(dl, SetccVT, High,
28540 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28541 }
28542
28543 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28544
28545 return DAG.getMergeValues({Low, Ovf}, dl);
28546 }
28547
28548 SDValue Low;
28549 SDValue High =
28550 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28551
28552 SDValue Ovf;
28553 if (IsSigned) {
28554 // SMULO overflows if the high bits don't match the sign of the low.
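// e.g. for i8 lanes: 100 * 2 = 200, so Low = 0xC8 (-56) and High = 0x00;
// the sign splat of Low is 0xFF != 0x00, flagging the overflow.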
28555 SDValue LowSign =
28556 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28557 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28558 } else {
28559 // UMULO overflows if the high bits are non-zero.
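// e.g. for i8 lanes: 200 * 2 = 400 = 0x0190 has High = 0x01 != 0 -> overflow,
// whereas 100 * 2 = 200 = 0x00C8 has High = 0 and does not overflow.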
28560 Ovf =
28561 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28562 }
28563
28564 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28565
28566 return DAG.getMergeValues({Low, Ovf}, dl);
28567}
28568
28569SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28570 assert(Subtarget.isTargetWin64() && "Unexpected target")(static_cast<void> (0));
28571 EVT VT = Op.getValueType();
28572 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&(static_cast<void> (0))
28573 "Unexpected return type for lowering")(static_cast<void> (0));
28574
28575 RTLIB::Libcall LC;
28576 bool isSigned;
28577 switch (Op->getOpcode()) {
28578 default: llvm_unreachable("Unexpected request for libcall!")__builtin_unreachable();
28579 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28580 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28581 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28582 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28583 }
28584
28585 SDLoc dl(Op);
28586 SDValue InChain = DAG.getEntryNode();
28587
28588 TargetLowering::ArgListTy Args;
28589 TargetLowering::ArgListEntry Entry;
28590 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28591 EVT ArgVT = Op->getOperand(i).getValueType();
28592 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&(static_cast<void> (0))
28593 "Unexpected argument type for lowering")(static_cast<void> (0));
28594 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28595 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28596 MachinePointerInfo MPI =
28597 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28598 Entry.Node = StackPtr;
28599 InChain =
28600 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28601 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28602 Entry.Ty = PointerType::get(ArgTy,0);
28603 Entry.IsSExt = false;
28604 Entry.IsZExt = false;
28605 Args.push_back(Entry);
28606 }
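// At this point each 128-bit operand has been spilled to its own 16-byte
// aligned stack slot and is passed to the libcall indirectly, by pointer.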
28607
28608 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28609 getPointerTy(DAG.getDataLayout()));
28610
28611 TargetLowering::CallLoweringInfo CLI(DAG);
28612 CLI.setDebugLoc(dl)
28613 .setChain(InChain)
28614 .setLibCallee(
28615 getLibcallCallingConv(LC),
28616 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28617 std::move(Args))
28618 .setInRegister()
28619 .setSExtResult(isSigned)
28620 .setZExtResult(!isSigned);
28621
28622 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28623 return DAG.getBitcast(VT, CallInfo.first);
28624}
28625
28626// Return true if the required (according to Opcode) shift-imm form is natively
28627// supported by the Subtarget
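// For example, SSE2 has logical shifts for i16/i32/i64 elements but arithmetic
// right shifts only for i16/i32 (there is no 64-bit PSRAQ until AVX-512),
// which the AShift check below reflects.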
28628static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
28629 unsigned Opcode) {
28630 if (VT.getScalarSizeInBits() < 16)
28631 return false;
28632
28633 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
28634 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28635 return true;
28636
28637 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28638 (VT.is256BitVector() && Subtarget.hasInt256());
28639
28640 bool AShift = LShift && (Subtarget.hasAVX512() ||
28641 (VT != MVT::v2i64 && VT != MVT::v4i64));
28642 return (Opcode == ISD::SRA) ? AShift : LShift;
28643}
28644
28645// The shift amount is a variable, but it is the same for all vector lanes.
28646// These instructions are defined together with shift-immediate.
28647static
28648bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
28649 unsigned Opcode) {
28650 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
28651}
28652
28653// Return true if the required (according to Opcode) variable-shift form is
28654// natively supported by the Subtarget
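// For example, AVX2 introduced the per-element VPSLLV/VPSRLV/VPSRAV family for
// 32/64-bit elements (with no 64-bit arithmetic form until AVX-512), and the
// 16-bit variants require AVX-512 BWI, which the checks below mirror.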
28655static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
28656 unsigned Opcode) {
28657
28658 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28659 return false;
28660
28661 // vXi16 supported only on AVX-512, BWI
28662 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28663 return false;
28664
28665 if (Subtarget.hasAVX512())
28666 return true;
28667
28668 bool LShift = VT.is128BitVector() || VT.is256BitVector();
28669 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28670 return (Opcode == ISD::SRA) ? AShift : LShift;
28671}
28672
28673static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
28674 const X86Subtarget &Subtarget) {
28675 MVT VT = Op.getSimpleValueType();
28676 SDLoc dl(Op);
28677 SDValue R = Op.getOperand(0);
28678 SDValue Amt = Op.getOperand(1);
28679 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28680
28681 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28682 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type")(static_cast<void> (0));
28683 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28684 SDValue Ex = DAG.getBitcast(ExVT, R);
28685
28686 // ashr(R, 63) === cmp_slt(R, 0)
28687 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28688 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&(static_cast<void> (0))
28689 "Unsupported PCMPGT op")(static_cast<void> (0));
28690 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28691 }
28692
28693 if (ShiftAmt >= 32) {
28694 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28695 SDValue Upper =
28696 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28697 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28698 ShiftAmt - 32, DAG);
28699 if (VT == MVT::v2i64)
28700 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28701 if (VT == MVT::v4i64)
28702 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28703 {9, 1, 11, 3, 13, 5, 15, 7});
28704 } else {
28705 // SRA upper i32, SRL whole i64 and select lower i32.
28706 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28707 ShiftAmt, DAG);
28708 SDValue Lower =
28709 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28710 Lower = DAG.getBitcast(ExVT, Lower);
28711 if (VT == MVT::v2i64)
28712 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28713 if (VT == MVT::v4i64)
28714 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28715 {8, 1, 10, 3, 12, 5, 14, 7});
28716 }
28717 return DAG.getBitcast(VT, Ex);
28718 };
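// As a rough illustration of the lambda above: an arithmetic shift of a v2i64
// lane by 40 takes the ShiftAmt >= 32 path, where the upper i32 is shifted
// arithmetically by 8 to form the new lower i32 and by 31 to splat the sign
// into the new upper i32.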
28719
28720 // Optimize shl/srl/sra with constant shift amount.
28721 APInt APIntShiftAmt;
28722 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28723 return SDValue();
28724
28725 // If the shift amount is out of range, return undef.
28726 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28727 return DAG.getUNDEF(VT);
28728
28729 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28730
28731 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28732 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28733
28734 // i64 SRA needs to be performed as partial shifts.
28735 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28736 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28737 Op.getOpcode() == ISD::SRA)
28738 return ArithmeticShiftRight64(ShiftAmt);
28739
28740 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28741 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28742 unsigned NumElts = VT.getVectorNumElements();
28743 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28744
28745 // Simple i8 add case
28746 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
28747 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
28748 // must be 0). (add undef, undef) however can be any value. To make this
28749 // safe, we must freeze R to ensure that register allocation uses the same
28750 // register for an undefined value. This ensures that the result will
28751 // still be even and preserves the original semantics.
28752 R = DAG.getNode(ISD::FREEZE, dl, VT, R);
28753 return DAG.getNode(ISD::ADD, dl, VT, R, R);
28754 }
28755
28756 // ashr(R, 7) === cmp_slt(R, 0)
28757 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28758 SDValue Zeros = DAG.getConstant(0, dl, VT);
28759 if (VT.is512BitVector()) {
28760 assert(VT == MVT::v64i8 && "Unexpected element type!")(static_cast<void> (0));
28761 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28762 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28763 }
28764 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28765 }
28766
28767 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28768 if (VT == MVT::v16i8 && Subtarget.hasXOP())
28769 return SDValue();
28770
28771 if (Op.getOpcode() == ISD::SHL) {
28772 // Make a large shift.
28773 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28774 ShiftAmt, DAG);
28775 SHL = DAG.getBitcast(VT, SHL);
28776 // Zero out the rightmost bits.
28777 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28778 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28779 }
28780 if (Op.getOpcode() == ISD::SRL) {
28781 // Make a large shift.
28782 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28783 ShiftAmt, DAG);
28784 SRL = DAG.getBitcast(VT, SRL);
28785 // Zero out the leftmost bits.
28786 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28787 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28788 }
28789 if (Op.getOpcode() == ISD::SRA) {
28790 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
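// e.g. for R = 0xF0 (-16) and ShiftAmt = 4: lshr gives 0x0F, Mask is
// 128 >> 4 = 0x08, xor gives 0x07 and sub gives 0xFF (-1) == ashr(-16, 4).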
28791 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28792
28793 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28794 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28795 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28796 return Res;
28797 }
28798 llvm_unreachable("Unknown shift opcode.")__builtin_unreachable();
28799 }
28800
28801 return SDValue();
28802}
28803
28804static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28805 const X86Subtarget &Subtarget) {
28806 MVT VT = Op.getSimpleValueType();
28807 SDLoc dl(Op);
28808 SDValue R = Op.getOperand(0);
28809 SDValue Amt = Op.getOperand(1);
28810 unsigned Opcode = Op.getOpcode();
28811 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28812 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28813
28814 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28815 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28816 MVT EltVT = VT.getVectorElementType();
28817 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!")(static_cast<void> (0));
28818 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28819 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28820 else if (EltVT.bitsLT(MVT::i32))
28821 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28822
28823 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28824 }
28825
28826 // vXi8 shifts - shift as v8i16 + mask result.
28827 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28828 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28829 VT == MVT::v64i8) &&
28830 !Subtarget.hasXOP()) {
28831 unsigned NumElts = VT.getVectorNumElements();
28832 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28833 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28834 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28835 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28836 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28837
28838 // Create the mask using vXi16 shifts. For shift-rights we need to move
28839 // the upper byte down before splatting the vXi8 mask.
28840 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28841 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28842 BaseShAmt, Subtarget, DAG);
28843 if (Opcode != ISD::SHL)
28844 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28845 8, DAG);
28846 BitMask = DAG.getBitcast(VT, BitMask);
28847 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28848 SmallVector<int, 64>(NumElts, 0));
28849
28850 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28851 DAG.getBitcast(ExtVT, R), BaseShAmt,
28852 Subtarget, DAG);
28853 Res = DAG.getBitcast(VT, Res);
28854 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28855
28856 if (Opcode == ISD::SRA) {
28857 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28858 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
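// e.g. for a byte shift amount of 3: 0x8080 >> 3 = 0x1010, i.e. the original
// sign bit of each byte moved to where lshr(R, 3) leaves it.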
28859 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28860 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28861 BaseShAmt, Subtarget, DAG);
28862 SignMask = DAG.getBitcast(VT, SignMask);
28863 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28864 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28865 }
28866 return Res;
28867 }
28868 }
28869 }
28870
28871 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
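// e.g. on 32-bit targets a splatted v2i64 amount is typically built as a
// v4i32 BUILD_VECTOR <lo, hi, lo, hi>; Ratio is then 2 and the loop below
// checks that the <lo, hi> pair repeats across the whole vector.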
28872 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28873 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28874 Amt = Amt.getOperand(0);
28875 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28876 std::vector<SDValue> Vals(Ratio);
28877 for (unsigned i = 0; i != Ratio; ++i)
28878 Vals[i] = Amt.getOperand(i);
28879 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28880 for (unsigned j = 0; j != Ratio; ++j)
28881 if (Vals[j] != Amt.getOperand(i + j))
28882 return SDValue();
28883 }
28884
28885 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28886 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28887 }
28888 return SDValue();
28889}
28890
28891// Convert a shift/rotate left amount to a multiplication scale factor.
28892static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28893 const X86Subtarget &Subtarget,
28894 SelectionDAG &DAG) {
28895 MVT VT = Amt.getSimpleValueType();
28896 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28897 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28898 (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28899 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28900 return SDValue();
28901
28902 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28903 SmallVector<SDValue, 8> Elts;
28904 MVT SVT = VT.getVectorElementType();
28905 unsigned SVTBits = SVT.getSizeInBits();
28906 APInt One(SVTBits, 1);
28907 unsigned NumElems = VT.getVectorNumElements();
28908
28909 for (unsigned i = 0; i != NumElems; ++i) {
28910 SDValue Op = Amt->getOperand(i);
28911 if (Op->isUndef()) {
28912 Elts.push_back(Op);
28913 continue;
28914 }
28915
28916 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28917 APInt C(SVTBits, ND->getZExtValue());
28918 uint64_t ShAmt = C.getZExtValue();
28919 if (ShAmt >= SVTBits) {
28920 Elts.push_back(DAG.getUNDEF(SVT));
28921 continue;
28922 }
28923 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28924 }
28925 return DAG.getBuildVector(VT, dl, Elts);
28926 }
28927
28928 // If the target doesn't support variable shifts, use either FP conversion
28929 // or integer multiplication to avoid shifting each element individually.
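// For the v4i32 path below, e.g. Amt = 3: (3 << 23) + 0x3f800000 = 0x41000000,
// which is the float 8.0f, and FP_TO_SINT then yields 8 == 1 << 3.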
28930 if (VT == MVT::v4i32) {
28931 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28932 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28933 DAG.getConstant(0x3f800000U, dl, VT));
28934 Amt = DAG.getBitcast(MVT::v4f32, Amt);
28935 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28936 }
28937
28938 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28939 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28940 SDValue Z = DAG.getConstant(0, dl, VT);
28941 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28942 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28943 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28944 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28945 if (Subtarget.hasSSE41())
28946 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28947
28948 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28949 DAG.getBitcast(VT, Hi),
28950 {0, 2, 4, 6, 8, 10, 12, 14});
28951 }
28952
28953 return SDValue();
28954}
28955
28956static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28957 SelectionDAG &DAG) {
28958 MVT VT = Op.getSimpleValueType();
28959 SDLoc dl(Op);
28960 SDValue R = Op.getOperand(0);
28961 SDValue Amt = Op.getOperand(1);
28962 unsigned EltSizeInBits = VT.getScalarSizeInBits();
28963 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28964
28965 unsigned Opc = Op.getOpcode();
28966 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28967 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28968
28969 assert(VT.isVector() && "Custom lowering only for vector shifts!")(static_cast<void> (0));
28970 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!")(static_cast<void> (0));
28971
28972 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28973 return V;
28974
28975 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28976 return V;
28977
28978 if (SupportedVectorVarShift(VT, Subtarget, Opc))
28979 return Op;
28980
28981 // XOP has 128-bit variable logical/arithmetic shifts.
28982 // +ve/-ve Amt = shift left/right.
28983 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
28984 VT == MVT::v8i16 || VT == MVT::v16i8)) {
28985 if (Opc == ISD::SRL || Opc == ISD::SRA) {
28986 SDValue Zero = DAG.getConstant(0, dl, VT);
28987 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
28988 }
28989 if (Opc == ISD::SHL || Opc == ISD::SRL)
28990 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
28991 if (Opc == ISD::SRA)
28992 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
28993 }
28994
28995 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
28996 // shifts per-lane and then shuffle the partial results back together.
28997 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
28998 // Splat the shift amounts so the scalar shifts above will catch it.
28999 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29000 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29001 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29002 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29003 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29004 }
29005
29006 // i64 vector arithmetic shift can be emulated with the transform:
29007 // M = lshr(SIGN_MASK, Amt)
29008 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29009 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29010 Opc == ISD::SRA) {
29011 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29012 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29013 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29014 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29015 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29016 return R;
29017 }
29018
29019 // If possible, lower this shift as a sequence of two shifts by
29020 // constant plus a BLENDing shuffle instead of scalarizing it.
29021 // Example:
29022 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29023 //
29024 // Could be rewritten as:
29025 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29026 //
29027 // The advantage is that the two shifts from the example would be
29028 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29029 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29030 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29031 SDValue Amt1, Amt2;
29032 unsigned NumElts = VT.getVectorNumElements();
29033 SmallVector<int, 8> ShuffleMask;
29034 for (unsigned i = 0; i != NumElts; ++i) {
29035 SDValue A = Amt->getOperand(i);
29036 if (A.isUndef()) {
29037 ShuffleMask.push_back(SM_SentinelUndef);
29038 continue;
29039 }
29040 if (!Amt1 || Amt1 == A) {
29041 ShuffleMask.push_back(i);
29042 Amt1 = A;
29043 continue;
29044 }
29045 if (!Amt2 || Amt2 == A) {
29046 ShuffleMask.push_back(i + NumElts);
29047 Amt2 = A;
29048 continue;
29049 }
29050 break;
29051 }
29052
29053 // Only perform this blend if we can perform it without loading a mask.
29054 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29055 (VT != MVT::v16i16 ||
29056 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29057 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29058 canWidenShuffleElements(ShuffleMask))) {
29059 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29060 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29061 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29062 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29063 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29064 Cst1->getZExtValue(), DAG);
29065 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29066 Cst2->getZExtValue(), DAG);
29067 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29068 }
29069 }
29070 }
29071
29072 // If possible, lower this packed shift into a vector multiply instead of
29073 // expanding it into a sequence of scalar shifts.
29074 if (Opc == ISD::SHL)
29075 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29076 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29077
29078 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29079 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
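// e.g. for i16 lanes and Amt = 3: Scale = 1 << (16 - 3) = 0x2000, and
// mulhu(x, 0x2000) == (x * 2^13) >> 16 == x >> 3; the Amt == 0 lanes are
// handled by the select below since their scale (1 << 16) does not fit.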
29080 if (Opc == ISD::SRL && ConstantAmt &&
29081 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29082 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29083 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29084 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29085 SDValue Zero = DAG.getConstant(0, dl, VT);
29086 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29087 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29088 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29089 }
29090 }
29091
29092 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29093 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29094 // TODO: Special case handling for shift by 0/1, really we can afford either
29095 // of these cases in pre-SSE41/XOP/AVX512 but not both.
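// The MULHS identity mirrors the MULHU one above; the selects below fix up
// the Amt == 0 and Amt == 1 lanes, whose scales (1 << 16 and 1 << 15) do not
// behave as positive i16 multipliers.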
29096 if (Opc == ISD::SRA && ConstantAmt &&
29097 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29098 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29099 !Subtarget.hasAVX512()) ||
29100 DAG.isKnownNeverZero(Amt))) {
29101 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29102 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29103 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29104 SDValue Amt0 =
29105 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29106 SDValue Amt1 =
29107 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29108 SDValue Sra1 =
29109 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29110 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29111 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29112 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29113 }
29114 }
29115
29116 // v4i32 Non Uniform Shifts.
29117 // If the shift amount is constant we can shift each lane using the SSE2
29118 // immediate shifts, else we need to zero-extend each lane to the lower i64
29119 // and shift using the SSE2 variable shifts.
29120 // The separate results can then be blended together.
29121 if (VT == MVT::v4i32) {
29122 SDValue Amt0, Amt1, Amt2, Amt3;
29123 if (ConstantAmt) {
29124 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29125 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29126 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29127 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29128 } else {
29129 // The SSE2 shifts use the lower i64 as the same shift amount for
29130 // all lanes and the upper i64 is ignored. On AVX we're better off
29131 // just zero-extending, but for SSE just duplicating the top 16-bits is
29132 // cheaper and has the same effect for out of range values.
29133 if (Subtarget.hasAVX()) {
29134 SDValue Z = DAG.getConstant(0, dl, VT);
29135 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29136 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29137 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29138 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29139 } else {
29140 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29141 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29142 {4, 5, 6, 7, -1, -1, -1, -1});
29143 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29144 {0, 1, 1, 1, -1, -1, -1, -1});
29145 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29146 {2, 3, 3, 3, -1, -1, -1, -1});
29147 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29148 {0, 1, 1, 1, -1, -1, -1, -1});
29149 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29150 {2, 3, 3, 3, -1, -1, -1, -1});
29151 }
29152 }
29153
29154 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29155 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29156 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29157 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29158 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29159
29160 // Merge the shifted lane results optimally with/without PBLENDW.
29161 // TODO - ideally shuffle combining would handle this.
29162 if (Subtarget.hasSSE41()) {
29163 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29164 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29165 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29166 }
29167 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29168 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29169 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29170 }
29171
29172 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29173 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29174 // make the existing SSE solution better.
29175 // NOTE: We honor the preferred vector width before promoting to 512-bits.
29176 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29177 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29178 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29179 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29180 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29181 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&(static_cast<void> (0))
29182 "Unexpected vector type")(static_cast<void> (0));
29183 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29184 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29185 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29186 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29187 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29188 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29189 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29190 }
29191
29192 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29193 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29194 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29195 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29196 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29197 !Subtarget.hasXOP()) {
29198 int NumElts = VT.getVectorNumElements();
29199 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29200
29201 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29202 // isn't legal).
29203 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29204 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29205 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29206 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29207 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&(static_cast<void> (0))
29208 "Constant build vector expected")(static_cast<void> (0));
29209
29210 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29211 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
29212 : DAG.getZExtOrTrunc(R, dl, ExVT);
29213 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29214 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29215 return DAG.getZExtOrTrunc(R, dl, VT);
29216 }
29217
29218 SmallVector<SDValue, 16> LoAmt, HiAmt;
29219 for (int i = 0; i != NumElts; i += 16) {
29220 for (int j = 0; j != 8; ++j) {
29221 LoAmt.push_back(Amt.getOperand(i + j));
29222 HiAmt.push_back(Amt.getOperand(i + j + 8));
29223 }
29224 }
29225
29226 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29227 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29228 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29229
29230 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29231 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29232 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29233 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29234 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29235 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29236 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29237 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29238 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29239 }
29240
29241 if (VT == MVT::v16i8 ||
29242 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29243 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29244 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29245
29246 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29247 if (VT.is512BitVector()) {
29248 // On AVX512BW targets we make use of the fact that VSELECT lowers
29249 // to a masked blend which selects bytes based just on the sign bit
29250 // extracted to a mask.
29251 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29252 V0 = DAG.getBitcast(VT, V0);
29253 V1 = DAG.getBitcast(VT, V1);
29254 Sel = DAG.getBitcast(VT, Sel);
29255 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29256 ISD::SETGT);
29257 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29258 } else if (Subtarget.hasSSE41()) {
29259 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29260 // on the sign bit.
29261 V0 = DAG.getBitcast(VT, V0);
29262 V1 = DAG.getBitcast(VT, V1);
29263 Sel = DAG.getBitcast(VT, Sel);
29264 return DAG.getBitcast(SelVT,
29265 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29266 }
29267 // On pre-SSE41 targets we test for the sign bit by comparing to
29268 // zero - a negative value will set all bits of the lanes to true
29269 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29270 SDValue Z = DAG.getConstant(0, dl, SelVT);
29271 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29272 return DAG.getSelect(dl, SelVT, C, V0, V1);
29273 };
29274
29275 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29276 // We can safely do this using i16 shifts as we're only interested in
29277 // the 3 lower bits of each byte.
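// e.g. an amount byte of 5 (0b101) becomes 0b10100000: bit 7 now holds the
// "shift by 4" bit, and each a += a below moves the next lower amount bit
// into the sign position for the "shift by 2" and "shift by 1" steps.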
29278 Amt = DAG.getBitcast(ExtVT, Amt);
29279 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29280 Amt = DAG.getBitcast(VT, Amt);
29281
29282 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29283 // r = VSELECT(r, shift(r, 4), a);
29284 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29285 R = SignBitSelect(VT, Amt, M, R);
29286
29287 // a += a
29288 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29289
29290 // r = VSELECT(r, shift(r, 2), a);
29291 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29292 R = SignBitSelect(VT, Amt, M, R);
29293
29294 // a += a
29295 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29296
29297 // return VSELECT(r, shift(r, 1), a);
29298 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29299 R = SignBitSelect(VT, Amt, M, R);
29300 return R;
29301 }
29302
29303 if (Opc == ISD::SRA) {
29304 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29305 // so we can correctly sign extend. We don't care what happens to the
29306 // lower byte.
29307 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29308 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29309 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29310 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29311 ALo = DAG.getBitcast(ExtVT, ALo);
29312 AHi = DAG.getBitcast(ExtVT, AHi);
29313 RLo = DAG.getBitcast(ExtVT, RLo);
29314 RHi = DAG.getBitcast(ExtVT, RHi);
29315
29316 // r = VSELECT(r, shift(r, 4), a);
29317 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29318 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29319 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29320 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29321
29322 // a += a
29323 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29324 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29325
29326 // r = VSELECT(r, shift(r, 2), a);
29327 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29328 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29329 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29330 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29331
29332 // a += a
29333 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29334 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29335
29336 // r = VSELECT(r, shift(r, 1), a);
29337 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29338 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29339 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29340 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29341
29342 // Logical shift the result back to the lower byte, leaving a zero upper
29343 // byte meaning that we can safely pack with PACKUSWB.
29344 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29345 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29346 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29347 }
29348 }
29349
29350 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29351 MVT ExtVT = MVT::v8i32;
29352 SDValue Z = DAG.getConstant(0, dl, VT);
29353 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29354 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29355 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29356 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29357 ALo = DAG.getBitcast(ExtVT, ALo);
29358 AHi = DAG.getBitcast(ExtVT, AHi);
29359 RLo = DAG.getBitcast(ExtVT, RLo);
29360 RHi = DAG.getBitcast(ExtVT, RHi);
29361 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29362 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29363 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29364 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29365 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29366 }
29367
29368 if (VT == MVT::v8i16) {
29369 // If we have a constant shift amount, the non-SSE41 path is best as
29370 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29371 bool UseSSE41 = Subtarget.hasSSE41() &&
29372 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29373
29374 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29375 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29376 // the sign bit.
29377 if (UseSSE41) {
29378 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29379 V0 = DAG.getBitcast(ExtVT, V0);
29380 V1 = DAG.getBitcast(ExtVT, V1);
29381 Sel = DAG.getBitcast(ExtVT, Sel);
29382 return DAG.getBitcast(
29383 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29384 }
29385 // On pre-SSE41 targets we splat the sign bit - a negative value will
29386 // set all bits of the lanes to true and VSELECT uses that in
29387 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29388 SDValue C =
29389 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29390 return DAG.getSelect(dl, VT, C, V0, V1);
29391 };
29392
29393 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29394 if (UseSSE41) {
29395 // On SSE41 targets we need to replicate the shift mask in both
29396 // bytes for PBLENDVB.
29397 Amt = DAG.getNode(
29398 ISD::OR, dl, VT,
29399 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29400 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29401 } else {
29402 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29403 }
29404
29405 // r = VSELECT(r, shift(r, 8), a);
29406 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29407 R = SignBitSelect(Amt, M, R);
29408
29409 // a += a
29410 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29411
29412 // r = VSELECT(r, shift(r, 4), a);
29413 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29414 R = SignBitSelect(Amt, M, R);
29415
29416 // a += a
29417 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29418
29419 // r = VSELECT(r, shift(r, 2), a);
29420 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29421 R = SignBitSelect(Amt, M, R);
29422
29423 // a += a
29424 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29425
29426 // return VSELECT(r, shift(r, 1), a);
29427 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29428 R = SignBitSelect(Amt, M, R);
29429 return R;
29430 }
29431
29432 // Decompose 256-bit shifts into 128-bit shifts.
29433 if (VT.is256BitVector())
29434 return splitVectorIntBinary(Op, DAG);
29435
29436 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29437 return splitVectorIntBinary(Op, DAG);
29438
29439 return SDValue();
29440}
29441
29442static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
29443 SelectionDAG &DAG) {
29444 MVT VT = Op.getSimpleValueType();
29445 assert(VT.isVector() && "Custom lowering only for vector rotates!")(static_cast<void> (0));
29446
29447 SDLoc DL(Op);
29448 SDValue R = Op.getOperand(0);
29449 SDValue Amt = Op.getOperand(1);
29450 unsigned Opcode = Op.getOpcode();
29451 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29452 int NumElts = VT.getVectorNumElements();
29453
29454 // Check for constant splat rotation amount.
29455 APInt CstSplatValue;
29456 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
29457
29458 // Check for splat rotate by zero.
29459 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
29460 return R;
29461
29462 // AVX512 implicitly uses modulo rotation amounts.
29463 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
29464 // Attempt to rotate by immediate.
29465 if (IsCstSplat) {
29466 unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
29467 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29468 return DAG.getNode(RotOpc, DL, VT, R,
29469 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29470 }
29471
29472 // Else, fall-back on VPROLV/VPRORV.
29473 return Op;
29474 }
29475
29476 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
29477 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
29478 unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
29479 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
29480 }
29481
29482 assert((Opcode == ISD::ROTL) && "Only ROTL supported")(static_cast<void> (0));
29483
29484 // XOP has 128-bit vector variable + immediate rotates.
29485 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
29486 // XOP implicitly uses modulo rotation amounts.
29487 if (Subtarget.hasXOP()) {
29488 if (VT.is256BitVector())
29489 return splitVectorIntBinary(Op, DAG);
29490 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!")(static_cast<void> (0));
29491
29492 // Attempt to rotate by immediate.
29493 if (IsCstSplat) {
29494 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29495 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
29496 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29497 }
29498
29499 // Use general rotate by variable (per-element).
29500 return Op;
29501 }
29502
29503 // Split 256-bit integers on pre-AVX2 targets.
29504 if (VT.is256BitVector() && !Subtarget.hasAVX2())
29505 return splitVectorIntBinary(Op, DAG);
29506
29507 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||(static_cast<void> (0))
29508 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||(static_cast<void> (0))
29509 VT == MVT::v32i16) &&(static_cast<void> (0))
29510 Subtarget.hasAVX2())) &&(static_cast<void> (0))
29511 "Only vXi32/vXi16/vXi8 vector rotates supported")(static_cast<void> (0));
29512
29513 // Rotate by a uniform constant - expand back to shifts.
29514 if (IsCstSplat)
29515 return SDValue();
29516
29517 bool IsSplatAmt = DAG.isSplatValue(Amt);
29518
29519 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
29520 // the amount bit.
29521 if (EltSizeInBits == 8 && !IsSplatAmt) {
29522 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
29523 return SDValue();
29524
29525 // We don't need ModuloAmt here as we just peek at individual bits.
29526 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29527
29528 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29529 if (Subtarget.hasSSE41()) {
29530 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29531 // on the sign bit.
29532 V0 = DAG.getBitcast(VT, V0);
29533 V1 = DAG.getBitcast(VT, V1);
29534 Sel = DAG.getBitcast(VT, Sel);
29535 return DAG.getBitcast(SelVT,
29536 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29537 }
29538 // On pre-SSE41 targets we test for the sign bit by comparing to
29539 // zero - a negative value will set all bits of the lanes to true
29540 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29541 SDValue Z = DAG.getConstant(0, DL, SelVT);
29542 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29543 return DAG.getSelect(DL, SelVT, C, V0, V1);
29544 };
29545
29546 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29547 // We can safely do this using i16 shifts as we're only interested in
29548 // the 3 lower bits of each byte.
29549 Amt = DAG.getBitcast(ExtVT, Amt);
29550 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29551 Amt = DAG.getBitcast(VT, Amt);
29552
29553 // r = VSELECT(r, rot(r, 4), a);
29554 SDValue M;
29555 M = DAG.getNode(
29556 ISD::OR, DL, VT,
29557 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
29558 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
29559 R = SignBitSelect(VT, Amt, M, R);
29560
29561 // a += a
29562 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29563
29564 // r = VSELECT(r, rot(r, 2), a);
29565 M = DAG.getNode(
29566 ISD::OR, DL, VT,
29567 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
29568 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
29569 R = SignBitSelect(VT, Amt, M, R);
29570
29571 // a += a
29572 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29573
29574 // return VSELECT(r, rot(r, 1), a);
29575 M = DAG.getNode(
29576 ISD::OR, DL, VT,
29577 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
29578 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
29579 return SignBitSelect(VT, Amt, M, R);
29580 }
29581
29582 // ISD::ROT* uses modulo rotate amounts.
29583 if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
29584 // If the amount is a splat, perform the modulo BEFORE the splat,
29585 // this helps LowerScalarVariableShift to remove the splat later.
29586 Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
29587 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29588 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29589 Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
29590 SmallVector<int>(NumElts, 0));
29591 } else {
29592 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29593 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29594 }
29595
29596 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29597 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29598 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
29599
29600 // Fallback for splats + all supported variable shifts.
29601 // Fallback for non-constant AVX2 vXi16 as well.
29602 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29603 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29604 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29605 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
29606 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
29607 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
29608 }
29609
29610 // As with shifts, convert the rotation amount to a multiplication factor.
29611 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29612 assert(Scale && "Failed to convert ROTL amount to scale")(static_cast<void> (0));
29613
29614 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
29615 if (EltSizeInBits == 16) {
29616 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29617 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29618 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29619 }
29620
29621 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29622 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29623 // that can then be OR'd with the lower 32-bits.
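// e.g. rotl(x, 8): Scale = 1 << 8 and the 64-bit product x * 256 holds
// (x << 8) in its low 32 bits and (x >> 24) in its high 32 bits, so OR'ing
// the interleaved halves back together yields the rotate.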
29624 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected")(static_cast<void> (0));
29625 static const int OddMask[] = {1, -1, 3, -1};
29626 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29627 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29628
29629 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29630 DAG.getBitcast(MVT::v2i64, R),
29631 DAG.getBitcast(MVT::v2i64, Scale));
29632 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29633 DAG.getBitcast(MVT::v2i64, R13),
29634 DAG.getBitcast(MVT::v2i64, Scale13));
29635 Res02 = DAG.getBitcast(VT, Res02);
29636 Res13 = DAG.getBitcast(VT, Res13);
29637
29638 return DAG.getNode(ISD::OR, DL, VT,
29639 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29640 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
29641}
29642
29643/// Returns true if the operand type is exactly twice the native width, and
29644/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29645/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29646/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29647bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
29648 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
29649
29650 if (OpWidth == 64)
29651 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
29652 if (OpWidth == 128)
29653 return Subtarget.hasCmpxchg16b();
29654
29655 return false;
29656}
29657
29658bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29659 Type *MemType = SI->getValueOperand()->getType();
29660
29661 bool NoImplicitFloatOps =
29662 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29663 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29664 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29665 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29666 return false;
29667
29668 return needsCmpXchgNb(MemType);
29669}
29670
29671// Note: this turns large loads into lock cmpxchg8b/16b.
29672// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
29673TargetLowering::AtomicExpansionKind
29674X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29675 Type *MemType = LI->getType();
29676
29677 // If this is a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
29678 // can use movq to do the load. If we have X87 we can load into an 80-bit
29679 // X87 register and store it to a stack temporary.
29680 bool NoImplicitFloatOps =
29681 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29682 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29683 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29684 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29685 return AtomicExpansionKind::None;
29686
29687 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29688 : AtomicExpansionKind::None;
29689}
29690
29691TargetLowering::AtomicExpansionKind
29692X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29693 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29694 Type *MemType = AI->getType();
29695
29696 // If the operand is too big, we must see if cmpxchg8/16b is available
29697 // and default to library calls otherwise.
29698 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29699 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29700 : AtomicExpansionKind::None;
29701 }
29702
29703 AtomicRMWInst::BinOp Op = AI->getOperation();
29704 switch (Op) {
29705 default:
29706 llvm_unreachable("Unknown atomic operation")__builtin_unreachable();
29707 case AtomicRMWInst::Xchg:
29708 case AtomicRMWInst::Add:
29709 case AtomicRMWInst::Sub:
29710 // It's better to use xadd, xsub or xchg for these in all cases.
29711 return AtomicExpansionKind::None;
29712 case AtomicRMWInst::Or:
29713 case AtomicRMWInst::And:
29714 case AtomicRMWInst::Xor:
29715 // If the atomicrmw's result isn't actually used, we can just add a "lock"
29716 // prefix to a normal instruction for these operations.
29717 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29718 : AtomicExpansionKind::None;
29719 case AtomicRMWInst::Nand:
29720 case AtomicRMWInst::Max:
29721 case AtomicRMWInst::Min:
29722 case AtomicRMWInst::UMax:
29723 case AtomicRMWInst::UMin:
29724 case AtomicRMWInst::FAdd:
29725 case AtomicRMWInst::FSub:
29726 // These always require a non-trivial set of data operations on x86. We must
29727 // use a cmpxchg loop.
29728 return AtomicExpansionKind::CmpXChg;
29729 }
29730}
29731
29732LoadInst *
29733X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29734 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29735 Type *MemType = AI->getType();
29736 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29737 // there is no benefit in turning such RMWs into loads, and it is actually
29738 // harmful as it introduces a mfence.
29739 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29740 return nullptr;
29741
29742 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29743 // lowering available in lowerAtomicArith.
29744 // TODO: push more cases through this path.
29745 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29746 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29747 AI->use_empty())
29748 return nullptr;
29749
29750 IRBuilder<> Builder(AI);
29751 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29752 auto SSID = AI->getSyncScopeID();
29753 // We must restrict the ordering to avoid generating loads with Release or
29754 // ReleaseAcquire orderings.
29755 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29756
29757 // Before the load we need a fence. Here is an example lifted from
29758 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29759 // is required:
29760 // Thread 0:
29761 // x.store(1, relaxed);
29762 // r1 = y.fetch_add(0, release);
29763 // Thread 1:
29764 // y.fetch_add(42, acquire);
29765 // r2 = x.load(relaxed);
29766 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29767 // lowered to just a load without a fence. A mfence flushes the store buffer,
29768 // making the optimization clearly correct.
29769 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
29770 // otherwise, we might be able to be more aggressive on relaxed idempotent
29771 // rmw. In practice, they do not look useful, so we don't try to be
29772 // especially clever.
29773 if (SSID == SyncScope::SingleThread)
29774 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29775 // the IR level, so we must wrap it in an intrinsic.
29776 return nullptr;
29777
29778 if (!Subtarget.hasMFence())
29779 // FIXME: it might make sense to use a locked operation here but on a
29780 // different cache-line to prevent cache-line bouncing. In practice it
29781 // is probably a small win, and x86 processors without mfence are rare
29782 // enough that we do not bother.
29783 return nullptr;
29784
29785 Function *MFence =
29786 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29787 Builder.CreateCall(MFence, {});
29788
29789 // Finally we can emit the atomic load.
29790 LoadInst *Loaded = Builder.CreateAlignedLoad(
29791 AI->getType(), AI->getPointerOperand(), AI->getAlign());
29792 Loaded->setAtomic(Order, SSID);
29793 AI->replaceAllUsesWith(Loaded);
29794 AI->eraseFromParent();
29795 return Loaded;
29796}
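// Editorial sketch (not part of the original file): the transformation above
// written as before/after IR for a hypothetical idempotent RMW, using the
// typed-pointer IR syntax of this LLVM version.
//
//   before:  %old = atomicrmw add i32* %p, i32 0 acquire
//   after:   call void @llvm.x86.sse2.mfence()
//            %old = load atomic i32, i32* %p acquire, align 4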
29797
29798bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29799 if (!SI.isUnordered())
29800 return false;
29801 return ExperimentalUnorderedISEL;
29802}
29803bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29804 if (!LI.isUnordered())
29805 return false;
29806 return ExperimentalUnorderedISEL;
29807}
29808
29809
29810/// Emit a locked operation on a stack location which does not change any
29811/// memory location, but does involve a lock prefix. Location is chosen to be
29812/// a) very likely accessed only by a single thread to minimize cache traffic,
29813/// and b) definitely dereferenceable. Returns the new Chain result.
29814static SDValue emitLockedStackOp(SelectionDAG &DAG,
29815 const X86Subtarget &Subtarget, SDValue Chain,
29816 const SDLoc &DL) {
29817 // Implementation notes:
29818 // 1) LOCK prefix creates a full read/write reordering barrier for memory
29819 // operations issued by the current processor. As such, the location
29820 // referenced is not relevant for the ordering properties of the instruction.
29821 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
29822 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
29823 // 2) Using an immediate operand appears to be the best encoding choice
29824 // here since it doesn't require an extra register.
29825 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29826 // is small enough it might just be measurement noise.)
29827 // 4) When choosing offsets, there are several contributing factors:
29828 // a) If there's no redzone, we default to TOS. (We could allocate a cache
29829 // line aligned stack object to improve this case.)
29830 // b) To minimize our chances of introducing a false dependence, we prefer
29831 // to offset the stack usage from TOS slightly.
29832 // c) To minimize concerns about cross thread stack usage - in particular,
29833 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29834 // captures state in the TOS frame and accesses it from many threads -
29835 // we want to use an offset such that the offset is in a distinct cache
29836 // line from the TOS frame.
29837 //
29838 // For a general discussion of the tradeoffs and benchmark results, see:
29839 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
29840
29841 auto &MF = DAG.getMachineFunction();
29842 auto &TFL = *Subtarget.getFrameLowering();
29843 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29844
29845 if (Subtarget.is64Bit()) {
29846 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29847 SDValue Ops[] = {
29848 DAG.getRegister(X86::RSP, MVT::i64), // Base
29849 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29850 DAG.getRegister(0, MVT::i64), // Index
29851 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29852 DAG.getRegister(0, MVT::i16), // Segment.
29853 Zero,
29854 Chain};
29855 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29856 MVT::Other, Ops);
29857 return SDValue(Res, 1);
29858 }
29859
29860 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29861 SDValue Ops[] = {
29862 DAG.getRegister(X86::ESP, MVT::i32), // Base
29863 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29864 DAG.getRegister(0, MVT::i32), // Index
29865 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29866 DAG.getRegister(0, MVT::i16), // Segment.
29867 Zero,
29868 Chain
29869 };
29870 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29871 MVT::Other, Ops);
29872 return SDValue(Res, 1);
29873}
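// Editorial sketch (not part of the original file): the machine node built
// above typically selects, in AT&T syntax, to something like
//   lock orl $0x0, -0x40(%rsp)   // 64-bit with a 128-byte red zone
//   lock orl $0x0, (%rsp)        // 64-bit without a red zone
//   lock orl $0x0, (%esp)        // 32-bit
// i.e. a dummy read-modify-write at or just below the top of stack whose only
// purpose is the full fence implied by the LOCK prefix.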
29874
29875static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29876 SelectionDAG &DAG) {
29877 SDLoc dl(Op);
29878 AtomicOrdering FenceOrdering =
29879 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29880 SyncScope::ID FenceSSID =
29881 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29882
29883 // The only fence that needs an instruction is a sequentially-consistent
29884 // cross-thread fence.
29885 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29886 FenceSSID == SyncScope::System) {
29887 if (Subtarget.hasMFence())
29888 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29889
29890 SDValue Chain = Op.getOperand(0);
29891 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29892 }
29893
29894 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29895 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29896}
29897
29898static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29899 SelectionDAG &DAG) {
29900 MVT T = Op.getSimpleValueType();
29901 SDLoc DL(Op);
29902 unsigned Reg = 0;
29903 unsigned size = 0;
29904 switch(T.SimpleTy) {
29905 default: llvm_unreachable("Invalid value type!");
29906 case MVT::i8: Reg = X86::AL; size = 1; break;
29907 case MVT::i16: Reg = X86::AX; size = 2; break;
29908 case MVT::i32: Reg = X86::EAX; size = 4; break;
29909 case MVT::i64:
29910 assert(Subtarget.is64Bit() && "Node not type legal!");
29911 Reg = X86::RAX; size = 8;
29912 break;
29913 }
29914 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29915 Op.getOperand(2), SDValue());
29916 SDValue Ops[] = { cpIn.getValue(0),
29917 Op.getOperand(1),
29918 Op.getOperand(3),
29919 DAG.getTargetConstant(size, DL, MVT::i8),
29920 cpIn.getValue(1) };
29921 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29922 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29923 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29924 Ops, T, MMO);
29925
29926 SDValue cpOut =
29927 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29928 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29929 MVT::i32, cpOut.getValue(2));
29930 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29931
29932 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29933 cpOut, Success, EFLAGS.getValue(1));
29934}
29935
29936// Create MOVMSKB, taking into account whether we need to split for AVX1.
29937static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29938 const X86Subtarget &Subtarget) {
29939 MVT InVT = V.getSimpleValueType();
29940
29941 if (InVT == MVT::v64i8) {
29942 SDValue Lo, Hi;
29943 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29944 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29945 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29946 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29947 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29948 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29949 DAG.getConstant(32, DL, MVT::i8));
29950 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29951 }
29952 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29953 SDValue Lo, Hi;
29954 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29955 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29956 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29957 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29958 DAG.getConstant(16, DL, MVT::i8));
29959 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29960 }
29961
29962 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29963}
29964
29965static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29966 SelectionDAG &DAG) {
29967 SDValue Src = Op.getOperand(0);
29968 MVT SrcVT = Src.getSimpleValueType();
29969 MVT DstVT = Op.getSimpleValueType();
29970
29971 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29972 // half to v32i1 and concatenating the result.
29973 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29974 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29975 assert(Subtarget.hasBWI() && "Expected BWI target");
29976 SDLoc dl(Op);
29977 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29978 DAG.getIntPtrConstant(0, dl));
29979 Lo = DAG.getBitcast(MVT::v32i1, Lo);
29980 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29981 DAG.getIntPtrConstant(1, dl));
29982 Hi = DAG.getBitcast(MVT::v32i1, Hi);
29983 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
29984 }
29985
29986 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
29987 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
29988 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
29989 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
29990 SDLoc DL(Op);
29991 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
29992 V = getPMOVMSKB(DL, V, DAG, Subtarget);
29993 return DAG.getZExtOrTrunc(V, DL, DstVT);
29994 }
29995
29996 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
29997 SrcVT == MVT::i64) && "Unexpected VT!");
29998
29999 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30000 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30001 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30002 // This conversion needs to be expanded.
30003 return SDValue();
30004
30005 SDLoc dl(Op);
30006 if (SrcVT.isVector()) {
30007 // Widen the input vector in the case of MVT::v2i32.
30008 // Example: from MVT::v2i32 to MVT::v4i32.
30009 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30010 SrcVT.getVectorNumElements() * 2);
30011 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30012 DAG.getUNDEF(SrcVT));
30013 } else {
30014 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
30015 "Unexpected source type in LowerBITCAST");
30016 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30017 }
30018
30019 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30020 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30021
30022 if (DstVT == MVT::x86mmx)
30023 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30024
30025 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30026 DAG.getIntPtrConstant(0, dl));
30027}
30028
30029/// Compute the horizontal sum of bytes in V for the elements of VT.
30030///
30031/// Requires V to be a byte vector and VT to be an integer vector type with
30032/// wider elements than V's type. The width of the elements of VT determines
30033/// how many bytes of V are summed horizontally to produce each element of the
30034/// result.
30035static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30036 const X86Subtarget &Subtarget,
30037 SelectionDAG &DAG) {
30038 SDLoc DL(V);
30039 MVT ByteVecVT = V.getSimpleValueType();
30040 MVT EltVT = VT.getVectorElementType();
30041 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
30042 "Expected value to have byte element type.");
30043 assert(EltVT != MVT::i8 &&
30044 "Horizontal byte sum only makes sense for wider elements!");
30045 unsigned VecSize = VT.getSizeInBits();
30046 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30047
30048 // The PSADBW instruction horizontally adds all bytes and leaves the result in
30049 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
30050 if (EltVT == MVT::i64) {
30051 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30052 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30053 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30054 return DAG.getBitcast(VT, V);
30055 }
30056
30057 if (EltVT == MVT::i32) {
30058 // We unpack the low half and high half into i32s interleaved with zeros so
30059 // that we can use PSADBW to horizontally sum them. The most useful part of
30060 // this is that it lines up the results of two PSADBW instructions to be
30061 // two v2i64 vectors which concatenated are the 4 population counts. We can
30062 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
30063 SDValue Zeros = DAG.getConstant(0, DL, VT);
30064 SDValue V32 = DAG.getBitcast(VT, V);
30065 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30066 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30067
30068 // Do the horizontal sums into two v2i64s.
30069 Zeros = DAG.getConstant(0, DL, ByteVecVT);
30070 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30071 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30072 DAG.getBitcast(ByteVecVT, Low), Zeros);
30073 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30074 DAG.getBitcast(ByteVecVT, High), Zeros);
30075
30076 // Merge them together.
30077 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30078 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30079 DAG.getBitcast(ShortVecVT, Low),
30080 DAG.getBitcast(ShortVecVT, High));
30081
30082 return DAG.getBitcast(VT, V);
30083 }
30084
30085 // The only element type left is i16.
30086 assert(EltVT == MVT::i16 && "Unknown how to handle type");
30087
30088 // To obtain pop count for each i16 element starting from the pop count for
30089 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
30090 // right by 8. It is important to shift as i16s as i8 vector shift isn't
30091 // directly supported.
30092 SDValue ShifterV = DAG.getConstant(8, DL, VT);
30093 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30094 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30095 DAG.getBitcast(ByteVecVT, V));
30096 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30097}
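// Editorial worked example (not part of the original file) for the i16 path
// above. Suppose one i16 lane of V holds per-byte pop counts b0 (low byte)
// and b1 (high byte), i.e. the lane value is b1*256 + b0:
//   Shl = lane << 8 as i16      -> bytes [0, b0]
//   add as i8 (Shl + V)         -> bytes [b0, b0 + b1]
//   SRL by 8 as i16             -> b0 + b1, the pop count of the whole i16.
// For b0 = 3 and b1 = 5 the lane ends up holding 8.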
30098
30099static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30100 const X86Subtarget &Subtarget,
30101 SelectionDAG &DAG) {
30102 MVT VT = Op.getSimpleValueType();
30103 MVT EltVT = VT.getVectorElementType();
30104 int NumElts = VT.getVectorNumElements();
30105 (void)EltVT;
30106 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30107
30108 // Implement a lookup table in register by using an algorithm based on:
30109 // http://wm.ite.pl/articles/sse-popcount.html
30110 //
30111 // The general idea is that every lower byte nibble in the input vector is an
30112 // index into an in-register pre-computed pop count table. We then split the
30113 // input vector into two new ones: (1) a vector with only the shifted-right
30114 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
30115 // masked-out higher ones) for each byte. PSHUFB is used separately with both
30116 // to index the in-register table. Next, both are added and the result is an
30117 // i8 vector where each element contains the pop count for its input byte.
30118 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30119 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30120 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30121 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30122
30123 SmallVector<SDValue, 64> LUTVec;
30124 for (int i = 0; i < NumElts; ++i)
30125 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30126 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30127 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30128
30129 // High nibbles
30130 SDValue FourV = DAG.getConstant(4, DL, VT);
30131 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30132
30133 // Low nibbles
30134 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30135
30136 // The input vector is used as the shuffle mask that indexes elements into the
30137 // LUT. After counting low and high nibbles, add the two results to obtain the
30138 // final pop count per i8 element.
30139 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30140 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30141 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
30142}
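// Editorial sketch (not part of the original file): a scalar reference of the
// nibble-LUT scheme above, assuming <cstdint> types are available. Each byte's
// low and high nibble index the same 16-entry table and the two lookups are
// summed, which is exactly what the two PSHUFB feeds compute per lane.
// Example: 0xB7 = 0b1011'0111 -> LUT[0x7] + LUT[0xB] = 3 + 3 = 6 set bits.
static inline uint8_t popcountByteViaNibbleLUT(uint8_t Byte) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return LUT[Byte & 0xF] + LUT[Byte >> 4];
}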
30143
30144// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
30145// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
30146static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30147 SelectionDAG &DAG) {
30148 MVT VT = Op.getSimpleValueType();
30149 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
30150 "Unknown CTPOP type to handle");
30151 SDLoc DL(Op.getNode());
30152 SDValue Op0 = Op.getOperand(0);
30153
30154 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
30155 if (Subtarget.hasVPOPCNTDQ()) {
30156 unsigned NumElems = VT.getVectorNumElements();
30157 assert((VT.getVectorElementType() == MVT::i8 ||
30158 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
30159 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
30160 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
30161 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
30162 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
30163 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
30164 }
30165 }
30166
30167 // Decompose 256-bit ops into smaller 128-bit ops.
30168 if (VT.is256BitVector() && !Subtarget.hasInt256())
30169 return splitVectorIntUnary(Op, DAG);
30170
30171 // Decompose 512-bit ops into smaller 256-bit ops.
30172 if (VT.is512BitVector() && !Subtarget.hasBWI())
30173 return splitVectorIntUnary(Op, DAG);
30174
30175 // For element types greater than i8, do vXi8 pop counts and a bytesum.
30176 if (VT.getScalarType() != MVT::i8) {
30177 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
30178 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
30179 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
30180 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
30181 }
30182
30183 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
30184 if (!Subtarget.hasSSSE3())
30185 return SDValue();
30186
30187 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
30188}
30189
30190static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30191 SelectionDAG &DAG) {
30192 assert(Op.getSimpleValueType().isVector() &&
30193 "We only do custom lowering for vector population count.");
30194 return LowerVectorCTPOP(Op, Subtarget, DAG);
30195}
30196
30197static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
30198 MVT VT = Op.getSimpleValueType();
30199 SDValue In = Op.getOperand(0);
30200 SDLoc DL(Op);
30201
30202 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
30203 // perform the BITREVERSE.
30204 if (!VT.isVector()) {
30205 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
30206 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
30207 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
30208 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
30209 DAG.getIntPtrConstant(0, DL));
30210 }
30211
30212 int NumElts = VT.getVectorNumElements();
30213 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
30214
30215 // Decompose 256-bit ops into smaller 128-bit ops.
30216 if (VT.is256BitVector())
30217 return splitVectorIntUnary(Op, DAG);
30218
30219 assert(VT.is128BitVector() &&
30220 "Only 128-bit vector bitreverse lowering supported.");
30221
30222 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
30223 // perform the BSWAP in the shuffle.
30224 // It's best to shuffle using the second operand as this will implicitly allow
30225 // memory folding for multiple vectors.
30226 SmallVector<SDValue, 16> MaskElts;
30227 for (int i = 0; i != NumElts; ++i) {
30228 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
30229 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
30230 int PermuteByte = SourceByte | (2 << 5);
30231 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
30232 }
30233 }
30234
30235 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
30236 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
30237 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
30238 Res, Mask);
30239 return DAG.getBitcast(VT, Res);
30240}
30241
30242static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
30243 SelectionDAG &DAG) {
30244 MVT VT = Op.getSimpleValueType();
30245
30246 if (Subtarget.hasXOP() && !VT.is512BitVector())
30247 return LowerBITREVERSE_XOP(Op, DAG);
30248
30249 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
30250
30251 SDValue In = Op.getOperand(0);
30252 SDLoc DL(Op);
30253
30254 assert(VT.getScalarType() == MVT::i8 &&
30255 "Only byte vector BITREVERSE supported");
30256
30257 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
30258 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
30259 return splitVectorIntUnary(Op, DAG);
30260
30261 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
30262 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
30263 return splitVectorIntUnary(Op, DAG);
30264
30265 unsigned NumElts = VT.getVectorNumElements();
30266
30267 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
30268 if (Subtarget.hasGFNI()) {
30269 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
30270 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
30271 Matrix = DAG.getBitcast(VT, Matrix);
30272 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
30273 DAG.getTargetConstant(0, DL, MVT::i8));
30274 }
30275
30276 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
30277 // its two nibbles, and a PSHUFB lookup finds the bit-reverse of each
30278 // 0-15 value (moved into the other nibble).
30279 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
30280 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
30281 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
30282
30283 const int LoLUT[16] = {
30284 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
30285 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
30286 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
30287 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
30288 const int HiLUT[16] = {
30289 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
30290 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
30291 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
30292 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
30293
30294 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
30295 for (unsigned i = 0; i < NumElts; ++i) {
30296 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
30297 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
30298 }
30299
30300 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
30301 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
30302 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
30303 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
30304 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30305}
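// Editorial worked example (not part of the original file) for the PSHUFB
// path above. Take the byte 0x1A = 0b0001'1010:
//   low nibble 0xA  -> LoLUT[0xA] = 0x50 (0b1010 reversed is 0b0101, placed in
//   the high nibble); high nibble 0x1 -> HiLUT[0x1] = 0x08 (0b0001 reversed is
//   0b1000, placed in the low nibble).
//   OR'ing the two lookups gives 0x58 = 0b0101'1000, the bit-reverse of 0x1A.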
30306
30307static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
30308 SelectionDAG &DAG) {
30309 SDLoc DL(Op);
30310 SDValue X = Op.getOperand(0);
30311 MVT VT = Op.getSimpleValueType();
30312
30313 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
30314 if (VT == MVT::i8 ||
30315 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
30316 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
30317 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
30318 DAG.getConstant(0, DL, MVT::i8));
30319 // Copy the inverse of the parity flag into a register with setcc.
30320 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
30321 // Extend to the original type.
30322 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
30323 }
30324
30325 if (VT == MVT::i64) {
30326 // Xor the high and low 32-bit halves together using 32-bit operations.
30327 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
30328 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
30329 DAG.getConstant(32, DL, MVT::i8)));
30330 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
30331 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
30332 }
30333
30334 if (VT != MVT::i16) {
30335 // Xor the high and low 16-bits together using a 32-bit operation.
30336 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
30337 DAG.getConstant(16, DL, MVT::i8));
30338 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
30339 } else {
30340 // If the input is 16-bits, we need to extend to use an i32 shift below.
30341 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
30342 }
30343
30344 // Finally, xor the low 2 bytes together using an 8-bit flag-setting xor.
30345 // This should allow an h-reg to be used to save a shift.
30346 SDValue Hi = DAG.getNode(
30347 ISD::TRUNCATE, DL, MVT::i8,
30348 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
30349 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
30350 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
30351 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
30352
30353 // Copy the inverse of the parity flag into a register with setcc.
30354 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
30355 // Extend to the original type.
30356 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
30357}
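// Editorial sketch (not part of the original file): the scalar idea behind
// the lowering above, assuming <cstdint> and the GCC/Clang __builtin_popcount.
// Parity is invariant under XOR-folding, so the value is folded down to one
// byte and only that byte's parity is tested (in hardware via PF + SETNP).
static inline uint32_t parity32Reference(uint32_t X) {
  X ^= X >> 16;                                 // fold high half into low half
  uint8_t Byte = uint8_t(X) ^ uint8_t(X >> 8);  // fold the remaining two bytes
  return __builtin_popcount(Byte) & 1;          // 1 if an odd number of bits set
}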
30358
30359static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
30360 const X86Subtarget &Subtarget) {
30361 unsigned NewOpc = 0;
30362 switch (N->getOpcode()) {
30363 case ISD::ATOMIC_LOAD_ADD:
30364 NewOpc = X86ISD::LADD;
30365 break;
30366 case ISD::ATOMIC_LOAD_SUB:
30367 NewOpc = X86ISD::LSUB;
30368 break;
30369 case ISD::ATOMIC_LOAD_OR:
30370 NewOpc = X86ISD::LOR;
30371 break;
30372 case ISD::ATOMIC_LOAD_XOR:
30373 NewOpc = X86ISD::LXOR;
30374 break;
30375 case ISD::ATOMIC_LOAD_AND:
30376 NewOpc = X86ISD::LAND;
30377 break;
30378 default:
30379 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
30380 }
30381
30382 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
30383
30384 return DAG.getMemIntrinsicNode(
30385 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
30386 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
30387 /*MemVT=*/N->getSimpleValueType(0), MMO);
30388}
30389
30390/// Lower atomic_load_ops into LOCK-prefixed operations.
30391static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
30392 const X86Subtarget &Subtarget) {
30393 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
30394 SDValue Chain = N->getOperand(0);
30395 SDValue LHS = N->getOperand(1);
30396 SDValue RHS = N->getOperand(2);
30397 unsigned Opc = N->getOpcode();
30398 MVT VT = N->getSimpleValueType(0);
30399 SDLoc DL(N);
30400
30401 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
30402 // can only be lowered when the result is unused. They should have already
30403 // been transformed into a cmpxchg loop in AtomicExpand.
30404 if (N->hasAnyUseOfValue(0)) {
30405 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
30406 // select LXADD if LOCK_SUB can't be selected.
30407 if (Opc == ISD::ATOMIC_LOAD_SUB) {
30408 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
30409 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
30410 RHS, AN->getMemOperand());
30411 }
30412 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
30413 "Used AtomicRMW ops other than Add should have been expanded!");
30414 return N;
30415 }
30416
30417 // Specialized lowering for the canonical form of an idempotent atomicrmw.
30418 // The core idea here is that since the memory location isn't actually
30419 // changing, all we need is a lowering for the *ordering* impacts of the
30420 // atomicrmw. As such, we can choose a different operation and memory
30421 // location to minimize impact on other code.
30422 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
30423 // On X86, the only ordering which actually requires an instruction is
30424 // seq_cst which isn't SingleThread; everything else just needs to be
30425 // preserved during codegen and then dropped. Note that we expect (but don't
30426 // assume) that orderings other than seq_cst and acq_rel have been
30427 // canonicalized to a store or load.
30428 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
30429 AN->getSyncScopeID() == SyncScope::System) {
30430 // Prefer a locked operation against a stack location to minimize cache
30431 // traffic. This assumes that stack locations are very likely to be
30432 // accessed only by the owning thread.
30433 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
30434 assert(!N->hasAnyUseOfValue(0));
30435 // NOTE: The getUNDEF is needed to give something for the unused result 0.
30436 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
30437 DAG.getUNDEF(VT), NewChain);
30438 }
30439 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30440 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
30441 assert(!N->hasAnyUseOfValue(0));
30442 // NOTE: The getUNDEF is needed to give something for the unused result 0.
30443 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
30444 DAG.getUNDEF(VT), NewChain);
30445 }
30446
30447 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
30448 // RAUW the chain, but don't worry about the result, as it's unused.
30449 assert(!N->hasAnyUseOfValue(0));
30450 // NOTE: The getUNDEF is needed to give something for the unused result 0.
30451 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
30452 DAG.getUNDEF(VT), LockOp.getValue(1));
30453}
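// Editorial sketch (not part of the original file): the effect of the special
// case above on a hypothetical "fence-like" RMW whose result is dead:
//
//   atomicrmw or i32* %p, i32 0 seq_cst      ; no uses of the result
//
// never touches %p at all; it selects to a locked OR against a stack slot
// (see emitLockedStackOp) when it is seq_cst at System scope, or to a plain
// compiler barrier (MEMBARRIER) otherwise.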
30454
30455static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
30456 const X86Subtarget &Subtarget) {
30457 auto *Node = cast<AtomicSDNode>(Op.getNode());
30458 SDLoc dl(Node);
30459 EVT VT = Node->getMemoryVT();
30460
30461 bool IsSeqCst =
30462 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
30463 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
30464
30465 // If this store is not sequentially consistent and the type is legal
30466 // we can just keep it.
30467 if (!IsSeqCst && IsTypeLegal)
30468 return Op;
30469
30470 if (VT == MVT::i64 && !IsTypeLegal) {
30471 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
30472 // is enabled.
30473 bool NoImplicitFloatOps =
30474 DAG.getMachineFunction().getFunction().hasFnAttribute(
30475 Attribute::NoImplicitFloat);
30476 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
30477 SDValue Chain;
30478 if (Subtarget.hasSSE1()) {
30479 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
30480 Node->getOperand(2));
30481 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
30482 SclToVec = DAG.getBitcast(StVT, SclToVec);
30483 SDVTList Tys = DAG.getVTList(MVT::Other);
30484 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
30485 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
30486 MVT::i64, Node->getMemOperand());
30487 } else if (Subtarget.hasX87()) {
30488 // First load this into an 80-bit X87 register using a stack temporary.
30489 // This will put the whole integer into the significand.
30490 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
30491 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30492 MachinePointerInfo MPI =
30493 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30494 Chain =
30495 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
30496 MPI, MaybeAlign(), MachineMemOperand::MOStore);
30497 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
30498 SDValue LdOps[] = {Chain, StackPtr};
30499 SDValue Value =
30500 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
30501 /*Align*/ None, MachineMemOperand::MOLoad);
30502 Chain = Value.getValue(1);
30503
30504 // Now use an FIST to do the atomic store.
30505 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
30506 Chain =
30507 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
30508 StoreOps, MVT::i64, Node->getMemOperand());
30509 }
30510
30511 if (Chain) {
30512 // If this is a sequentially consistent store, also emit an appropriate
30513 // barrier.
30514 if (IsSeqCst)
30515 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
30516
30517 return Chain;
30518 }
30519 }
30520 }
30521
30522 // Convert seq_cst store -> xchg
30523 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
30524 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
30525 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
30526 Node->getMemoryVT(),
30527 Node->getOperand(0),
30528 Node->getOperand(1), Node->getOperand(2),
30529 Node->getMemOperand());
30530 return Swap.getValue(1);
30531}
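// Editorial sketch (not part of the original file): the fallback at the end
// of LowerATOMIC_STORE means a seq_cst 32-bit atomic store such as
//   store atomic i32 %v, i32* %p seq_cst, align 4
// is emitted as an XCHG (e.g. xchgl %eax, (%rdi)), whose implicit LOCK
// semantics provide the required fence, rather than a mov followed by mfence.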
30532
30533static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
30534 SDNode *N = Op.getNode();
30535 MVT VT = N->getSimpleValueType(0);
30536 unsigned Opc = Op.getOpcode();
30537
30538 // Let legalize expand this if it isn't a legal type yet.
30539 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30540 return SDValue();
30541
30542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30543 SDLoc DL(N);
30544
30545 // Set the carry flag.
30546 SDValue Carry = Op.getOperand(2);
30547 EVT CarryVT = Carry.getValueType();
30548 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
30549 Carry, DAG.getAllOnesConstant(DL, CarryVT));
30550
30551 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
30552 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
30553 Op.getOperand(0), Op.getOperand(1),
30554 Carry.getValue(1));
30555
30556 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
30557 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
30558 Sum.getValue(1), DL, DAG);
30559 if (N->getValueType(1) == MVT::i1)
30560 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
30561
30562 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
30563}
30564
30565static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
30566 SelectionDAG &DAG) {
30567 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
30568
30569 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
30570 // which returns the values as { float, float } (in XMM0) or
30571 // { double, double } (which is returned in XMM0, XMM1).
30572 SDLoc dl(Op);
30573 SDValue Arg = Op.getOperand(0);
30574 EVT ArgVT = Arg.getValueType();
30575 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30576
30577 TargetLowering::ArgListTy Args;
30578 TargetLowering::ArgListEntry Entry;
30579
30580 Entry.Node = Arg;
30581 Entry.Ty = ArgTy;
30582 Entry.IsSExt = false;
30583 Entry.IsZExt = false;
30584 Args.push_back(Entry);
30585
30586 bool isF64 = ArgVT == MVT::f64;
30587 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
30588 // the small struct {f32, f32} is returned in (eax, edx). For f64,
30589 // the results are returned via SRet in memory.
30590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30591 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
30592 const char *LibcallName = TLI.getLibcallName(LC);
30593 SDValue Callee =
30594 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
30595
30596 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
30597 : (Type *)FixedVectorType::get(ArgTy, 4);
30598
30599 TargetLowering::CallLoweringInfo CLI(DAG);
30600 CLI.setDebugLoc(dl)
30601 .setChain(DAG.getEntryNode())
30602 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
30603
30604 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
30605
30606 if (isF64)
30607 // Returned in xmm0 and xmm1.
30608 return CallResult.first;
30609
30610 // Returned in bits 0:31 and 32:63 of xmm0.
30611 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30612 CallResult.first, DAG.getIntPtrConstant(0, dl));
30613 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30614 CallResult.first, DAG.getIntPtrConstant(1, dl));
30615 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
30616 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
30617}
30618
30619/// Widen a vector input to a vector of NVT. The
30620/// input vector must have the same element type as NVT.
30621static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
30622 bool FillWithZeroes = false) {
30623 // Check if InOp already has the right width.
30624 MVT InVT = InOp.getSimpleValueType();
30625 if (InVT == NVT)
30626 return InOp;
30627
30628 if (InOp.isUndef())
30629 return DAG.getUNDEF(NVT);
30630
30631 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
30632 "input and widen element type must match");
30633
30634 unsigned InNumElts = InVT.getVectorNumElements();
30635 unsigned WidenNumElts = NVT.getVectorNumElements();
30636 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
30637 "Unexpected request for vector widening");
30638
30639 SDLoc dl(InOp);
30640 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
30641 InOp.getNumOperands() == 2) {
30642 SDValue N1 = InOp.getOperand(1);
30643 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
30644 N1.isUndef()) {
30645 InOp = InOp.getOperand(0);
30646 InVT = InOp.getSimpleValueType();
30647 InNumElts = InVT.getVectorNumElements();
30648 }
30649 }
30650 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
30651 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
30652 SmallVector<SDValue, 16> Ops;
30653 for (unsigned i = 0; i < InNumElts; ++i)
30654 Ops.push_back(InOp.getOperand(i));
30655
30656 EVT EltVT = InOp.getOperand(0).getValueType();
30657
30658 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
30659 DAG.getUNDEF(EltVT);
30660 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
30661 Ops.push_back(FillVal);
30662 return DAG.getBuildVector(NVT, dl, Ops);
30663 }
30664 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
30665 DAG.getUNDEF(NVT);
30666 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
30667 InOp, DAG.getIntPtrConstant(0, dl));
30668}
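// Editorial sketch (not part of the original file): ExtendToType is a simple
// widening helper. Widening, say, a v2i32 value to v8i32 yields
//   (v8i32 (insert_subvector (undef-or-zero), %src, 0))
// or, for constant build_vectors, a new build_vector padded with six undef
// (or zero, when FillWithZeroes is set) elements.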
30669
30670static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
30671 SelectionDAG &DAG) {
30672 assert(Subtarget.hasAVX512() &&
30673 "MGATHER/MSCATTER are supported on AVX-512 arch only");
30674
30675 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
30676 SDValue Src = N->getValue();
30677 MVT VT = Src.getSimpleValueType();
30678 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
30679 SDLoc dl(Op);
30680
30681 SDValue Scale = N->getScale();
30682 SDValue Index = N->getIndex();
30683 SDValue Mask = N->getMask();
30684 SDValue Chain = N->getChain();
30685 SDValue BasePtr = N->getBasePtr();
30686
30687 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
30688 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
30689 // If the index is v2i64 and we have VLX we can use xmm for data and index.
30690 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
30691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30692 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
30693 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30694 SDVTList VTs = DAG.getVTList(MVT::Other);
30695 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30696 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30697 N->getMemoryVT(), N->getMemOperand());
30698 }
30699 return SDValue();
30700 }
30701
30702 MVT IndexVT = Index.getSimpleValueType();
30703
30704 // If the index is v2i32, we're being called by type legalization and we
30705 // should just let the default handling take care of it.
30706 if (IndexVT == MVT::v2i32)
30707 return SDValue();
30708
30709 // If we don't have VLX and neither the data nor the index is 512 bits, we
30710 // need to widen until one is.
30711 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30712 !Index.getSimpleValueType().is512BitVector()) {
30713 // Determine how much we need to widen by to get a 512-bit type.
30714 unsigned Factor = std::min(512/VT.getSizeInBits(),
30715 512/IndexVT.getSizeInBits());
30716 unsigned NumElts = VT.getVectorNumElements() * Factor;
30717
30718 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30719 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30720 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30721
30722 Src = ExtendToType(Src, VT, DAG);
30723 Index = ExtendToType(Index, IndexVT, DAG);
30724 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30725 }
30726
30727 SDVTList VTs = DAG.getVTList(MVT::Other);
30728 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30729 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30730 N->getMemoryVT(), N->getMemOperand());
30731}
30732
30733static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30734 SelectionDAG &DAG) {
30735
30736 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30737 MVT VT = Op.getSimpleValueType();
30738 MVT ScalarVT = VT.getScalarType();
30739 SDValue Mask = N->getMask();
30740 MVT MaskVT = Mask.getSimpleValueType();
30741 SDValue PassThru = N->getPassThru();
30742 SDLoc dl(Op);
30743
30744 // Handle AVX masked loads which don't support passthru other than 0.
30745 if (MaskVT.getVectorElementType() != MVT::i1) {
30746 // We also allow undef in the isel pattern.
30747 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30748 return Op;
30749
30750 SDValue NewLoad = DAG.getMaskedLoad(
30751 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30752 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30753 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30754 N->isExpandingLoad());
30755 // Emit a blend.
30756 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30757 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30758 }
30759
30760 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30761 "Expanding masked load is supported on AVX-512 target only!");
30762
30763 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30764 "Expanding masked load is supported for 32 and 64-bit types only!");
30765
30766 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30767 "Cannot lower masked load op.");
30768
30769 assert((ScalarVT.getSizeInBits() >= 32 ||
30770 (Subtarget.hasBWI() &&
30771 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30772 "Unsupported masked load op.");
30773
30774 // This operation is legal for targets with VLX, but without
30775 // VLX the vector should be widened to 512 bits.
30776 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30777 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30778 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30779
30780 // Mask element has to be i1.
30781 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30782 "Unexpected mask type");
30783
30784 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30785
30786 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30787 SDValue NewLoad = DAG.getMaskedLoad(
30788 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30789 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30790 N->getExtensionType(), N->isExpandingLoad());
30791
30792 SDValue Extract =
30793 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30794 DAG.getIntPtrConstant(0, dl));
30795 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30796 return DAG.getMergeValues(RetOps, dl);
30797}
30798
30799static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30800 SelectionDAG &DAG) {
30801 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30802 SDValue DataToStore = N->getValue();
30803 MVT VT = DataToStore.getSimpleValueType();
30804 MVT ScalarVT = VT.getScalarType();
30805 SDValue Mask = N->getMask();
30806 SDLoc dl(Op);
30807
30808 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
30809 "Compressing masked store is supported on AVX-512 target only!");
30810
30811 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
30812 "Compressing masked store is supported for 32 and 64-bit types only!");
30813
30814 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30815 "Cannot lower masked store op.");
30816
30817 assert((ScalarVT.getSizeInBits() >= 32 ||
30818 (Subtarget.hasBWI() &&
30819 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30820 "Unsupported masked store op.");
30821
30822 // This operation is legal for targets with VLX, but without
30823 // VLX the vector should be widened to 512 bits.
30824 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30825 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30826
30827 // Mask element has to be i1.
30828 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30829 "Unexpected mask type");
30830
30831 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30832
30833 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30834 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30835 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30836 N->getOffset(), Mask, N->getMemoryVT(),
30837 N->getMemOperand(), N->getAddressingMode(),
30838 N->isTruncatingStore(), N->isCompressingStore());
30839}
30840
30841static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30842 SelectionDAG &DAG) {
30843 assert(Subtarget.hasAVX2() &&
30844 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30845
30846 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30847 SDLoc dl(Op);
30848 MVT VT = Op.getSimpleValueType();
30849 SDValue Index = N->getIndex();
30850 SDValue Mask = N->getMask();
30851 SDValue PassThru = N->getPassThru();
30852 MVT IndexVT = Index.getSimpleValueType();
30853
30854 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30855
30856 // If the index is v2i32, we're being called by type legalization.
30857 if (IndexVT == MVT::v2i32)
30858 return SDValue();
30859
30860 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
30861 // need to widen until one is.
30862 MVT OrigVT = VT;
30863 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30864 !IndexVT.is512BitVector()) {
30865 // Determine how much we need to widen by to get a 512-bit type.
30866 unsigned Factor = std::min(512/VT.getSizeInBits(),
30867 512/IndexVT.getSizeInBits());
30868
30869 unsigned NumElts = VT.getVectorNumElements() * Factor;
30870
30871 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30872 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30873 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30874
30875 PassThru = ExtendToType(PassThru, VT, DAG);
30876 Index = ExtendToType(Index, IndexVT, DAG);
30877 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30878 }
30879
30880 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30881 N->getScale() };
30882 SDValue NewGather = DAG.getMemIntrinsicNode(
30883 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30884 N->getMemOperand());
30885 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30886 NewGather, DAG.getIntPtrConstant(0, dl));
30887 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30888}
30889
30890static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30891 SDLoc dl(Op);
30892 SDValue Src = Op.getOperand(0);
30893 MVT DstVT = Op.getSimpleValueType();
30894
30895 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30896 unsigned SrcAS = N->getSrcAddressSpace();
30897
30898 assert(SrcAS != N->getDestAddressSpace() &&
30899 "addrspacecast must be between different address spaces");
30900
30901 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30902 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30903 } else if (DstVT == MVT::i64) {
30904 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30905 } else if (DstVT == MVT::i32) {
30906 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30907 } else {
30908 report_fatal_error("Bad address space in addrspacecast");
30909 }
30910 return Op;
30911}
30912
30913SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30914 SelectionDAG &DAG) const {
30915 // TODO: Eventually, the lowering of these nodes should be informed by or
30916 // deferred to the GC strategy for the function in which they appear. For
30917 // now, however, they must be lowered to something. Since they are logically
30918 // no-ops in the case of a null GC strategy (or a GC strategy which does not
30919 // require special handling for these nodes), lower them as literal NOOPs for
30920 // the time being.
30921 SmallVector<SDValue, 2> Ops;
30922
30923 Ops.push_back(Op.getOperand(0));
30924 if (Op->getGluedNode())
30925 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30926
30927 SDLoc OpDL(Op);
30928 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30929 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30930
30931 return NOOP;
30932}
30933
30934// Custom split CVTPS2PH with wide types.
30935static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30936 SDLoc dl(Op);
30937 EVT VT = Op.getValueType();
30938 SDValue Lo, Hi;
30939 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30940 EVT LoVT, HiVT;
30941 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30942 SDValue RC = Op.getOperand(1);
30943 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30944 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30945 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30946}
30947
30948/// Provide custom lowering hooks for some operations.
30949SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30950 switch (Op.getOpcode()) {
30951 default: llvm_unreachable("Should not custom lower this!");
30952 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30953 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30954 return LowerCMP_SWAP(Op, Subtarget, DAG);
30955 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
30956 case ISD::ATOMIC_LOAD_ADD:
30957 case ISD::ATOMIC_LOAD_SUB:
30958 case ISD::ATOMIC_LOAD_OR:
30959 case ISD::ATOMIC_LOAD_XOR:
30960 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
30961 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
30962 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
30963 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
30964 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
30965 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30966 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30967 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
30968 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30969 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
30970 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30971 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30972 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30973 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
30974 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
30975 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
30976 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
30977 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
30978 case ISD::SHL_PARTS:
30979 case ISD::SRA_PARTS:
30980 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
30981 case ISD::FSHL:
30982 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
30983 case ISD::STRICT_SINT_TO_FP:
30984 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
30985 case ISD::STRICT_UINT_TO_FP:
30986 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
30987 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
30988 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
30989 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
30990 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
30991 case ISD::ZERO_EXTEND_VECTOR_INREG:
30992 case ISD::SIGN_EXTEND_VECTOR_INREG:
30993 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
30994 case ISD::FP_TO_SINT:
30995 case ISD::STRICT_FP_TO_SINT:
30996 case ISD::FP_TO_UINT:
30997 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
30998 case ISD::FP_TO_SINT_SAT:
30999 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
31000 case ISD::FP_EXTEND:
31001 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
31002 case ISD::FP_ROUND:
31003 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
31004 case ISD::FP16_TO_FP:
31005 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
31006 case ISD::FP_TO_FP16:
31007 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
31008 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
31009 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
31010 case ISD::FADD:
31011 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
31012 case ISD::FROUND: return LowerFROUND(Op, DAG);
31013 case ISD::FABS:
31014 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
31015 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
31016 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
31017 case ISD::LRINT:
31018 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
31019 case ISD::SETCC:
31020 case ISD::STRICT_FSETCC:
31021 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
31022 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
31023 case ISD::SELECT: return LowerSELECT(Op, DAG);
31024 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
31025 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
31026 case ISD::VASTART: return LowerVASTART(Op, DAG);
31027 case ISD::VAARG: return LowerVAARG(Op, DAG);
31028 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
31029 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31030 case ISD::INTRINSIC_VOID:
31031 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31032 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
31033 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
31034 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
31035 case ISD::FRAME_TO_ARGS_OFFSET:
31036 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31037 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31038 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
31039 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
31040 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
31041 case ISD::EH_SJLJ_SETUP_DISPATCH:
31042 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31043 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
31044 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
31045 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
31046 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
31047 case ISD::CTLZ:
31048 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
31049 case ISD::CTTZ:
31050 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
31051 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
31052 case ISD::MULHS:
31053 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
31054 case ISD::ROTL:
31055 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
31056 case ISD::SRA:
31057 case ISD::SRL:
31058 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
31059 case ISD::SADDO:
31060 case ISD::UADDO:
31061 case ISD::SSUBO:
31062 case ISD::USUBO: return LowerXALUO(Op, DAG);
31063 case ISD::SMULO:
31064 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
31065 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31066 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
31067 case ISD::SADDO_CARRY:
31068 case ISD::SSUBO_CARRY:
31069 case ISD::ADDCARRY:
31070 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
31071 case ISD::ADD:
31072 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
31073 case ISD::UADDSAT:
31074 case ISD::SADDSAT:
31075 case ISD::USUBSAT:
31076 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31077 case ISD::SMAX:
31078 case ISD::SMIN:
31079 case ISD::UMAX:
31080 case ISD::UMIN: return LowerMINMAX(Op, DAG);
31081 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
31082 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
31083 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
31084 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
31085 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
31086 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
31087 case ISD::GC_TRANSITION_START:
31088 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
31089 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
31090 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
31091 }
31092}
31093
31094/// Replace a node with an illegal result type with a new node built out of
31095/// custom code.
31096void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31097 SmallVectorImpl<SDValue>&Results,
31098 SelectionDAG &DAG) const {
31099 SDLoc dl(N);
31100 switch (N->getOpcode()) {
31101 default:
31102#ifndef NDEBUG
31103 dbgs() << "ReplaceNodeResults: ";
31104 N->dump(&DAG);
31105#endif
31106 llvm_unreachable("Do not know how to custom type legalize this operation!");
31107 case X86ISD::CVTPH2PS: {
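// The result type is illegal here, so split the operand in half, convert
// each half with CVTPH2PS, and reassemble the wide result with
// CONCAT_VECTORS.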
31108 EVT VT = N->getValueType(0);
31109 SDValue Lo, Hi;
31110 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31111 EVT LoVT, HiVT;
31112 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31113 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31114 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31115 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31116 Results.push_back(Res);
31117 return;
31118 }
31119 case X86ISD::STRICT_CVTPH2PS: {
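// Same half-and-half splitting as the non-strict case above, except each
// half carries the incoming chain and the two output chains are merged
// with a TokenFactor.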
31120 EVT VT = N->getValueType(0);
31121 SDValue Lo, Hi;
31122 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31123 EVT LoVT, HiVT;
31124 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31125 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31126 {N->getOperand(0), Lo});
31127 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31128 {N->getOperand(0), Hi});
31129 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31130 Lo.getValue(1), Hi.getValue(1));
31131 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31132 Results.push_back(Res);
31133 Results.push_back(Chain);
31134 return;
31135 }
31136 case X86ISD::CVTPS2PH:
31137 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
31138 return;
31139 case ISD::CTPOP: {
31140 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31141 // Use a v2i64 if possible.
31142 bool NoImplicitFloatOps =
31143 DAG.getMachineFunction().getFunction().hasFnAttribute(
31144 Attribute::NoImplicitFloat);
31145 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
31146 SDValue Wide =
31147 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
31148 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
31149 // Bit count should fit in 32-bits, extract it as that and then zero
31150 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
31151 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
31152 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
31153 DAG.getIntPtrConstant(0, dl));
31154 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
31155 Results.push_back(Wide);
31156 }
31157 return;
31158 }
31159 case ISD::MUL: {
31160 EVT VT = N->getValueType(0);
31161 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31162 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
31163 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
31164 // elements are needed.
31165 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
31166 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
31167 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
31168 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
31169 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
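// Widen the narrow vXi8 result back to a legal v16i8 by concatenating
// with undef; only the original low elements are meaningful.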
31170 unsigned NumConcats = 16 / VT.getVectorNumElements();
31171 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
31172 ConcatOps[0] = Res;
31173 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
31174 Results.push_back(Res);
31175 return;
31176 }
31177 case X86ISD::VPMADDWD:
31178 case X86ISD::AVG: {
31179 // Legalize types for X86ISD::AVG/VPMADDWD by widening.
31180 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31181
31182 EVT VT = N->getValueType(0);
31183 EVT InVT = N->getOperand(0).getValueType();
31184 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
31185 "Expected a VT that divides into 128 bits.");
31186 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31187 "Unexpected type action!");
31188 unsigned NumConcat = 128 / InVT.getSizeInBits();
31189
31190 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
31191 InVT.getVectorElementType(),
31192 NumConcat * InVT.getVectorNumElements());
31193 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
31194 VT.getVectorElementType(),
31195 NumConcat * VT.getVectorNumElements());
31196
31197 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
31198 Ops[0] = N->getOperand(0);
31199 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
31200 Ops[0] = N->getOperand(1);
31201 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
31202
31203 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
31204 Results.push_back(Res);
31205 return;
31206 }
31207 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
31208 case X86ISD::FMINC:
31209 case X86ISD::FMIN:
31210 case X86ISD::FMAXC:
31211 case X86ISD::FMAX: {
31212 EVT VT = N->getValueType(0);
31213 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
31214 SDValue UNDEF = DAG.getUNDEF(VT);
31215 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
31216 N->getOperand(0), UNDEF);
31217 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
31218 N->getOperand(1), UNDEF);
31219 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
31220 return;
31221 }
31222 case ISD::SDIV:
31223 case ISD::UDIV:
31224 case ISD::SREM:
31225 case ISD::UREM: {
31226 EVT VT = N->getValueType(0);
31227 if (VT.isVector()) {
31228 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31229 "Unexpected type action!");
31230 // If this RHS is a constant splat vector we can widen this and let
31231 // division/remainder by constant optimize it.
31232 // TODO: Can we do something for non-splat?
31233 APInt SplatVal;
31234 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
31235 unsigned NumConcats = 128 / VT.getSizeInBits();
31236 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
31237 Ops0[0] = N->getOperand(0);
31238 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
31239 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
31240 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
31241 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
31242 Results.push_back(Res);
31243 }
31244 return;
31245 }
31246
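// The scalar case that reaches here is i128 division/remainder on Win64;
// LowerWin64_i128OP is expected to expand it into the matching i128
// runtime call (with arguments passed indirectly per the Win64 ABI).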
31247 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
31248 Results.push_back(V);
31249 return;
31250 }
31251 case ISD::TRUNCATE: {
31252 MVT VT = N->getSimpleValueType(0);
31253 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
31254 return;
31255
31256 // The generic legalizer will try to widen the input type to the same
31257 // number of elements as the widened result type. But this isn't always
31258 // the best thing so do some custom legalization to avoid some cases.
31259 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
31260 SDValue In = N->getOperand(0);
31261 EVT InVT = In.getValueType();
31262
31263 unsigned InBits = InVT.getSizeInBits();
31264 if (128 % InBits == 0) {
31265 // 128 bit and smaller inputs should avoid the truncate altogether and
31266 // just use a build_vector that will become a shuffle.
31267 // TODO: Widen and use a shuffle directly?
31268 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
31269 EVT EltVT = VT.getVectorElementType();
31270 unsigned WidenNumElts = WidenVT.getVectorNumElements();
31271 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
31272 // Use the original element count so we don't do more scalar opts than
31273 // necessary.
31274 unsigned MinElts = VT.getVectorNumElements();
31275 for (unsigned i=0; i < MinElts; ++i) {
31276 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
31277 DAG.getIntPtrConstant(i, dl));
31278 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
31279 }
31280 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
31281 return;
31282 }
31283 // With AVX512 there are some cases that can use a target specific
31284 // truncate node to go from 256/512 to less than 128 with zeros in the
31285 // upper elements of the 128 bit result.
31286 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
31287 // We can use VTRUNC directly for 256-bit inputs with VLX or for any 512-bit input.
31288 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
31289 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
31290 return;
31291 }
31292 // There's one case we can widen to 512 bits and use VTRUNC.
31293 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
31294 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
31295 DAG.getUNDEF(MVT::v4i64));
31296 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
31297 return;
31298 }
31299 }
31300 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
31301 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
31302 isTypeLegal(MVT::v4i64)) {
31303 // Input needs to be split and the output needs to be widened. Let's use two
31304 // VTRUNCs, and shuffle their results together into the wider type.
31305 SDValue Lo, Hi;
31306 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
31307
31308 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
31309 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
31310 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
31311 { 0, 1, 2, 3, 16, 17, 18, 19,
31312 -1, -1, -1, -1, -1, -1, -1, -1 });
31313 Results.push_back(Res);
31314 return;
31315 }
31316
31317 return;
31318 }
31319 case ISD::ANY_EXTEND:
31320 // Right now, only MVT::v8i8 has Custom action for an illegal type.
31321 // It's intended to custom handle the input type.
31322 assert(N->getValueType(0) == MVT::v8i8 &&
31323 "Do not know how to legalize this Node");
31324 return;
31325 case ISD::SIGN_EXTEND:
31326 case ISD::ZERO_EXTEND: {
31327 EVT VT = N->getValueType(0);
31328 SDValue In = N->getOperand(0);
31329 EVT InVT = In.getValueType();
31330 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
31331 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
31332 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
31333 "Unexpected type action!");
31334 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
31335 // Custom split this so we can extend i8/i16->i32 invec. This is better
31336 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
31337 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
31338 // we allow the sra from the extend to i32 to be shared by the split.
31339 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
31340
31341 // Fill a vector with sign bits for each element.
31342 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
31343 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
31344
31345 // Create an unpackl and unpackh to interleave the sign bits then bitcast
31346 // to v2i64.
31347 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
31348 {0, 4, 1, 5});
31349 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
31350 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
31351 {2, 6, 3, 7});
31352 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
31353
31354 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31355 Results.push_back(Res);
31356 return;
31357 }
31358
31359 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
31360 if (!InVT.is128BitVector()) {
31361 // Not a 128 bit vector, but maybe type legalization will promote
31362 // it to 128 bits.
31363 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
31364 return;
31365 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
31366 if (!InVT.is128BitVector())
31367 return;
31368
31369 // Promote the input to 128 bits. Type legalization will turn this into
31370 // zext_inreg/sext_inreg.
31371 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
31372 }
31373
31374 // Perform custom splitting instead of the two stage extend we would get
31375 // by default.
31376 EVT LoVT, HiVT;
31377 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
31378 assert(isTypeLegal(LoVT) && "Split VT not legal?");
31379
31380 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
31381
31382 // We need to shift the input over by half the number of elements.
31383 unsigned NumElts = InVT.getVectorNumElements();
31384 unsigned HalfNumElts = NumElts / 2;
31385 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
31386 for (unsigned i = 0; i != HalfNumElts; ++i)
31387 ShufMask[i] = i + HalfNumElts;
31388
31389 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
31390 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
31391
31392 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31393 Results.push_back(Res);
31394 }
31395 return;
31396 }
31397 case ISD::FP_TO_SINT:
31398 case ISD::STRICT_FP_TO_SINT:
31399 case ISD::FP_TO_UINT:
31400 case ISD::STRICT_FP_TO_UINT: {
31401 bool IsStrict = N->isStrictFPOpcode();
31402 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
31403 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
31404 EVT VT = N->getValueType(0);
31405 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31406 EVT SrcVT = Src.getValueType();
31407
31408 if (VT.isVector() && Subtarget.hasFP16() &&
31409 SrcVT.getVectorElementType() == MVT::f16) {
31410 EVT EleVT = VT.getVectorElementType();
31411 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
31412
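// Pad sources narrower than v8f16 up to v8f16 first: zero for strict ops
// (so the padding lanes cannot raise FP exceptions), undef otherwise.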
31413 if (SrcVT != MVT::v8f16) {
31414 SDValue Tmp =
31415 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
31416 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
31417 Ops[0] = Src;
31418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
31419 }
31420
31421 SDValue Res, Chain;
31422 if (IsStrict) {
31423 unsigned Opc =
31424 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31425 Res =
31426 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
31427 Chain = Res.getValue(1);
31428 } else {
31429 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31430 Res = DAG.getNode(Opc, dl, ResVT, Src);
31431 }
31432
31433 // TODO: Need to add exception check code for strict FP.
31434 if (EleVT.getSizeInBits() < 16) {
31435 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
31436 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
31437
31438 // Now widen to 128 bits.
31439 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
31440 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
31441 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
31442 ConcatOps[0] = Res;
31443 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
31444 }
31445
31446 Results.push_back(Res);
31447 if (IsStrict)
31448 Results.push_back(Chain);
31449
31450 return;
31451 }
31452
31453 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
31454 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31455 "Unexpected type action!");
31456
31457 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
31458 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
31459 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
31460 VT.getVectorNumElements());
31461 SDValue Res;
31462 SDValue Chain;
31463 if (IsStrict) {
31464 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
31465 {N->getOperand(0), Src});
31466 Chain = Res.getValue(1);
31467 } else
31468 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
31469
31470 // Preserve what we know about the size of the original result. If the
31471 // result is v2i32, we have to manually widen the assert.
31472 if (PromoteVT == MVT::v2i32)
31473 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
31474 DAG.getUNDEF(MVT::v2i32));
31475
31476 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
31477 Res.getValueType(), Res,
31478 DAG.getValueType(VT.getVectorElementType()));
31479
31480 if (PromoteVT == MVT::v2i32)
31481 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
31482 DAG.getIntPtrConstant(0, dl));
31483
31484 // Truncate back to the original width.
31485 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31486
31487 // Now widen to 128 bits.
31488 unsigned NumConcats = 128 / VT.getSizeInBits();
31489 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
31490 VT.getVectorNumElements() * NumConcats);
31491 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
31492 ConcatOps[0] = Res;
31493 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
31494 Results.push_back(Res);
31495 if (IsStrict)
31496 Results.push_back(Chain);
31497 return;
31498 }
31499
31500
31501 if (VT == MVT::v2i32) {
31502 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
31503 "Strict unsigned conversion requires AVX512");
31504 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31505 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31506 "Unexpected type action!");
31507 if (Src.getValueType() == MVT::v2f64) {
31508 if (!IsSigned && !Subtarget.hasAVX512()) {
31509 SDValue Res =
31510 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
31511 Results.push_back(Res);
31512 return;
31513 }
31514
31515 unsigned Opc;
31516 if (IsStrict)
31517 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31518 else
31519 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31520
31521 // If we have VLX we can emit a target specific FP_TO_UINT node.
31522 if (!IsSigned && !Subtarget.hasVLX()) {
31523 // Otherwise we can defer to the generic legalizer which will widen
31524 // the input as well. This will be further widened during op
31525 // legalization to v8i32<-v8f64.
31526 // For strict nodes we'll need to widen ourselves.
31527 // FIXME: Fix the type legalizer to safely widen strict nodes?
31528 if (!IsStrict)
31529 return;
31530 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
31531 DAG.getConstantFP(0.0, dl, MVT::v2f64));
31532 Opc = N->getOpcode();
31533 }
31534 SDValue Res;
31535 SDValue Chain;
31536 if (IsStrict) {
31537 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
31538 {N->getOperand(0), Src});
31539 Chain = Res.getValue(1);
31540 } else {
31541 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
31542 }
31543 Results.push_back(Res);
31544 if (IsStrict)
31545 Results.push_back(Chain);
31546 return;
31547 }
31548
31549 // Custom widen strict v2f32->v2i32 by padding with zeros.
31550 // FIXME: Should generic type legalizer do this?
31551 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
31552 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
31553 DAG.getConstantFP(0.0, dl, MVT::v2f32));
31554 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
31555 {N->getOperand(0), Src});
31556 Results.push_back(Res);
31557 Results.push_back(Res.getValue(1));
31558 return;
31559 }
31560
31561 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
31562 // so early out here.
31563 return;
31564 }
31565
31566 assert(!VT.isVector() && "Vectors should have been handled above!");
31567
31568 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
31569 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
31570 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
31571 assert(!Subtarget.is64Bit() && "i64 should be legal");
31572 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
31573 // If we use a 128-bit result we might need to use a target specific node.
31574 unsigned SrcElts =
31575 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
31576 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
31577 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
31578 unsigned Opc = N->getOpcode();
31579 if (NumElts != SrcElts) {
31580 if (IsStrict)
31581 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31582 else
31583 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31584 }
31585
31586 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
31587 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
31588 DAG.getConstantFP(0.0, dl, VecInVT), Src,
31589 ZeroIdx);
31590 SDValue Chain;
31591 if (IsStrict) {
31592 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
31593 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
31594 Chain = Res.getValue(1);
31595 } else
31596 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
31597 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
31598 Results.push_back(Res);
31599 if (IsStrict)
31600 Results.push_back(Chain);
31601 return;
31602 }
31603
31604 SDValue Chain;
31605 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
31606 Results.push_back(V);
31607 if (IsStrict)
31608 Results.push_back(Chain);
31609 }
31610 return;
31611 }
31612 case ISD::LRINT:
31613 case ISD::LLRINT: {
31614 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
31615 Results.push_back(V);
31616 return;
31617 }
31618
31619 case ISD::SINT_TO_FP:
31620 case ISD::STRICT_SINT_TO_FP:
31621 case ISD::UINT_TO_FP:
31622 case ISD::STRICT_UINT_TO_FP: {
31623 bool IsStrict = N->isStrictFPOpcode();
31624 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
31625 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
31626 EVT VT = N->getValueType(0);
31627 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31628 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
31629 Subtarget.hasVLX()) {
31630 if (Src.getValueType().getVectorElementType() == MVT::i16)
31631 return;
31632
31633 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
31634 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31635 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
31636 : DAG.getUNDEF(MVT::v2i32));
31637 if (IsStrict) {
31638 unsigned Opc =
31639 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
31640 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
31641 {N->getOperand(0), Src});
31642 Results.push_back(Res);
31643 Results.push_back(Res.getValue(1));
31644 } else {
31645 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31646 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
31647 }
31648 return;
31649 }
31650 if (VT != MVT::v2f32)
31651 return;
31652 EVT SrcVT = Src.getValueType();
31653 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
31654 if (IsStrict) {
31655 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
31656 : X86ISD::STRICT_CVTUI2P;
31657 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
31658 {N->getOperand(0), Src});
31659 Results.push_back(Res);
31660 Results.push_back(Res.getValue(1));
31661 } else {
31662 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31663 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
31664 }
31665 return;
31666 }
31667 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
31668 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
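// There is no unsigned v2i64->f32 conversion before AVX512. For inputs
// with the sign bit set, shift right by one while OR-ing the low bit back
// in (to keep rounding correct), convert signed, and double the result
// with an FADD; a select keeps the plain signed conversion for
// non-negative inputs.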
31669 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
31670 SDValue One = DAG.getConstant(1, dl, SrcVT);
31671 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
31672 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
31673 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
31674 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
31675 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
31676 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
31677 for (int i = 0; i != 2; ++i) {
31678 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
31679 SignSrc, DAG.getIntPtrConstant(i, dl));
31680 if (IsStrict)
31681 SignCvts[i] =
31682 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
31683 {N->getOperand(0), Elt});
31684 else
31685 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
31686 };
31687 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
31688 SDValue Slow, Chain;
31689 if (IsStrict) {
31690 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31691 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
31692 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
31693 {Chain, SignCvt, SignCvt});
31694 Chain = Slow.getValue(1);
31695 } else {
31696 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
31697 }
31698 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
31699 IsNeg =
31700 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
31701 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
31702 Results.push_back(Cvt);
31703 if (IsStrict)
31704 Results.push_back(Chain);
31705 return;
31706 }
31707
31708 if (SrcVT != MVT::v2i32)
31709 return;
31710
31711 if (IsSigned || Subtarget.hasAVX512()) {
31712 if (!IsStrict)
31713 return;
31714
31715 // Custom widen strict v2i32->v2f32 to avoid scalarization.
31716 // FIXME: Should generic type legalizer do this?
31717 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31718 DAG.getConstant(0, dl, MVT::v2i32));
31719 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
31720 {N->getOperand(0), Src});
31721 Results.push_back(Res);
31722 Results.push_back(Res.getValue(1));
31723 return;
31724 }
31725
31726 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
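// Classic unsigned i32->double trick: OR the zero-extended value into the
// mantissa of 2^52 (0x4330000000000000), subtract 2^52 to recover the
// integer as a double, then round the v2f64 down to v4f32.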
31727 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
31728 SDValue VBias =
31729 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
31730 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
31731 DAG.getBitcast(MVT::v2i64, VBias));
31732 Or = DAG.getBitcast(MVT::v2f64, Or);
31733 if (IsStrict) {
31734 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
31735 {N->getOperand(0), Or, VBias});
31736 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
31737 {MVT::v4f32, MVT::Other},
31738 {Sub.getValue(1), Sub});
31739 Results.push_back(Res);
31740 Results.push_back(Res.getValue(1));
31741 } else {
31742 // TODO: Are there any fast-math-flags to propagate here?
31743 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
31744 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
31745 }
31746 return;
31747 }
31748 case ISD::STRICT_FP_ROUND:
31749 case ISD::FP_ROUND: {
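// Produce the illegal narrow FP_ROUND result with the target VFPROUND
// node: a v2f32 source is padded to v4f32 for the v2f16 case, and the
// strict form threads the chain through STRICT_VFPROUND.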
31750 bool IsStrict = N->isStrictFPOpcode();
31751 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31752 EVT VT = N->getValueType(0);
31753 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
31754 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
31755 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
31756 : DAG.getUNDEF(MVT::v2f32);
31757 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
31758 }
31759 if (!isTypeLegal(Src.getValueType()))
31760 return;
31761 SDValue V;
31762 if (IsStrict)
31763 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
31764 {N->getOperand(0), Src});
31765 else
31766 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
31767 Results.push_back(V);
31768 if (IsStrict)
31769 Results.push_back(V.getValue(1));
31770 return;
31771 }
31772 case ISD::FP_EXTEND:
31773 case ISD::STRICT_FP_EXTEND: {
31774 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
31775 // No other ValueType for FP_EXTEND should reach this point.
31776 assert(N->getValueType(0) == MVT::v2f32 &&
31777 "Do not know how to legalize this Node");
31778 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
31779 return;
31780 bool IsStrict = N->isStrictFPOpcode();
31781 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31782 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
31783 : DAG.getUNDEF(MVT::v2f16);
31784 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
31785 if (IsStrict)
31786 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
31787 {N->getOperand(0), V});
31788 else
31789 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
31790 Results.push_back(V);
31791 if (IsStrict)
31792 Results.push_back(V.getValue(1));
31793 return;
31794 }
31795 case ISD::INTRINSIC_W_CHAIN: {
31796 unsigned IntNo = N->getConstantOperandVal(1);
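// Each of these intrinsics reads a 64-bit value back in EDX:EAX; the
// helpers below copy the two halves out of those registers and rebuild
// the i64 result (plus the extra ECX value where applicable).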
31797 switch (IntNo) {
31798 default : llvm_unreachable("Do not know how to custom type "
31799 "legalize this intrinsic operation!");
31800 case Intrinsic::x86_rdtsc:
31801 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31802 Results);
31803 case Intrinsic::x86_rdtscp:
31804 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31805 Results);
31806 case Intrinsic::x86_rdpmc:
31807 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31808 Results);
31809 return;
31810 case Intrinsic::x86_xgetbv:
31811 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31812 Results);
31813 return;
31814 }
31815 }
31816 case ISD::READCYCLECOUNTER: {
31817 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31818 }
31819 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31820 EVT T = N->getValueType(0);
31821 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31822 bool Regs64bit = T == MVT::i128;
31823 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31824 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31825 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
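// CMPXCHG8B/CMPXCHG16B take the expected value in EDX:EAX (RDX:RAX) and
// the replacement in ECX:EBX (RCX:RBX), so split both wide operands into
// halves and glue the copies into those physical registers.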
31826 SDValue cpInL, cpInH;
31827 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31828 DAG.getConstant(0, dl, HalfT));
31829 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31830 DAG.getConstant(1, dl, HalfT));
31831 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31832 Regs64bit ? X86::RAX : X86::EAX,
31833 cpInL, SDValue());
31834 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31835 Regs64bit ? X86::RDX : X86::EDX,
31836 cpInH, cpInL.getValue(1));
31837 SDValue swapInL, swapInH;
31838 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31839 DAG.getConstant(0, dl, HalfT));
31840 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31841 DAG.getConstant(1, dl, HalfT));
31842 swapInH =
31843 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31844 swapInH, cpInH.getValue(1));
31845
31846 // In 64-bit mode we might need the base pointer in RBX, but we can't know
31847 // until later. So we keep the RBX input in a vreg and use a custom
31848 // inserter.
31849 // Since RBX will be a reserved register, the register allocator will not
31850 // ensure that its value is properly saved and restored around this
31851 // live-range.
31852 SDValue Result;
31853 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31854 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31855 if (Regs64bit) {
31856 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31857 swapInH.getValue(1)};
31858 Result =
31859 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31860 } else {
31861 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31862 swapInH.getValue(1));
31863 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31864 swapInL.getValue(1)};
31865 Result =
31866 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31867 }
31868
31869 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31870 Regs64bit ? X86::RAX : X86::EAX,
31871 HalfT, Result.getValue(1));
31872 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31873 Regs64bit ? X86::RDX : X86::EDX,
31874 HalfT, cpOutL.getValue(2));
31875 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31876
31877 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31878 MVT::i32, cpOutH.getValue(2));
31879 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31880 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31881
31882 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31883 Results.push_back(Success);
31884 Results.push_back(EFLAGS.getValue(1));
31885 return;
31886 }
31887 case ISD::ATOMIC_LOAD: {
31888 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31889 bool NoImplicitFloatOps =
31890 DAG.getMachineFunction().getFunction().hasFnAttribute(
31891 Attribute::NoImplicitFloat);
31892 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31893 auto *Node = cast<AtomicSDNode>(N);
31894 if (Subtarget.hasSSE1()) {
31895 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31896 // Then extract the lower 64-bits.
31897 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31898 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31899 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31900 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31901 MVT::i64, Node->getMemOperand());
31902 if (Subtarget.hasSSE2()) {
31903 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31904 DAG.getIntPtrConstant(0, dl));
31905 Results.push_back(Res);
31906 Results.push_back(Ld.getValue(1));
31907 return;
31908 }
31909 // We use an alternative sequence for SSE1 that extracts as v2f32 and
31910 // then casts to i64. This avoids a 128-bit stack temporary being
31911 // created by type legalization if we were to cast v4f32->v2i64.
31912 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31913 DAG.getIntPtrConstant(0, dl));
31914 Res = DAG.getBitcast(MVT::i64, Res);
31915 Results.push_back(Res);
31916 Results.push_back(Ld.getValue(1));
31917 return;
31918 }
31919 if (Subtarget.hasX87()) {
31920 // First load this into an 80-bit X87 register. This will put the whole
31921 // integer into the significand.
31922 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31923 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31924 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31925 dl, Tys, Ops, MVT::i64,
31926 Node->getMemOperand());
31927 SDValue Chain = Result.getValue(1);
31928
31929 // Now store the X87 register to a stack temporary and convert to i64.
31930 // This store is not atomic and doesn't need to be.
31931 // FIXME: We don't need a stack temporary if the result of the load
31932 // is already being stored. We could just directly store there.
31933 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31934 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31935 MachinePointerInfo MPI =
31936 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31937 SDValue StoreOps[] = { Chain, Result, StackPtr };
31938 Chain = DAG.getMemIntrinsicNode(
31939 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31940 MPI, None /*Align*/, MachineMemOperand::MOStore);
31941
31942 // Finally load the value back from the stack temporary and return it.
31943 // This load is not atomic and doesn't need to be.
31944 // This load will be further type legalized.
31945 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31946 Results.push_back(Result);
31947 Results.push_back(Result.getValue(1));
31948 return;
31949 }
31950 }
31951 // TODO: Use MOVLPS when SSE1 is available?
31952 // Delegate to generic TypeLegalization. Situations we can really handle
31953 // should have already been dealt with by AtomicExpandPass.cpp.
31954 break;
31955 }
31956 case ISD::ATOMIC_SWAP:
31957 case ISD::ATOMIC_LOAD_ADD:
31958 case ISD::ATOMIC_LOAD_SUB:
31959 case ISD::ATOMIC_LOAD_AND:
31960 case ISD::ATOMIC_LOAD_OR:
31961 case ISD::ATOMIC_LOAD_XOR:
31962 case ISD::ATOMIC_LOAD_NAND:
31963 case ISD::ATOMIC_LOAD_MIN:
31964 case ISD::ATOMIC_LOAD_MAX:
31965 case ISD::ATOMIC_LOAD_UMIN:
31966 case ISD::ATOMIC_LOAD_UMAX:
31967 // Delegate to generic TypeLegalization. Situations we can really handle
31968 // should have already been dealt with by AtomicExpandPass.cpp.
31969 break;
31970
31971 case ISD::BITCAST: {
31972 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31973 EVT DstVT = N->getValueType(0);
31974 EVT SrcVT = N->getOperand(0).getValueType();
31975
31976 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
31977 // we can split using the k-register rather than memory.
31978 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31979 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31980 SDValue Lo, Hi;
31981 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31982 Lo = DAG.getBitcast(MVT::i32, Lo);
31983 Hi = DAG.getBitcast(MVT::i32, Hi);
31984 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
31985 Results.push_back(Res);
31986 return;
31987 }
31988
31989 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
31990 // FIXME: Use v4f32 for SSE1?
31991 assert(Subtarget.hasSSE2() && "Requires SSE2");
31992 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
31993 "Unexpected type action!");
31994 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
31995 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
31996 N->getOperand(0));
31997 Res = DAG.getBitcast(WideVT, Res);
31998 Results.push_back(Res);
31999 return;
32000 }
32001
32002 return;
32003 }
32004 case ISD::MGATHER: {
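// v2f32/v2i32 gathers with a v2i64 index are widened to the 128-bit
// X86ISD::MGATHER form; without VLX the v2i1 mask is also widened and
// sign-extended to v4i32.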
32005 EVT VT = N->getValueType(0);
32006 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32007 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32008 auto *Gather = cast<MaskedGatherSDNode>(N);
32009 SDValue Index = Gather->getIndex();
32010 if (Index.getValueType() != MVT::v2i64)
32011 return;
32012 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32013 "Unexpected type action!");
32014 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32015 SDValue Mask = Gather->getMask();
32016 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32017 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32018 Gather->getPassThru(),
32019 DAG.getUNDEF(VT));
32020 if (!Subtarget.hasVLX()) {
32021 // We need to widen the mask, but the instruction will only use 2
32022 // of its elements. So we can use undef.
32023 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32024 DAG.getUNDEF(MVT::v2i1));
32025 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32026 }
32027 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32028 Gather->getBasePtr(), Index, Gather->getScale() };
32029 SDValue Res = DAG.getMemIntrinsicNode(
32030 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32031 Gather->getMemoryVT(), Gather->getMemOperand());
32032 Results.push_back(Res);
32033 Results.push_back(Res.getValue(1));
32034 return;
32035 }
32036 return;
32037 }
32038 case ISD::LOAD: {
32039 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
32040 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
32041 // cast since type legalization will try to use an i64 load.
32042 MVT VT = N->getSimpleValueType(0);
32043 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32044 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32045 "Unexpected type action!");
32046 if (!ISD::isNON_EXTLoad(N))
32047 return;
32048 auto *Ld = cast<LoadSDNode>(N);
32049 if (Subtarget.hasSSE2()) {
32050 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32051 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32052 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32053 Ld->getMemOperand()->getFlags());
32054 SDValue Chain = Res.getValue(1);
32055 MVT VecVT = MVT::getVectorVT(LdVT, 2);
32056 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32057 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32058 Res = DAG.getBitcast(WideVT, Res);
32059 Results.push_back(Res);
32060 Results.push_back(Chain);
32061 return;
32062 }
32063 assert(Subtarget.hasSSE1() && "Expected SSE");
32064 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32065 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32066 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32067 MVT::i64, Ld->getMemOperand());
32068 Results.push_back(Res);
32069 Results.push_back(Res.getValue(1));
32070 return;
32071 }
32072 case ISD::ADDRSPACECAST: {
32073 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32074 Results.push_back(V);
32075 return;
32076 }
32077 case ISD::BITREVERSE:
32078 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32079 assert(Subtarget.hasXOP() && "Expected XOP");
32080 // We can use VPPERM by copying to a vector register and back. We'll need
32081 // to move the scalar in two i32 pieces.
32082 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32083 return;
32084 }
32085}
32086
32087const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
32088 switch ((X86ISD::NodeType)Opcode) {
32089 case X86ISD::FIRST_NUMBER: break;
32090#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
32091 NODE_NAME_CASE(BSF)
32092 NODE_NAME_CASE(BSR)
32093 NODE_NAME_CASE(FSHL)
32094 NODE_NAME_CASE(FSHR)
32095 NODE_NAME_CASE(FAND)
32096 NODE_NAME_CASE(FANDN)
32097 NODE_NAME_CASE(FOR)
32098 NODE_NAME_CASE(FXOR)
32099 NODE_NAME_CASE(FILD)
32100 NODE_NAME_CASE(FIST)
32101 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
32102 NODE_NAME_CASE(FLD)
32103 NODE_NAME_CASE(FST)
32104 NODE_NAME_CASE(CALL)
32105 NODE_NAME_CASE(CALL_RVMARKER)
32106 NODE_NAME_CASE(BT)
32107 NODE_NAME_CASE(CMP)
32108 NODE_NAME_CASE(FCMP)
32109 NODE_NAME_CASE(STRICT_FCMP)
32110 NODE_NAME_CASE(STRICT_FCMPS)
32111 NODE_NAME_CASE(COMI)
32112 NODE_NAME_CASE(UCOMI)
32113 NODE_NAME_CASE(CMPM)
32114 NODE_NAME_CASE(CMPMM)
32115 NODE_NAME_CASE(STRICT_CMPM)
32116 NODE_NAME_CASE(CMPMM_SAE)
32117 NODE_NAME_CASE(SETCC)
32118 NODE_NAME_CASE(SETCC_CARRY)
32119 NODE_NAME_CASE(FSETCC)
32120 NODE_NAME_CASE(FSETCCM)
32121 NODE_NAME_CASE(FSETCCM_SAE)
32122 NODE_NAME_CASE(CMOV)
32123 NODE_NAME_CASE(BRCOND)
32124 NODE_NAME_CASE(RET_FLAG)
32125 NODE_NAME_CASE(IRET)
32126 NODE_NAME_CASE(REP_STOS)
32127 NODE_NAME_CASE(REP_MOVS)
32128 NODE_NAME_CASE(GlobalBaseReg)
32129 NODE_NAME_CASE(Wrapper)
32130 NODE_NAME_CASE(WrapperRIP)
32131 NODE_NAME_CASE(MOVQ2DQ)
32132 NODE_NAME_CASE(MOVDQ2Q)
32133 NODE_NAME_CASE(MMX_MOVD2W)
32134 NODE_NAME_CASE(MMX_MOVW2D)
32135 NODE_NAME_CASE(PEXTRB)
32136 NODE_NAME_CASE(PEXTRW)
32137 NODE_NAME_CASE(INSERTPS)
32138 NODE_NAME_CASE(PINSRB)
32139 NODE_NAME_CASE(PINSRW)
32140 NODE_NAME_CASE(PSHUFB)
32141 NODE_NAME_CASE(ANDNP)
32142 NODE_NAME_CASE(BLENDI)
32143 NODE_NAME_CASE(BLENDV)
32144 NODE_NAME_CASE(HADD)
32145 NODE_NAME_CASE(HSUB)
32146 NODE_NAME_CASE(FHADD)
32147 NODE_NAME_CASE(FHSUB)
32148 NODE_NAME_CASE(CONFLICT)
32149 NODE_NAME_CASE(FMAX)
32150 NODE_NAME_CASE(FMAXS)
32151 NODE_NAME_CASE(FMAX_SAE)
32152 NODE_NAME_CASE(FMAXS_SAE)
32153 NODE_NAME_CASE(FMIN)
32154 NODE_NAME_CASE(FMINS)
32155 NODE_NAME_CASE(FMIN_SAE)
32156 NODE_NAME_CASE(FMINS_SAE)
32157 NODE_NAME_CASE(FMAXC)
32158 NODE_NAME_CASE(FMINC)
32159 NODE_NAME_CASE(FRSQRT)
32160 NODE_NAME_CASE(FRCP)
32161 NODE_NAME_CASE(EXTRQI)
32162 NODE_NAME_CASE(INSERTQI)
32163 NODE_NAME_CASE(TLSADDR)
32164 NODE_NAME_CASE(TLSBASEADDR)
32165 NODE_NAME_CASE(TLSCALL)
32166 NODE_NAME_CASE(EH_SJLJ_SETJMP)
32167 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
32168 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
32169 NODE_NAME_CASE(EH_RETURN)
32170 NODE_NAME_CASE(TC_RETURN)
32171 NODE_NAME_CASE(FNSTCW16m)
32172 NODE_NAME_CASE(FLDCW16m)
32173 NODE_NAME_CASE(LCMPXCHG_DAG)
32174 NODE_NAME_CASE(LCMPXCHG8_DAG)
32175 NODE_NAME_CASE(LCMPXCHG16_DAG)
32176 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
32177 NODE_NAME_CASE(LADD)
32178 NODE_NAME_CASE(LSUB)
32179 NODE_NAME_CASE(LOR)
32180 NODE_NAME_CASE(LXOR)
32181 NODE_NAME_CASE(LAND)
32182 NODE_NAME_CASE(VZEXT_MOVL)
32183 NODE_NAME_CASE(VZEXT_LOAD)
32184 NODE_NAME_CASE(VEXTRACT_STORE)
32185 NODE_NAME_CASE(VTRUNC)
32186 NODE_NAME_CASE(VTRUNCS)
32187 NODE_NAME_CASE(VTRUNCUS)
32188 NODE_NAME_CASE(VMTRUNC)
32189 NODE_NAME_CASE(VMTRUNCS)
32190 NODE_NAME_CASE(VMTRUNCUS)
32191 NODE_NAME_CASE(VTRUNCSTORES)
32192 NODE_NAME_CASE(VTRUNCSTOREUS)
32193 NODE_NAME_CASE(VMTRUNCSTORES)
32194 NODE_NAME_CASE(VMTRUNCSTOREUS)
32195 NODE_NAME_CASE(VFPEXT)
32196 NODE_NAME_CASE(STRICT_VFPEXT)
32197 NODE_NAME_CASE(VFPEXT_SAE)
32198 NODE_NAME_CASE(VFPEXTS)
32199 NODE_NAME_CASE(VFPEXTS_SAE)
32200 NODE_NAME_CASE(VFPROUND)
32201 NODE_NAME_CASE(STRICT_VFPROUND)
32202 NODE_NAME_CASE(VMFPROUND)
32203 NODE_NAME_CASE(VFPROUND_RND)
32204 NODE_NAME_CASE(VFPROUNDS)
32205 NODE_NAME_CASE(VFPROUNDS_RND)
32206 NODE_NAME_CASE(VSHLDQ)
32207 NODE_NAME_CASE(VSRLDQ)
32208 NODE_NAME_CASE(VSHL)
32209 NODE_NAME_CASE(VSRL)
32210 NODE_NAME_CASE(VSRA)
32211 NODE_NAME_CASE(VSHLI)
32212 NODE_NAME_CASE(VSRLI)
32213 NODE_NAME_CASE(VSRAI)
32214 NODE_NAME_CASE(VSHLV)
32215 NODE_NAME_CASE(VSRLV)
32216 NODE_NAME_CASE(VSRAV)
32217 NODE_NAME_CASE(VROTLI)
32218 NODE_NAME_CASE(VROTRI)
32219 NODE_NAME_CASE(VPPERM)
32220 NODE_NAME_CASE(CMPP)
32221 NODE_NAME_CASE(STRICT_CMPP)
32222 NODE_NAME_CASE(PCMPEQ)
32223 NODE_NAME_CASE(PCMPGT)
32224 NODE_NAME_CASE(PHMINPOS)
32225 NODE_NAME_CASE(ADD)
32226 NODE_NAME_CASE(SUB)
32227 NODE_NAME_CASE(ADC)
32228 NODE_NAME_CASE(SBB)
32229 NODE_NAME_CASE(SMUL)
32230 NODE_NAME_CASE(UMUL)
32231 NODE_NAME_CASE(OR)
32232 NODE_NAME_CASE(XOR)
32233 NODE_NAME_CASE(AND)
32234 NODE_NAME_CASE(BEXTR)
32235 NODE_NAME_CASE(BEXTRI)
32236 NODE_NAME_CASE(BZHI)
32237 NODE_NAME_CASE(PDEP)
32238 NODE_NAME_CASE(PEXT)
32239 NODE_NAME_CASE(MUL_IMM)
32240 NODE_NAME_CASE(MOVMSK)
32241 NODE_NAME_CASE(PTEST)
32242 NODE_NAME_CASE(TESTP)
32243 NODE_NAME_CASE(KORTEST)
32244 NODE_NAME_CASE(KTEST)
32245 NODE_NAME_CASE(KADD)
32246 NODE_NAME_CASE(KSHIFTL)
32247 NODE_NAME_CASE(KSHIFTR)
32248 NODE_NAME_CASE(PACKSS)
32249 NODE_NAME_CASE(PACKUS)
32250 NODE_NAME_CASE(PALIGNR)
32251 NODE_NAME_CASE(VALIGN)
32252 NODE_NAME_CASE(VSHLD)
32253 NODE_NAME_CASE(VSHRD)
32254 NODE_NAME_CASE(VSHLDV)
32255 NODE_NAME_CASE(VSHRDV)
32256 NODE_NAME_CASE(PSHUFD)
32257 NODE_NAME_CASE(PSHUFHW)
32258 NODE_NAME_CASE(PSHUFLW)
32259 NODE_NAME_CASE(SHUFP)
32260 NODE_NAME_CASE(SHUF128)
32261 NODE_NAME_CASE(MOVLHPS)
32262 NODE_NAME_CASE(MOVHLPS)
32263 NODE_NAME_CASE(MOVDDUP)
32264 NODE_NAME_CASE(MOVSHDUP)
32265 NODE_NAME_CASE(MOVSLDUP)
32266 NODE_NAME_CASE(MOVSD)
32267 NODE_NAME_CASE(MOVSS)
32268 NODE_NAME_CASE(MOVSH)
32269 NODE_NAME_CASE(UNPCKL)
32270 NODE_NAME_CASE(UNPCKH)
32271 NODE_NAME_CASE(VBROADCAST)
32272 NODE_NAME_CASE(VBROADCAST_LOAD)
32273 NODE_NAME_CASE(VBROADCASTM)
32274 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
32275 NODE_NAME_CASE(VPERMILPV)
32276 NODE_NAME_CASE(VPERMILPI)
32277 NODE_NAME_CASE(VPERM2X128)
32278 NODE_NAME_CASE(VPERMV)
32279 NODE_NAME_CASE(VPERMV3)
32280 NODE_NAME_CASE(VPERMI)
32281 NODE_NAME_CASE(VPTERNLOG)
32282 NODE_NAME_CASE(VFIXUPIMM)
32283 NODE_NAME_CASE(VFIXUPIMM_SAE)
32284 NODE_NAME_CASE(VFIXUPIMMS)
32285 NODE_NAME_CASE(VFIXUPIMMS_SAE)
32286 NODE_NAME_CASE(VRANGE)
32287 NODE_NAME_CASE(VRANGE_SAE)
32288 NODE_NAME_CASE(VRANGES)
32289 NODE_NAME_CASE(VRANGES_SAE)
32290 NODE_NAME_CASE(PMULUDQ)
32291 NODE_NAME_CASE(PMULDQ)
32292 NODE_NAME_CASE(PSADBW)
32293 NODE_NAME_CASE(DBPSADBW)
32294 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
32295 NODE_NAME_CASE(VAARG_64)
32296 NODE_NAME_CASE(VAARG_X32)
32297 NODE_NAME_CASE(WIN_ALLOCA)
32298 NODE_NAME_CASE(MEMBARRIER)
32299 NODE_NAME_CASE(MFENCE)
32300 NODE_NAME_CASE(SEG_ALLOCA)
32301 NODE_NAME_CASE(PROBED_ALLOCA)
32302 NODE_NAME_CASE(RDRAND)
32303 NODE_NAME_CASE(RDSEED)
32304 NODE_NAME_CASE(RDPKRU)
32305 NODE_NAME_CASE(WRPKRU)
32306 NODE_NAME_CASE(VPMADDUBSW)
32307 NODE_NAME_CASE(VPMADDWD)
32308 NODE_NAME_CASE(VPSHA)
32309 NODE_NAME_CASE(VPSHL)
32310 NODE_NAME_CASE(VPCOM)
32311 NODE_NAME_CASE(VPCOMU)
32312 NODE_NAME_CASE(VPERMIL2)
32313 NODE_NAME_CASE(FMSUB)
32314 NODE_NAME_CASE(STRICT_FMSUB)
32315 NODE_NAME_CASE(FNMADD)
32316 NODE_NAME_CASE(STRICT_FNMADD)
32317 NODE_NAME_CASE(FNMSUB)
32318 NODE_NAME_CASE(STRICT_FNMSUB)
32319 NODE_NAME_CASE(FMADDSUB)
32320 NODE_NAME_CASE(FMSUBADD)
32321 NODE_NAME_CASE(FMADD_RND)
32322 NODE_NAME_CASE(FNMADD_RND)
32323 NODE_NAME_CASE(FMSUB_RND)
32324 NODE_NAME_CASE(FNMSUB_RND)
32325 NODE_NAME_CASE(FMADDSUB_RND)
32326 NODE_NAME_CASE(FMSUBADD_RND)
32327 NODE_NAME_CASE(VFMADDC)
32328 NODE_NAME_CASE(VFMADDC_RND)
32329 NODE_NAME_CASE(VFCMADDC)
32330 NODE_NAME_CASE(VFCMADDC_RND)
32331 NODE_NAME_CASE(VFMULC)
32332 NODE_NAME_CASE(VFMULC_RND)
32333 NODE_NAME_CASE(VFCMULC)
32334 NODE_NAME_CASE(VFCMULC_RND)
32335 NODE_NAME_CASE(VFMULCSH)
32336 NODE_NAME_CASE(VFMULCSH_RND)
32337 NODE_NAME_CASE(VFCMULCSH)
32338 NODE_NAME_CASE(VFCMULCSH_RND)
32339 NODE_NAME_CASE(VFMADDCSH)
32340 NODE_NAME_CASE(VFMADDCSH_RND)
32341 NODE_NAME_CASE(VFCMADDCSH)
32342 NODE_NAME_CASE(VFCMADDCSH_RND)
32343 NODE_NAME_CASE(VPMADD52H)
32344 NODE_NAME_CASE(VPMADD52L)
32345 NODE_NAME_CASE(VRNDSCALE)
32346 NODE_NAME_CASE(STRICT_VRNDSCALE)
32347 NODE_NAME_CASE(VRNDSCALE_SAE)
32348 NODE_NAME_CASE(VRNDSCALES)
32349 NODE_NAME_CASE(VRNDSCALES_SAE)
32350 NODE_NAME_CASE(VREDUCE)
32351 NODE_NAME_CASE(VREDUCE_SAE)
32352 NODE_NAME_CASE(VREDUCES)
32353 NODE_NAME_CASE(VREDUCES_SAE)
32354 NODE_NAME_CASE(VGETMANT)
32355 NODE_NAME_CASE(VGETMANT_SAE)
32356 NODE_NAME_CASE(VGETMANTS)
32357 NODE_NAME_CASE(VGETMANTS_SAE)
32358 NODE_NAME_CASE(PCMPESTR)
32359 NODE_NAME_CASE(PCMPISTR)
32360 NODE_NAME_CASE(XTEST)
32361 NODE_NAME_CASE(COMPRESS)
32362 NODE_NAME_CASE(EXPAND)
32363 NODE_NAME_CASE(SELECTS)
32364 NODE_NAME_CASE(ADDSUB)
32365 NODE_NAME_CASE(RCP14)
32366 NODE_NAME_CASE(RCP14S)
32367 NODE_NAME_CASE(RCP28)
32368 NODE_NAME_CASE(RCP28_SAE)
32369 NODE_NAME_CASE(RCP28S)
32370 NODE_NAME_CASE(RCP28S_SAE)
32371 NODE_NAME_CASE(EXP2)
32372 NODE_NAME_CASE(EXP2_SAE)
32373 NODE_NAME_CASE(RSQRT14)
32374 NODE_NAME_CASE(RSQRT14S)
32375 NODE_NAME_CASE(RSQRT28)
32376 NODE_NAME_CASE(RSQRT28_SAE)
32377 NODE_NAME_CASE(RSQRT28S)
32378 NODE_NAME_CASE(RSQRT28S_SAE)
32379 NODE_NAME_CASE(FADD_RND)
32380 NODE_NAME_CASE(FADDS)
32381 NODE_NAME_CASE(FADDS_RND)
32382 NODE_NAME_CASE(FSUB_RND)
32383 NODE_NAME_CASE(FSUBS)
32384 NODE_NAME_CASE(FSUBS_RND)
32385 NODE_NAME_CASE(FMUL_RND)
32386 NODE_NAME_CASE(FMULS)
32387 NODE_NAME_CASE(FMULS_RND)
32388 NODE_NAME_CASE(FDIV_RND)
32389 NODE_NAME_CASE(FDIVS)
32390 NODE_NAME_CASE(FDIVS_RND)
32391 NODE_NAME_CASE(FSQRT_RND)
32392 NODE_NAME_CASE(FSQRTS)
32393 NODE_NAME_CASE(FSQRTS_RND)
32394 NODE_NAME_CASE(FGETEXP)
32395 NODE_NAME_CASE(FGETEXP_SAE)
32396 NODE_NAME_CASE(FGETEXPS)
32397 NODE_NAME_CASE(FGETEXPS_SAE)
32398 NODE_NAME_CASE(SCALEF)
32399 NODE_NAME_CASE(SCALEF_RND)
32400 NODE_NAME_CASE(SCALEFS)
32401 NODE_NAME_CASE(SCALEFS_RND)
32402 NODE_NAME_CASE(AVG)
32403 NODE_NAME_CASE(MULHRS)
32404 NODE_NAME_CASE(SINT_TO_FP_RND)
32405 NODE_NAME_CASE(UINT_TO_FP_RND)
32406 NODE_NAME_CASE(CVTTP2SI)
32407 NODE_NAME_CASE(CVTTP2UI)
32408 NODE_NAME_CASE(STRICT_CVTTP2SI)
32409 NODE_NAME_CASE(STRICT_CVTTP2UI)
32410 NODE_NAME_CASE(MCVTTP2SI)
32411 NODE_NAME_CASE(MCVTTP2UI)
32412 NODE_NAME_CASE(CVTTP2SI_SAE)
32413 NODE_NAME_CASE(CVTTP2UI_SAE)
32414 NODE_NAME_CASE(CVTTS2SI)
32415 NODE_NAME_CASE(CVTTS2UI)
32416 NODE_NAME_CASE(CVTTS2SI_SAE)
32417 NODE_NAME_CASE(CVTTS2UI_SAE)
32418 NODE_NAME_CASE(CVTSI2P)
32419 NODE_NAME_CASE(CVTUI2P)
32420 NODE_NAME_CASE(STRICT_CVTSI2P)
32421 NODE_NAME_CASE(STRICT_CVTUI2P)
32422 NODE_NAME_CASE(MCVTSI2P)
32423 NODE_NAME_CASE(MCVTUI2P)
32424 NODE_NAME_CASE(VFPCLASS)
32425 NODE_NAME_CASE(VFPCLASSS)
32426 NODE_NAME_CASE(MULTISHIFT)
32427 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
32428 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
32429 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
32430 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
32431 NODE_NAME_CASE(CVTPS2PH)
32432 NODE_NAME_CASE(STRICT_CVTPS2PH)
32433 NODE_NAME_CASE(MCVTPS2PH)
32434 NODE_NAME_CASE(CVTPH2PS)
32435 NODE_NAME_CASE(STRICT_CVTPH2PS)
32436 NODE_NAME_CASE(CVTPH2PS_SAE)
32437 NODE_NAME_CASE(CVTP2SI)
32438 NODE_NAME_CASE(CVTP2UI)
32439 NODE_NAME_CASE(MCVTP2SI)
32440 NODE_NAME_CASE(MCVTP2UI)
32441 NODE_NAME_CASE(CVTP2SI_RND)
32442 NODE_NAME_CASE(CVTP2UI_RND)
32443 NODE_NAME_CASE(CVTS2SI)
32444 NODE_NAME_CASE(CVTS2UI)
32445 NODE_NAME_CASE(CVTS2SI_RND)
32446 NODE_NAME_CASE(CVTS2UI_RND)
32447 NODE_NAME_CASE(CVTNE2PS2BF16)
32448 NODE_NAME_CASE(CVTNEPS2BF16)
32449 NODE_NAME_CASE(MCVTNEPS2BF16)
32450 NODE_NAME_CASE(DPBF16PS)
32451 NODE_NAME_CASE(LWPINS)
32452 NODE_NAME_CASE(MGATHER)
32453 NODE_NAME_CASE(MSCATTER)
32454 NODE_NAME_CASE(VPDPBUSD)
32455 NODE_NAME_CASE(VPDPBUSDS)
32456 NODE_NAME_CASE(VPDPWSSD)
32457 NODE_NAME_CASE(VPDPWSSDS)
32458 NODE_NAME_CASE(VPSHUFBITQMB)
32459 NODE_NAME_CASE(GF2P8MULB)
32460 NODE_NAME_CASE(GF2P8AFFINEQB)
32461 NODE_NAME_CASE(GF2P8AFFINEINVQB)
32462 NODE_NAME_CASE(NT_CALL)
32463 NODE_NAME_CASE(NT_BRIND)
32464 NODE_NAME_CASE(UMWAIT)
32465 NODE_NAME_CASE(TPAUSE)
32466 NODE_NAME_CASE(ENQCMD)
32467 NODE_NAME_CASE(ENQCMDS)
32468 NODE_NAME_CASE(VP2INTERSECT)
32469 NODE_NAME_CASE(AESENC128KL)
32470 NODE_NAME_CASE(AESDEC128KL)
32471 NODE_NAME_CASE(AESENC256KL)
32472 NODE_NAME_CASE(AESDEC256KL)
32473 NODE_NAME_CASE(AESENCWIDE128KL)
32474 NODE_NAME_CASE(AESDECWIDE128KL)
32475 NODE_NAME_CASE(AESENCWIDE256KL)
32476 NODE_NAME_CASE(AESDECWIDE256KL)
32477 NODE_NAME_CASE(TESTUI)
32478 }
32479 return nullptr;
32480#undef NODE_NAME_CASE
32481}
32482
32483/// Return true if the addressing mode represented by AM is legal for this
32484/// target, for a load/store of the specified type.
32485bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
32486 const AddrMode &AM, Type *Ty,
32487 unsigned AS,
32488 Instruction *I) const {
32489 // X86 supports extremely general addressing modes.
32490 CodeModel::Model M = getTargetMachine().getCodeModel();
32491
32492 // X86 allows a sign-extended 32-bit immediate field as a displacement.
32493 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
32494 return false;
32495
32496 if (AM.BaseGV) {
32497 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
32498
32499 // If a reference to this global requires an extra load, we can't fold it.
32500 if (isGlobalStubReference(GVFlags))
32501 return false;
32502
32503 // If BaseGV requires a register for the PIC base, we cannot also have a
32504 // BaseReg specified.
32505 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
32506 return false;
32507
32508 // If lower 4G is not available, then we must use rip-relative addressing.
32509 if ((M != CodeModel::Small || isPositionIndependent()) &&
32510 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
32511 return false;
32512 }
32513
32514 switch (AM.Scale) {
32515 case 0:
32516 case 1:
32517 case 2:
32518 case 4:
32519 case 8:
32520 // These scales always work.
32521 break;
32522 case 3:
32523 case 5:
32524 case 9:
32525 // These scales are formed with basereg+scalereg. Only accept if there is
32526 // no basereg yet.
32527 if (AM.HasBaseReg)
32528 return false;
32529 break;
32530 default: // Other stuff never works.
32531 return false;
32532 }
32533
32534 return true;
32535}
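// Illustrative sketch (not part of X86ISelLowering.cpp): the scale check above
// mirrors the x86 addressing form [BaseReg + IndexReg*Scale + Disp32]. Scales
// 3, 5 and 9 are only reachable as Index + Index*{2,4,8}, which occupies the
// base-register slot. The helper name below is hypothetical.
static bool isLegalX86ScaleSketch(int Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;                 // Directly encodable in a SIB byte.
  case 3: case 5: case 9:
    return !HasBaseReg;          // Needs base+index*{2,4,8}, so no other base.
  default:
    return false;                // Everything else is not encodable.
  }
}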
32536
32537bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
32538 unsigned Bits = Ty->getScalarSizeInBits();
32539
32540 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
32541 // particularly cheaper than those without.
32542 if (Bits == 8)
32543 return false;
32544
32545 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
32546 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
32547 if (Subtarget.hasXOP() &&
32548 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
32549 return false;
32550
32551 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
32552 // shifts just as cheap as scalar ones.
32553 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
32554 return false;
32555
32556 // AVX512BW has shifts such as vpsllvw.
32557 if (Subtarget.hasBWI() && Bits == 16)
32558 return false;
32559
32560 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
32561 // fully general vector.
32562 return true;
32563}
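// Illustrative sketch (not part of the original file): the distinction drawn
// above is between a shift by one scalar amount broadcast to every lane
// (e.g. SSE2 psllw) and a fully per-lane variable shift (e.g. AVX2 vpsllvd).
// Requires <immintrin.h> and an SSE2/AVX2-capable build.
#include <immintrin.h>

static __m128i shift_all_lanes_by_scalar(__m128i v, __m128i count) {
  return _mm_sll_epi16(v, count);      // One count (low 64 bits) for all lanes.
}

static __m256i shift_each_lane(__m256i v, __m256i counts) {
  return _mm256_sllv_epi32(v, counts); // AVX2: independent count per lane.
}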
32564
32565bool X86TargetLowering::isBinOp(unsigned Opcode) const {
32566 switch (Opcode) {
32567 // These are non-commutative binops.
32568 // TODO: Add more X86ISD opcodes once we have test coverage.
32569 case X86ISD::ANDNP:
32570 case X86ISD::PCMPGT:
32571 case X86ISD::FMAX:
32572 case X86ISD::FMIN:
32573 case X86ISD::FANDN:
32574 return true;
32575 }
32576
32577 return TargetLoweringBase::isBinOp(Opcode);
32578}
32579
32580bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
32581 switch (Opcode) {
32582 // TODO: Add more X86ISD opcodes once we have test coverage.
32583 case X86ISD::PCMPEQ:
32584 case X86ISD::PMULDQ:
32585 case X86ISD::PMULUDQ:
32586 case X86ISD::FMAXC:
32587 case X86ISD::FMINC:
32588 case X86ISD::FAND:
32589 case X86ISD::FOR:
32590 case X86ISD::FXOR:
32591 return true;
32592 }
32593
32594 return TargetLoweringBase::isCommutativeBinOp(Opcode);
32595}
32596
32597bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
32598 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
32599 return false;
32600 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
32601 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
32602 return NumBits1 > NumBits2;
32603}
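// For example, truncating i64 to i32 on x86-64 is free: the value is simply
// read through the 32-bit subregister (%eax instead of %rax), so no extra
// instruction is needed.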
32604
32605bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
32606 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
32607 return false;
32608
32609 if (!isTypeLegal(EVT::getEVT(Ty1)))
32610 return false;
32611
 32612 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
32613
32614 // Assuming the caller doesn't have a zeroext or signext return parameter,
32615 // truncation all the way down to i1 is valid.
32616 return true;
32617}
32618
32619bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
32620 return isInt<32>(Imm);
32621}
32622
32623bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
32624 // Can also use sub to handle negated immediates.
32625 return isInt<32>(Imm);
32626}
32627
32628bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
32629 return isInt<32>(Imm);
32630}
32631
32632bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
32633 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
32634 return false;
32635 unsigned NumBits1 = VT1.getSizeInBits();
32636 unsigned NumBits2 = VT2.getSizeInBits();
32637 return NumBits1 > NumBits2;
32638}
32639
32640bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
32641 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32642 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
32643}
32644
32645bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
32646 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32647 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
32648}
32649
32650bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
32651 EVT VT1 = Val.getValueType();
32652 if (isZExtFree(VT1, VT2))
32653 return true;
32654
32655 if (Val.getOpcode() != ISD::LOAD)
32656 return false;
32657
32658 if (!VT1.isSimple() || !VT1.isInteger() ||
32659 !VT2.isSimple() || !VT2.isInteger())
32660 return false;
32661
32662 switch (VT1.getSimpleVT().SimpleTy) {
32663 default: break;
32664 case MVT::i8:
32665 case MVT::i16:
32666 case MVT::i32:
32667 // X86 has 8, 16, and 32-bit zero-extending loads.
32668 return true;
32669 }
32670
32671 return false;
32672}
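// Illustrative sketch (not part of the original file): the "free" zero
// extension above relies on the x86-64 rule that writing a 32-bit register
// implicitly clears bits 63:32, so code like this needs no extra instruction:
static unsigned long long widen_u32(unsigned int X) {
  return X; // Typically a single 32-bit mov; the zero extension is implicit.
}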
32673
32674bool X86TargetLowering::shouldSinkOperands(Instruction *I,
32675 SmallVectorImpl<Use *> &Ops) const {
32676 using namespace llvm::PatternMatch;
32677
32678 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
32679 if (!VTy)
32680 return false;
32681
32682 if (I->getOpcode() == Instruction::Mul &&
32683 VTy->getElementType()->isIntegerTy(64)) {
32684 for (auto &Op : I->operands()) {
32685 // Make sure we are not already sinking this operand
32686 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
32687 continue;
32688
32689 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
32690 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
32691 if (Subtarget.hasSSE41() &&
32692 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
32693 m_SpecificInt(32)))) {
32694 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
32695 Ops.push_back(&Op);
32696 } else if (Subtarget.hasSSE2() &&
32697 match(Op.get(),
 32698 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
32699 Ops.push_back(&Op);
32700 }
32701 }
32702
32703 return !Ops.empty();
32704 }
32705
32706 // A uniform shift amount in a vector shift or funnel shift may be much
32707 // cheaper than a generic variable vector shift, so make that pattern visible
32708 // to SDAG by sinking the shuffle instruction next to the shift.
32709 int ShiftAmountOpNum = -1;
32710 if (I->isShift())
32711 ShiftAmountOpNum = 1;
32712 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
32713 if (II->getIntrinsicID() == Intrinsic::fshl ||
32714 II->getIntrinsicID() == Intrinsic::fshr)
32715 ShiftAmountOpNum = 2;
32716 }
32717
32718 if (ShiftAmountOpNum == -1)
32719 return false;
32720
32721 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
32722 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
32723 isVectorShiftByScalarCheap(I->getType())) {
32724 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
32725 return true;
32726 }
32727
32728 return false;
32729}
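// Illustrative note (not part of the original file): the sext_inreg pattern
// matched above corresponds to IR of roughly this shape, where the shl/ashr
// pair re-extends the low 32 bits of each i64 lane before the multiply:
//   %s = shl <2 x i64> %x, <i64 32, i64 32>
//   %a = ashr <2 x i64> %s, <i64 32, i64 32>
//   %m = mul <2 x i64> %a, %b          ; becomes PMULDQ
// and the zext_inreg form masks instead:
//   %z = and <2 x i64> %x, <i64 4294967295, i64 4294967295>
//   %m = mul <2 x i64> %z, %b          ; becomes PMULUDQ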
32730
32731bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
32732 if (!Subtarget.is64Bit())
32733 return false;
32734 return TargetLowering::shouldConvertPhiType(From, To);
32735}
32736
32737bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
32738 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
32739 return false;
32740
32741 EVT SrcVT = ExtVal.getOperand(0).getValueType();
32742
32743 // There is no extending load for vXi1.
32744 if (SrcVT.getScalarType() == MVT::i1)
32745 return false;
32746
32747 return true;
32748}
32749
32750bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
32751 EVT VT) const {
32752 if (!Subtarget.hasAnyFMA())
32753 return false;
32754
32755 VT = VT.getScalarType();
32756
32757 if (!VT.isSimple())
32758 return false;
32759
32760 switch (VT.getSimpleVT().SimpleTy) {
32761 case MVT::f16:
32762 return Subtarget.hasFP16();
32763 case MVT::f32:
32764 case MVT::f64:
32765 return true;
32766 default:
32767 break;
32768 }
32769
32770 return false;
32771}
32772
32773bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
32774 // i16 instructions are longer (0x66 prefix) and potentially slower.
32775 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
32776}
32777
32778/// Targets can use this to indicate that they only support *some*
32779/// VECTOR_SHUFFLE operations, those with specific masks.
32780/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
32781/// are assumed to be legal.
32782bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
32783 if (!VT.isSimple())
32784 return false;
32785
32786 // Not for i1 vectors
32787 if (VT.getSimpleVT().getScalarType() == MVT::i1)
32788 return false;
32789
32790 // Very little shuffling can be done for 64-bit vectors right now.
32791 if (VT.getSimpleVT().getSizeInBits() == 64)
32792 return false;
32793
32794 // We only care that the types being shuffled are legal. The lowering can
32795 // handle any possible shuffle mask that results.
32796 return isTypeLegal(VT.getSimpleVT());
32797}
32798
32799bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
32800 EVT VT) const {
32801 // Don't convert an 'and' into a shuffle that we don't directly support.
32802 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
32803 if (!Subtarget.hasAVX2())
32804 if (VT == MVT::v32i8 || VT == MVT::v16i16)
32805 return false;
32806
32807 // Just delegate to the generic legality, clear masks aren't special.
32808 return isShuffleMaskLegal(Mask, VT);
32809}
32810
32811bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
 32812 // If the subtarget is using thunks, we must not generate jump tables.
32813 if (Subtarget.useIndirectThunkBranches())
32814 return false;
32815
32816 // Otherwise, fallback on the generic logic.
32817 return TargetLowering::areJTsAllowed(Fn);
32818}
32819
32820//===----------------------------------------------------------------------===//
32821// X86 Scheduler Hooks
32822//===----------------------------------------------------------------------===//
32823
 32824 // Returns true if EFLAGS is consumed after this iterator in the rest of the
32825// basic block or any successors of the basic block.
32826static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
32827 MachineBasicBlock *BB) {
32828 // Scan forward through BB for a use/def of EFLAGS.
32829 for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
32830 miI != miE; ++miI) {
32831 const MachineInstr& mi = *miI;
32832 if (mi.readsRegister(X86::EFLAGS))
32833 return true;
32834 // If we found a def, we can stop searching.
32835 if (mi.definesRegister(X86::EFLAGS))
32836 return false;
32837 }
32838
32839 // If we hit the end of the block, check whether EFLAGS is live into a
32840 // successor.
32841 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
32842 sEnd = BB->succ_end();
32843 sItr != sEnd; ++sItr) {
32844 MachineBasicBlock* succ = *sItr;
32845 if (succ->isLiveIn(X86::EFLAGS))
32846 return true;
32847 }
32848
32849 return false;
32850}
32851
32852/// Utility function to emit xbegin specifying the start of an RTM region.
32853static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32854 const TargetInstrInfo *TII) {
32855 const DebugLoc &DL = MI.getDebugLoc();
32856
32857 const BasicBlock *BB = MBB->getBasicBlock();
32858 MachineFunction::iterator I = ++MBB->getIterator();
32859
32860 // For the v = xbegin(), we generate
32861 //
32862 // thisMBB:
32863 // xbegin sinkMBB
32864 //
32865 // mainMBB:
32866 // s0 = -1
32867 //
32868 // fallBB:
32869 // eax = # XABORT_DEF
32870 // s1 = eax
32871 //
32872 // sinkMBB:
32873 // v = phi(s0/mainBB, s1/fallBB)
32874
32875 MachineBasicBlock *thisMBB = MBB;
32876 MachineFunction *MF = MBB->getParent();
32877 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32878 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32879 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32880 MF->insert(I, mainMBB);
32881 MF->insert(I, fallMBB);
32882 MF->insert(I, sinkMBB);
32883
32884 if (isEFLAGSLiveAfter(MI, MBB)) {
32885 mainMBB->addLiveIn(X86::EFLAGS);
32886 fallMBB->addLiveIn(X86::EFLAGS);
32887 sinkMBB->addLiveIn(X86::EFLAGS);
32888 }
32889
32890 // Transfer the remainder of BB and its successor edges to sinkMBB.
32891 sinkMBB->splice(sinkMBB->begin(), MBB,
32892 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32893 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32894
32895 MachineRegisterInfo &MRI = MF->getRegInfo();
32896 Register DstReg = MI.getOperand(0).getReg();
32897 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32898 Register mainDstReg = MRI.createVirtualRegister(RC);
32899 Register fallDstReg = MRI.createVirtualRegister(RC);
32900
32901 // thisMBB:
32902 // xbegin fallMBB
32903 // # fallthrough to mainMBB
 32904 // # abort branches to fallMBB
32905 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32906 thisMBB->addSuccessor(mainMBB);
32907 thisMBB->addSuccessor(fallMBB);
32908
32909 // mainMBB:
32910 // mainDstReg := -1
32911 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32912 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32913 mainMBB->addSuccessor(sinkMBB);
32914
32915 // fallMBB:
32916 // ; pseudo instruction to model hardware's definition from XABORT
32917 // EAX := XABORT_DEF
32918 // fallDstReg := EAX
32919 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32920 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32921 .addReg(X86::EAX);
32922 fallMBB->addSuccessor(sinkMBB);
32923
32924 // sinkMBB:
32925 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32926 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32927 .addReg(mainDstReg).addMBB(mainMBB)
32928 .addReg(fallDstReg).addMBB(fallMBB);
32929
32930 MI.eraseFromParent();
32931 return sinkMBB;
32932}
32933
32934MachineBasicBlock *
32935X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32936 MachineBasicBlock *MBB) const {
32937 // Emit va_arg instruction on X86-64.
32938
32939 // Operands to this pseudo-instruction:
32940 // 0 ) Output : destination address (reg)
32941 // 1-5) Input : va_list address (addr, i64mem)
32942 // 6 ) ArgSize : Size (in bytes) of vararg type
32943 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32944 // 8 ) Align : Alignment of type
32945 // 9 ) EFLAGS (implicit-def)
32946
 32947 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32948 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32949
32950 Register DestReg = MI.getOperand(0).getReg();
32951 MachineOperand &Base = MI.getOperand(1);
32952 MachineOperand &Scale = MI.getOperand(2);
32953 MachineOperand &Index = MI.getOperand(3);
32954 MachineOperand &Disp = MI.getOperand(4);
32955 MachineOperand &Segment = MI.getOperand(5);
32956 unsigned ArgSize = MI.getOperand(6).getImm();
32957 unsigned ArgMode = MI.getOperand(7).getImm();
32958 Align Alignment = Align(MI.getOperand(8).getImm());
32959
32960 MachineFunction *MF = MBB->getParent();
32961
32962 // Memory Reference
 32963 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32964
32965 MachineMemOperand *OldMMO = MI.memoperands().front();
32966
32967 // Clone the MMO into two separate MMOs for loading and storing
32968 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32969 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32970 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32971 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32972
32973 // Machine Information
32974 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32975 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32976 const TargetRegisterClass *AddrRegClass =
32977 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32978 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32979 const DebugLoc &DL = MI.getDebugLoc();
32980
32981 // struct va_list {
32982 // i32 gp_offset
32983 // i32 fp_offset
32984 // i64 overflow_area (address)
32985 // i64 reg_save_area (address)
32986 // }
32987 // sizeof(va_list) = 24
32988 // alignment(va_list) = 8
32989
32990 unsigned TotalNumIntRegs = 6;
32991 unsigned TotalNumXMMRegs = 8;
32992 bool UseGPOffset = (ArgMode == 1);
32993 bool UseFPOffset = (ArgMode == 2);
32994 unsigned MaxOffset = TotalNumIntRegs * 8 +
32995 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
32996
32997 /* Align ArgSize to a multiple of 8 */
32998 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
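// For example, ArgSize = 12 rounds up to ArgSizeA8 = 16, while ArgSize = 8
// stays 8.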
32999 bool NeedsAlign = (Alignment > 8);
33000
33001 MachineBasicBlock *thisMBB = MBB;
33002 MachineBasicBlock *overflowMBB;
33003 MachineBasicBlock *offsetMBB;
33004 MachineBasicBlock *endMBB;
33005
33006 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
33007 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
33008 unsigned OffsetReg = 0;
33009
33010 if (!UseGPOffset && !UseFPOffset) {
33011 // If we only pull from the overflow region, we don't create a branch.
33012 // We don't need to alter control flow.
33013 OffsetDestReg = 0; // unused
33014 OverflowDestReg = DestReg;
33015
33016 offsetMBB = nullptr;
33017 overflowMBB = thisMBB;
33018 endMBB = thisMBB;
33019 } else {
33020 // First emit code to check if gp_offset (or fp_offset) is below the bound.
33021 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33022 // If not, pull from overflow_area. (branch to overflowMBB)
33023 //
33024 // thisMBB
33025 // | .
33026 // | .
33027 // offsetMBB overflowMBB
33028 // | .
33029 // | .
33030 // endMBB
33031
33032 // Registers for the PHI in endMBB
33033 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33034 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33035
33036 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33037 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33038 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33039 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33040
33041 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33042
33043 // Insert the new basic blocks
33044 MF->insert(MBBIter, offsetMBB);
33045 MF->insert(MBBIter, overflowMBB);
33046 MF->insert(MBBIter, endMBB);
33047
33048 // Transfer the remainder of MBB and its successor edges to endMBB.
33049 endMBB->splice(endMBB->begin(), thisMBB,
33050 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
33051 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
33052
33053 // Make offsetMBB and overflowMBB successors of thisMBB
33054 thisMBB->addSuccessor(offsetMBB);
33055 thisMBB->addSuccessor(overflowMBB);
33056
33057 // endMBB is a successor of both offsetMBB and overflowMBB
33058 offsetMBB->addSuccessor(endMBB);
33059 overflowMBB->addSuccessor(endMBB);
33060
33061 // Load the offset value into a register
33062 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33063 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
33064 .add(Base)
33065 .add(Scale)
33066 .add(Index)
33067 .addDisp(Disp, UseFPOffset ? 4 : 0)
33068 .add(Segment)
33069 .setMemRefs(LoadOnlyMMO);
33070
33071 // Check if there is enough room left to pull this argument.
33072 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
33073 .addReg(OffsetReg)
33074 .addImm(MaxOffset + 8 - ArgSizeA8);
33075
33076 // Branch to "overflowMBB" if offset >= max
33077 // Fall through to "offsetMBB" otherwise
33078 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
33079 .addMBB(overflowMBB).addImm(X86::COND_AE);
33080 }
33081
33082 // In offsetMBB, emit code to use the reg_save_area.
33083 if (offsetMBB) {
 33084 assert(OffsetReg != 0);
33085
33086 // Read the reg_save_area address.
33087 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
33088 BuildMI(
33089 offsetMBB, DL,
33090 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33091 RegSaveReg)
33092 .add(Base)
33093 .add(Scale)
33094 .add(Index)
33095 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
33096 .add(Segment)
33097 .setMemRefs(LoadOnlyMMO);
33098
33099 if (Subtarget.isTarget64BitLP64()) {
33100 // Zero-extend the offset
33101 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
33102 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
33103 .addImm(0)
33104 .addReg(OffsetReg)
33105 .addImm(X86::sub_32bit);
33106
33107 // Add the offset to the reg_save_area to get the final address.
33108 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
33109 .addReg(OffsetReg64)
33110 .addReg(RegSaveReg);
33111 } else {
33112 // Add the offset to the reg_save_area to get the final address.
33113 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
33114 .addReg(OffsetReg)
33115 .addReg(RegSaveReg);
33116 }
33117
33118 // Compute the offset for the next argument
33119 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33120 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
33121 .addReg(OffsetReg)
33122 .addImm(UseFPOffset ? 16 : 8);
33123
33124 // Store it back into the va_list.
33125 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
33126 .add(Base)
33127 .add(Scale)
33128 .add(Index)
33129 .addDisp(Disp, UseFPOffset ? 4 : 0)
33130 .add(Segment)
33131 .addReg(NextOffsetReg)
33132 .setMemRefs(StoreOnlyMMO);
33133
33134 // Jump to endMBB
33135 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
33136 .addMBB(endMBB);
33137 }
33138
33139 //
33140 // Emit code to use overflow area
33141 //
33142
33143 // Load the overflow_area address into a register.
33144 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
33145 BuildMI(overflowMBB, DL,
33146 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33147 OverflowAddrReg)
33148 .add(Base)
33149 .add(Scale)
33150 .add(Index)
33151 .addDisp(Disp, 8)
33152 .add(Segment)
33153 .setMemRefs(LoadOnlyMMO);
33154
33155 // If we need to align it, do so. Otherwise, just copy the address
33156 // to OverflowDestReg.
33157 if (NeedsAlign) {
33158 // Align the overflow address
33159 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
33160
33161 // aligned_addr = (addr + (align-1)) & ~(align-1)
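// e.g. with a 16-byte alignment, addr = 0x1008 becomes
// (0x1008 + 15) & ~15 = 0x1010.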
33162 BuildMI(
33163 overflowMBB, DL,
33164 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
33165 TmpReg)
33166 .addReg(OverflowAddrReg)
33167 .addImm(Alignment.value() - 1);
33168
33169 BuildMI(
33170 overflowMBB, DL,
33171 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
33172 OverflowDestReg)
33173 .addReg(TmpReg)
33174 .addImm(~(uint64_t)(Alignment.value() - 1));
33175 } else {
33176 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
33177 .addReg(OverflowAddrReg);
33178 }
33179
33180 // Compute the next overflow address after this argument.
33181 // (the overflow address should be kept 8-byte aligned)
33182 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
33183 BuildMI(
33184 overflowMBB, DL,
33185 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
33186 NextAddrReg)
33187 .addReg(OverflowDestReg)
33188 .addImm(ArgSizeA8);
33189
33190 // Store the new overflow address.
33191 BuildMI(overflowMBB, DL,
33192 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
33193 .add(Base)
33194 .add(Scale)
33195 .add(Index)
33196 .addDisp(Disp, 8)
33197 .add(Segment)
33198 .addReg(NextAddrReg)
33199 .setMemRefs(StoreOnlyMMO);
33200
33201 // If we branched, emit the PHI to the front of endMBB.
33202 if (offsetMBB) {
33203 BuildMI(*endMBB, endMBB->begin(), DL,
33204 TII->get(X86::PHI), DestReg)
33205 .addReg(OffsetDestReg).addMBB(offsetMBB)
33206 .addReg(OverflowDestReg).addMBB(overflowMBB);
33207 }
33208
33209 // Erase the pseudo instruction
33210 MI.eraseFromParent();
33211
33212 return endMBB;
33213}
33214
33215// The EFLAGS operand of SelectItr might be missing a kill marker
33216// because there were multiple uses of EFLAGS, and ISel didn't know
33217// which to mark. Figure out whether SelectItr should have had a
33218// kill marker, and set it if it should. Returns the correct kill
33219// marker value.
33220static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
33221 MachineBasicBlock* BB,
33222 const TargetRegisterInfo* TRI) {
33223 if (isEFLAGSLiveAfter(SelectItr, BB))
33224 return false;
33225
33226 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
33227 // out. SelectMI should have a kill flag on EFLAGS.
33228 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
33229 return true;
33230}
33231
33232// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
33233// together with other CMOV pseudo-opcodes into a single basic-block with
33234// conditional jump around it.
33235static bool isCMOVPseudo(MachineInstr &MI) {
33236 switch (MI.getOpcode()) {
33237 case X86::CMOV_FR16X:
33238 case X86::CMOV_FR32:
33239 case X86::CMOV_FR32X:
33240 case X86::CMOV_FR64:
33241 case X86::CMOV_FR64X:
33242 case X86::CMOV_GR8:
33243 case X86::CMOV_GR16:
33244 case X86::CMOV_GR32:
33245 case X86::CMOV_RFP32:
33246 case X86::CMOV_RFP64:
33247 case X86::CMOV_RFP80:
33248 case X86::CMOV_VR64:
33249 case X86::CMOV_VR128:
33250 case X86::CMOV_VR128X:
33251 case X86::CMOV_VR256:
33252 case X86::CMOV_VR256X:
33253 case X86::CMOV_VR512:
33254 case X86::CMOV_VK1:
33255 case X86::CMOV_VK2:
33256 case X86::CMOV_VK4:
33257 case X86::CMOV_VK8:
33258 case X86::CMOV_VK16:
33259 case X86::CMOV_VK32:
33260 case X86::CMOV_VK64:
33261 return true;
33262
33263 default:
33264 return false;
33265 }
33266}
33267
 33268 // Helper function that inserts PHI nodes into SinkMBB:
 33269 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
 33270 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
 33271 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
 33272 // the last PHI node inserted.
33273static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
33274 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
33275 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
33276 MachineBasicBlock *SinkMBB) {
33277 MachineFunction *MF = TrueMBB->getParent();
33278 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
33279 const DebugLoc &DL = MIItBegin->getDebugLoc();
33280
33281 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
33282 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
33283
33284 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
33285
33286 // As we are creating the PHIs, we have to be careful if there is more than
33287 // one. Later CMOVs may reference the results of earlier CMOVs, but later
33288 // PHIs have to reference the individual true/false inputs from earlier PHIs.
33289 // That also means that PHI construction must work forward from earlier to
 33290 // later, and that the code must maintain a mapping from each earlier PHI's
 33291 // destination register to the registers that went into that PHI.
33292 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
33293 MachineInstrBuilder MIB;
33294
33295 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
33296 Register DestReg = MIIt->getOperand(0).getReg();
33297 Register Op1Reg = MIIt->getOperand(1).getReg();
33298 Register Op2Reg = MIIt->getOperand(2).getReg();
33299
33300 // If this CMOV we are generating is the opposite condition from
33301 // the jump we generated, then we have to swap the operands for the
33302 // PHI that is going to be generated.
33303 if (MIIt->getOperand(3).getImm() == OppCC)
33304 std::swap(Op1Reg, Op2Reg);
33305
33306 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
33307 Op1Reg = RegRewriteTable[Op1Reg].first;
33308
33309 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
33310 Op2Reg = RegRewriteTable[Op2Reg].second;
33311
33312 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
33313 .addReg(Op1Reg)
33314 .addMBB(FalseMBB)
33315 .addReg(Op2Reg)
33316 .addMBB(TrueMBB);
33317
33318 // Add this PHI to the rewrite table.
33319 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
33320 }
33321
33322 return MIB;
33323}
33324
 33325 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
33326MachineBasicBlock *
33327X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
33328 MachineInstr &SecondCascadedCMOV,
33329 MachineBasicBlock *ThisMBB) const {
33330 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33331 const DebugLoc &DL = FirstCMOV.getDebugLoc();
33332
33333 // We lower cascaded CMOVs such as
33334 //
33335 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
33336 //
33337 // to two successive branches.
33338 //
33339 // Without this, we would add a PHI between the two jumps, which ends up
33340 // creating a few copies all around. For instance, for
33341 //
33342 // (sitofp (zext (fcmp une)))
33343 //
33344 // we would generate:
33345 //
33346 // ucomiss %xmm1, %xmm0
33347 // movss <1.0f>, %xmm0
33348 // movaps %xmm0, %xmm1
33349 // jne .LBB5_2
33350 // xorps %xmm1, %xmm1
33351 // .LBB5_2:
33352 // jp .LBB5_4
33353 // movaps %xmm1, %xmm0
33354 // .LBB5_4:
33355 // retq
33356 //
33357 // because this custom-inserter would have generated:
33358 //
33359 // A
33360 // | \
33361 // | B
33362 // | /
33363 // C
33364 // | \
33365 // | D
33366 // | /
33367 // E
33368 //
33369 // A: X = ...; Y = ...
33370 // B: empty
33371 // C: Z = PHI [X, A], [Y, B]
33372 // D: empty
33373 // E: PHI [X, C], [Z, D]
33374 //
33375 // If we lower both CMOVs in a single step, we can instead generate:
33376 //
33377 // A
33378 // | \
33379 // | C
33380 // | /|
33381 // |/ |
33382 // | |
33383 // | D
33384 // | /
33385 // E
33386 //
33387 // A: X = ...; Y = ...
33388 // D: empty
33389 // E: PHI [X, A], [X, C], [Y, D]
33390 //
33391 // Which, in our sitofp/fcmp example, gives us something like:
33392 //
33393 // ucomiss %xmm1, %xmm0
33394 // movss <1.0f>, %xmm0
33395 // jne .LBB5_4
33396 // jp .LBB5_4
33397 // xorps %xmm0, %xmm0
33398 // .LBB5_4:
33399 // retq
33400 //
33401
33402 // We lower cascaded CMOV into two successive branches to the same block.
33403 // EFLAGS is used by both, so mark it as live in the second.
33404 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
33405 MachineFunction *F = ThisMBB->getParent();
33406 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
33407 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
33408 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
33409
33410 MachineFunction::iterator It = ++ThisMBB->getIterator();
33411 F->insert(It, FirstInsertedMBB);
33412 F->insert(It, SecondInsertedMBB);
33413 F->insert(It, SinkMBB);
33414
33415 // For a cascaded CMOV, we lower it to two successive branches to
33416 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
33417 // the FirstInsertedMBB.
33418 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
33419
33420 // If the EFLAGS register isn't dead in the terminator, then claim that it's
33421 // live into the sink and copy blocks.
33422 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33423 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
33424 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
33425 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
33426 SinkMBB->addLiveIn(X86::EFLAGS);
33427 }
33428
33429 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
33430 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
33431 std::next(MachineBasicBlock::iterator(FirstCMOV)),
33432 ThisMBB->end());
33433 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
33434
33435 // Fallthrough block for ThisMBB.
33436 ThisMBB->addSuccessor(FirstInsertedMBB);
33437 // The true block target of the first branch is always SinkMBB.
33438 ThisMBB->addSuccessor(SinkMBB);
33439 // Fallthrough block for FirstInsertedMBB.
33440 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
33441 // The true block for the branch of FirstInsertedMBB.
33442 FirstInsertedMBB->addSuccessor(SinkMBB);
33443 // This is fallthrough.
33444 SecondInsertedMBB->addSuccessor(SinkMBB);
33445
33446 // Create the conditional branch instructions.
33447 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
33448 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
33449
33450 X86::CondCode SecondCC =
33451 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
33452 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
33453
33454 // SinkMBB:
33455 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
33456 Register DestReg = FirstCMOV.getOperand(0).getReg();
33457 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
33458 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
33459 MachineInstrBuilder MIB =
33460 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
33461 .addReg(Op1Reg)
33462 .addMBB(SecondInsertedMBB)
33463 .addReg(Op2Reg)
33464 .addMBB(ThisMBB);
33465
 33466 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
 33467 // (the True operand of the SELECT_CC/CMOV nodes).
33468 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
33469 // Copy the PHI result to the register defined by the second CMOV.
33470 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
33471 TII->get(TargetOpcode::COPY),
33472 SecondCascadedCMOV.getOperand(0).getReg())
33473 .addReg(FirstCMOV.getOperand(0).getReg());
33474
33475 // Now remove the CMOVs.
33476 FirstCMOV.eraseFromParent();
33477 SecondCascadedCMOV.eraseFromParent();
33478
33479 return SinkMBB;
33480}
33481
33482MachineBasicBlock *
33483X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
33484 MachineBasicBlock *ThisMBB) const {
33485 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33486 const DebugLoc &DL = MI.getDebugLoc();
33487
33488 // To "insert" a SELECT_CC instruction, we actually have to insert the
33489 // diamond control-flow pattern. The incoming instruction knows the
33490 // destination vreg to set, the condition code register to branch on, the
33491 // true/false values to select between and a branch opcode to use.
33492
33493 // ThisMBB:
33494 // ...
33495 // TrueVal = ...
33496 // cmpTY ccX, r1, r2
33497 // bCC copy1MBB
33498 // fallthrough --> FalseMBB
33499
33500 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
33501 // as described above, by inserting a BB, and then making a PHI at the join
33502 // point to select the true and false operands of the CMOV in the PHI.
33503 //
33504 // The code also handles two different cases of multiple CMOV opcodes
33505 // in a row.
33506 //
33507 // Case 1:
33508 // In this case, there are multiple CMOVs in a row, all which are based on
33509 // the same condition setting (or the exact opposite condition setting).
33510 // In this case we can lower all the CMOVs using a single inserted BB, and
33511 // then make a number of PHIs at the join point to model the CMOVs. The only
 33512 // trickiness here is that in a case like:
33513 //
33514 // t2 = CMOV cond1 t1, f1
33515 // t3 = CMOV cond1 t2, f2
33516 //
33517 // when rewriting this into PHIs, we have to perform some renaming on the
33518 // temps since you cannot have a PHI operand refer to a PHI result earlier
33519 // in the same block. The "simple" but wrong lowering would be:
33520 //
33521 // t2 = PHI t1(BB1), f1(BB2)
33522 // t3 = PHI t2(BB1), f2(BB2)
33523 //
33524 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
33525 // renaming is to note that on the path through BB1, t2 is really just a
33526 // copy of t1, and do that renaming, properly generating:
33527 //
33528 // t2 = PHI t1(BB1), f1(BB2)
33529 // t3 = PHI t1(BB1), f2(BB2)
33530 //
33531 // Case 2:
33532 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
33533 // function - EmitLoweredCascadedSelect.
33534
33535 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
33536 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
33537 MachineInstr *LastCMOV = &MI;
33538 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
33539
33540 // Check for case 1, where there are multiple CMOVs with the same condition
33541 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
33542 // number of jumps the most.
33543
33544 if (isCMOVPseudo(MI)) {
33545 // See if we have a string of CMOVS with the same condition. Skip over
33546 // intervening debug insts.
33547 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
33548 (NextMIIt->getOperand(3).getImm() == CC ||
33549 NextMIIt->getOperand(3).getImm() == OppCC)) {
33550 LastCMOV = &*NextMIIt;
33551 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
33552 }
33553 }
33554
 33555 // Now check for case 2, but only if we didn't already find case 1, as
 33556 // indicated by LastCMOV == MI.
33557 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
33558 NextMIIt->getOpcode() == MI.getOpcode() &&
33559 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
33560 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
33561 NextMIIt->getOperand(1).isKill()) {
33562 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
33563 }
33564
33565 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
33566 MachineFunction *F = ThisMBB->getParent();
33567 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
33568 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
33569
33570 MachineFunction::iterator It = ++ThisMBB->getIterator();
33571 F->insert(It, FalseMBB);
33572 F->insert(It, SinkMBB);
33573
33574 // If the EFLAGS register isn't dead in the terminator, then claim that it's
33575 // live into the sink and copy blocks.
33576 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33577 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
33578 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
33579 FalseMBB->addLiveIn(X86::EFLAGS);
33580 SinkMBB->addLiveIn(X86::EFLAGS);
33581 }
33582
33583 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
33584 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
33585 auto DbgIt = MachineBasicBlock::iterator(MI);
33586 while (DbgIt != DbgEnd) {
33587 auto Next = std::next(DbgIt);
33588 if (DbgIt->isDebugInstr())
33589 SinkMBB->push_back(DbgIt->removeFromParent());
33590 DbgIt = Next;
33591 }
33592
33593 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
33594 SinkMBB->splice(SinkMBB->end(), ThisMBB,
33595 std::next(MachineBasicBlock::iterator(LastCMOV)),
33596 ThisMBB->end());
33597 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
33598
33599 // Fallthrough block for ThisMBB.
33600 ThisMBB->addSuccessor(FalseMBB);
33601 // The true block target of the first (or only) branch is always a SinkMBB.
33602 ThisMBB->addSuccessor(SinkMBB);
33603 // Fallthrough block for FalseMBB.
33604 FalseMBB->addSuccessor(SinkMBB);
33605
33606 // Create the conditional branch instruction.
33607 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
33608
33609 // SinkMBB:
33610 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
33611 // ...
33612 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
33613 MachineBasicBlock::iterator MIItEnd =
33614 std::next(MachineBasicBlock::iterator(LastCMOV));
33615 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
33616
33617 // Now remove the CMOV(s).
33618 ThisMBB->erase(MIItBegin, MIItEnd);
33619
33620 return SinkMBB;
33621}
33622
33623static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
33624 if (IsLP64) {
33625 if (isInt<8>(Imm))
33626 return X86::SUB64ri8;
33627 return X86::SUB64ri32;
33628 } else {
33629 if (isInt<8>(Imm))
33630 return X86::SUB32ri8;
33631 return X86::SUB32ri;
33632 }
33633}
33634
33635MachineBasicBlock *
33636X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
33637 MachineBasicBlock *MBB) const {
33638 MachineFunction *MF = MBB->getParent();
33639 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33640 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
33641 const DebugLoc &DL = MI.getDebugLoc();
33642 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33643
33644 const unsigned ProbeSize = getStackProbeSize(*MF);
33645
33646 MachineRegisterInfo &MRI = MF->getRegInfo();
33647 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33648 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33649 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33650
33651 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33652 MF->insert(MBBIter, testMBB);
33653 MF->insert(MBBIter, blockMBB);
33654 MF->insert(MBBIter, tailMBB);
33655
33656 Register sizeVReg = MI.getOperand(1).getReg();
33657
33658 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
33659
33660 Register TmpStackPtr = MRI.createVirtualRegister(
33661 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33662 Register FinalStackPtr = MRI.createVirtualRegister(
33663 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33664
33665 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
33666 .addReg(physSPReg);
33667 {
33668 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
33669 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
33670 .addReg(TmpStackPtr)
33671 .addReg(sizeVReg);
33672 }
33673
33674 // test rsp size
33675
33676 BuildMI(testMBB, DL,
33677 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
33678 .addReg(FinalStackPtr)
33679 .addReg(physSPReg);
33680
33681 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
33682 .addMBB(tailMBB)
33683 .addImm(X86::COND_GE);
33684 testMBB->addSuccessor(blockMBB);
33685 testMBB->addSuccessor(tailMBB);
33686
 33687 // Touch the block, then extend it. This is the opposite order from static
 33688 // probing, where we allocate and then touch, and it avoids having to probe
 33689 // the tail of the static alloca. Possible scenarios are:
33690 //
33691 // + ---- <- ------------ <- ------------- <- ------------ +
33692 // | |
33693 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
33694 // | |
33695 // + <- ----------- <- ------------ <- ----------- <- ------------ +
33696 //
33697 // The property we want to enforce is to never have more than [page alloc] between two probes.
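// A rough sketch of the control flow built below (assuming a 64-bit target;
// ProbeSize is typically the 4096-byte page size, but it is configurable):
//   entry:  final_sp = rsp - size
//   test:   if (final_sp >= rsp) goto tail  ; rsp has reached final_sp
//   block:  *rsp ^= 0                       ; touch the current page
//           rsp -= ProbeSize                ; extend by at most one page
//           goto test
//   tail:   result = final_sp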
33698
33699 const unsigned XORMIOpc =
33700 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
33701 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
33702 .addImm(0);
33703
33704 BuildMI(blockMBB, DL,
33705 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
33706 .addReg(physSPReg)
33707 .addImm(ProbeSize);
33708
33709
33710 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
33711 blockMBB->addSuccessor(testMBB);
33712
33713 // Replace original instruction by the expected stack ptr
33714 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
33715 .addReg(FinalStackPtr);
33716
33717 tailMBB->splice(tailMBB->end(), MBB,
33718 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33719 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
33720 MBB->addSuccessor(testMBB);
33721
33722 // Delete the original pseudo instruction.
33723 MI.eraseFromParent();
33724
33725 // And we're done.
33726 return tailMBB;
33727}
33728
33729MachineBasicBlock *
33730X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
33731 MachineBasicBlock *BB) const {
33732 MachineFunction *MF = BB->getParent();
33733 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33734 const DebugLoc &DL = MI.getDebugLoc();
33735 const BasicBlock *LLVM_BB = BB->getBasicBlock();
33736
 33737 assert(MF->shouldSplitStack());
33738
33739 const bool Is64Bit = Subtarget.is64Bit();
33740 const bool IsLP64 = Subtarget.isTarget64BitLP64();
33741
33742 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
33743 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
33744
33745 // BB:
33746 // ... [Till the alloca]
33747 // If stacklet is not large enough, jump to mallocMBB
33748 //
33749 // bumpMBB:
33750 // Allocate by subtracting from RSP
33751 // Jump to continueMBB
33752 //
33753 // mallocMBB:
33754 // Allocate by call to runtime
33755 //
33756 // continueMBB:
33757 // ...
33758 // [rest of original BB]
33759 //
33760
33761 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33762 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33763 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33764
33765 MachineRegisterInfo &MRI = MF->getRegInfo();
33766 const TargetRegisterClass *AddrRegClass =
33767 getRegClassFor(getPointerTy(MF->getDataLayout()));
33768
33769 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33770 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33771 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
33772 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
33773 sizeVReg = MI.getOperand(1).getReg(),
33774 physSPReg =
33775 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
33776
33777 MachineFunction::iterator MBBIter = ++BB->getIterator();
33778
33779 MF->insert(MBBIter, bumpMBB);
33780 MF->insert(MBBIter, mallocMBB);
33781 MF->insert(MBBIter, continueMBB);
33782
33783 continueMBB->splice(continueMBB->begin(), BB,
33784 std::next(MachineBasicBlock::iterator(MI)), BB->end());
33785 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
33786
33787 // Add code to the main basic block to check if the stack limit has been hit,
33788 // and if so, jump to mallocMBB otherwise to bumpMBB.
33789 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
33790 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
33791 .addReg(tmpSPVReg).addReg(sizeVReg);
33792 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
33793 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
33794 .addReg(SPLimitVReg);
33795 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
33796
33797 // bumpMBB simply decreases the stack pointer, since we know the current
33798 // stacklet has enough space.
33799 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
33800 .addReg(SPLimitVReg);
33801 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
33802 .addReg(SPLimitVReg);
33803 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33804
33805 // Calls into a routine in libgcc to allocate more space from the heap.
33806 const uint32_t *RegMask =
33807 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
33808 if (IsLP64) {
33809 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
33810 .addReg(sizeVReg);
33811 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33812 .addExternalSymbol("__morestack_allocate_stack_space")
33813 .addRegMask(RegMask)
33814 .addReg(X86::RDI, RegState::Implicit)
33815 .addReg(X86::RAX, RegState::ImplicitDefine);
33816 } else if (Is64Bit) {
33817 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
33818 .addReg(sizeVReg);
33819 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33820 .addExternalSymbol("__morestack_allocate_stack_space")
33821 .addRegMask(RegMask)
33822 .addReg(X86::EDI, RegState::Implicit)
33823 .addReg(X86::EAX, RegState::ImplicitDefine);
33824 } else {
33825 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
33826 .addImm(12);
33827 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
33828 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
33829 .addExternalSymbol("__morestack_allocate_stack_space")
33830 .addRegMask(RegMask)
33831 .addReg(X86::EAX, RegState::ImplicitDefine);
33832 }
33833
33834 if (!Is64Bit)
33835 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
33836 .addImm(16);
33837
33838 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
33839 .addReg(IsLP64 ? X86::RAX : X86::EAX);
33840 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33841
33842 // Set up the CFG correctly.
33843 BB->addSuccessor(bumpMBB);
33844 BB->addSuccessor(mallocMBB);
33845 mallocMBB->addSuccessor(continueMBB);
33846 bumpMBB->addSuccessor(continueMBB);
33847
33848 // Take care of the PHI nodes.
33849 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33850 MI.getOperand(0).getReg())
33851 .addReg(mallocPtrVReg)
33852 .addMBB(mallocMBB)
33853 .addReg(bumpSPPtrVReg)
33854 .addMBB(bumpMBB);
33855
33856 // Delete the original pseudo instruction.
33857 MI.eraseFromParent();
33858
33859 // And we're done.
33860 return continueMBB;
33861}
33862
33863MachineBasicBlock *
33864X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33865 MachineBasicBlock *BB) const {
33866 MachineFunction *MF = BB->getParent();
33867 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33868 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33869 const DebugLoc &DL = MI.getDebugLoc();
33870
 33871 assert(!isAsynchronousEHPersonality(
 33872 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
 33873 "SEH does not use catchret!");
33874
33875 // Only 32-bit EH needs to worry about manually restoring stack pointers.
33876 if (!Subtarget.is32Bit())
33877 return BB;
33878
33879 // C++ EH creates a new target block to hold the restore code, and wires up
33880 // the new block to the return destination with a normal JMP_4.
33881 MachineBasicBlock *RestoreMBB =
33882 MF->CreateMachineBasicBlock(BB->getBasicBlock());
 33883 assert(BB->succ_size() == 1);
33884 MF->insert(std::next(BB->getIterator()), RestoreMBB);
33885 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33886 BB->addSuccessor(RestoreMBB);
33887 MI.getOperand(0).setMBB(RestoreMBB);
33888
33889 // Marking this as an EH pad but not a funclet entry block causes PEI to
33890 // restore stack pointers in the block.
33891 RestoreMBB->setIsEHPad(true);
33892
33893 auto RestoreMBBI = RestoreMBB->begin();
33894 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33895 return BB;
33896}
33897
33898MachineBasicBlock *
33899X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33900 MachineBasicBlock *BB) const {
33901 // So, here we replace TLSADDR with the sequence:
33902 // adjust_stackdown -> TLSADDR -> adjust_stackup.
33903 // We need this because TLSADDR is lowered into calls
33904 // inside MC, therefore without the two markers shrink-wrapping
 33905 // may push the prologue/epilogue past them.
33906 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33907 const DebugLoc &DL = MI.getDebugLoc();
33908 MachineFunction &MF = *BB->getParent();
33909
33910 // Emit CALLSEQ_START right before the instruction.
33911 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33912 MachineInstrBuilder CallseqStart =
33913 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33914 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33915
33916 // Emit CALLSEQ_END right after the instruction.
33917 // We don't call erase from parent because we want to keep the
33918 // original instruction around.
33919 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33920 MachineInstrBuilder CallseqEnd =
33921 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33922 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33923
33924 return BB;
33925}
33926
33927MachineBasicBlock *
33928X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33929 MachineBasicBlock *BB) const {
33930 // This is pretty easy. We're taking the value that we received from
33931 // our load from the relocation, sticking it in either RDI (x86-64)
33932 // or EAX and doing an indirect call. The return value will then
33933 // be in the normal return register.
33934 MachineFunction *F = BB->getParent();
33935 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33936 const DebugLoc &DL = MI.getDebugLoc();
33937
 33938 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
 33939 assert(MI.getOperand(3).isGlobal() && "This should be a global");
33940
33941 // Get a register mask for the lowered call.
33942 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33943 // proper register mask.
33944 const uint32_t *RegMask =
33945 Subtarget.is64Bit() ?
33946 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33947 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33948 if (Subtarget.is64Bit()) {
33949 MachineInstrBuilder MIB =
33950 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33951 .addReg(X86::RIP)
33952 .addImm(0)
33953 .addReg(0)
33954 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33955 MI.getOperand(3).getTargetFlags())
33956 .addReg(0);
33957 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33958 addDirectMem(MIB, X86::RDI);
33959 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33960 } else if (!isPositionIndependent()) {
33961 MachineInstrBuilder MIB =
33962 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33963 .addReg(0)
33964 .addImm(0)
33965 .addReg(0)
33966 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33967 MI.getOperand(3).getTargetFlags())
33968 .addReg(0);
33969 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33970 addDirectMem(MIB, X86::EAX);
33971 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33972 } else {
33973 MachineInstrBuilder MIB =
33974 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33975 .addReg(TII->getGlobalBaseReg(F))
33976 .addImm(0)
33977 .addReg(0)
33978 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33979 MI.getOperand(3).getTargetFlags())
33980 .addReg(0);
33981 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33982 addDirectMem(MIB, X86::EAX);
33983 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33984 }
33985
33986 MI.eraseFromParent(); // The pseudo instruction is gone now.
33987 return BB;
33988}
33989
33990static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33991 switch (RPOpc) {
33992 case X86::INDIRECT_THUNK_CALL32:
33993 return X86::CALLpcrel32;
33994 case X86::INDIRECT_THUNK_CALL64:
33995 return X86::CALL64pcrel32;
33996 case X86::INDIRECT_THUNK_TCRETURN32:
33997 return X86::TCRETURNdi;
33998 case X86::INDIRECT_THUNK_TCRETURN64:
33999 return X86::TCRETURNdi64;
34000 }
34001 llvm_unreachable("not indirect thunk opcode")__builtin_unreachable();
34002}
34003
34004static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34005 unsigned Reg) {
34006 if (Subtarget.useRetpolineExternalThunk()) {
34007 // When using an external thunk for retpolines, we pick names that match the
34008 // names GCC happens to use as well. This helps simplify the implementation
34009 // of the thunks for kernels where they have no easy ability to create
34010 // aliases and are doing non-trivial configuration of the thunk's body. For
34011 // example, the Linux kernel will do boot-time hot patching of the thunk
34012 // bodies and cannot easily export aliases of these to loaded modules.
34013 //
34014 // Note that at any point in the future, we may need to change the semantics
34015 // of how we implement retpolines and at that time will likely change the
34016 // name of the called thunk. Essentially, there is no hard guarantee that
34017 // LLVM will generate calls to specific thunks, we merely make a best-effort
34018 // attempt to help out kernels and other systems where duplicating the
34019 // thunks is costly.
34020 switch (Reg) {
34021 case X86::EAX:
34022 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34023 return "__x86_indirect_thunk_eax";
34024 case X86::ECX:
34025 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34026 return "__x86_indirect_thunk_ecx";
34027 case X86::EDX:
34028 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34029 return "__x86_indirect_thunk_edx";
34030 case X86::EDI:
34031 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34032 return "__x86_indirect_thunk_edi";
34033 case X86::R11:
34034 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34035 return "__x86_indirect_thunk_r11";
34036 }
34037 llvm_unreachable("unexpected reg for external indirect thunk")__builtin_unreachable();
34038 }
34039
34040 if (Subtarget.useRetpolineIndirectCalls() ||
34041 Subtarget.useRetpolineIndirectBranches()) {
34042 // When targeting an internal COMDAT thunk use an LLVM-specific name.
34043 switch (Reg) {
34044 case X86::EAX:
34045 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34046 return "__llvm_retpoline_eax";
34047 case X86::ECX:
34048 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34049 return "__llvm_retpoline_ecx";
34050 case X86::EDX:
34051 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34052 return "__llvm_retpoline_edx";
34053 case X86::EDI:
34054 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34055 return "__llvm_retpoline_edi";
34056 case X86::R11:
34057 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34058 return "__llvm_retpoline_r11";
34059 }
34060 llvm_unreachable("unexpected reg for retpoline")__builtin_unreachable();
34061 }
34062
34063 if (Subtarget.useLVIControlFlowIntegrity()) {
34064 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34065 return "__llvm_lvi_thunk_r11";
34066 }
34067 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")__builtin_unreachable();
34068}
34069
34070MachineBasicBlock *
34071X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
34072 MachineBasicBlock *BB) const {
34073 // Copy the virtual register into the R11 physical register and
34074 // call the retpoline thunk.
34075 const DebugLoc &DL = MI.getDebugLoc();
34076 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34077 Register CalleeVReg = MI.getOperand(0).getReg();
34078 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
34079
34080 // Find an available scratch register to hold the callee. On 64-bit, we can
34081 // just use R11, but we scan for uses anyway to ensure we don't generate
34082 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
34083 // already a register use operand to the call to hold the callee. If none
34084 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
34085 // register and ESI is the base pointer to realigned stack frames with VLAs.
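 // For illustration (derived from the selection logic below): a 32-bit thunk
 // call whose operands already use EAX and ECX picks EDX; if EAX, ECX and EDX
 // are all used, EDI is picked, and if even EDI is taken we report a fatal
 // error.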
34086 SmallVector<unsigned, 3> AvailableRegs;
34087 if (Subtarget.is64Bit())
34088 AvailableRegs.push_back(X86::R11);
34089 else
34090 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
34091
34092 // Zero out any registers that are already used.
34093 for (const auto &MO : MI.operands()) {
34094 if (MO.isReg() && MO.isUse())
34095 for (unsigned &Reg : AvailableRegs)
34096 if (Reg == MO.getReg())
34097 Reg = 0;
34098 }
34099
34100 // Choose the first remaining non-zero available register.
34101 unsigned AvailableReg = 0;
34102 for (unsigned MaybeReg : AvailableRegs) {
34103 if (MaybeReg) {
34104 AvailableReg = MaybeReg;
34105 break;
34106 }
34107 }
34108 if (!AvailableReg)
34109 report_fatal_error("calling convention incompatible with retpoline, no "
34110 "available registers");
34111
34112 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
34113
34114 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
34115 .addReg(CalleeVReg);
34116 MI.getOperand(0).ChangeToES(Symbol);
34117 MI.setDesc(TII->get(Opc));
34118 MachineInstrBuilder(*BB->getParent(), &MI)
34119 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
34120 return BB;
34121}
34122
34123/// SetJmp implies future control flow change upon calling the corresponding
34124/// LongJmp.
34125/// Instead of using the 'return' instruction, the long jump fixes the stack and
34126/// performs an indirect branch. To do so it uses the registers that were stored
34127/// in the jump buffer (when calling SetJmp).
34128/// In case the shadow stack is enabled we need to fix it as well, because some
34129/// return addresses will be skipped.
34130/// The function will save the SSP for future fixing in the function
34131/// emitLongJmpShadowStackFix.
34132/// \sa emitLongJmpShadowStackFix
34133/// \param [in] MI The temporary Machine Instruction for the builtin.
34134/// \param [in] MBB The Machine Basic Block that will be modified.
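/// Note (implied by the offsets used below and in emitEHSjLjLongJmp): the jump
/// buffer is addressed in pointer-sized slots, roughly [0] = FP, [1] = restore
/// label (IP), [2] = SP, [3] = saved SSP.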
34135void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
34136 MachineBasicBlock *MBB) const {
34137 const DebugLoc &DL = MI.getDebugLoc();
34138 MachineFunction *MF = MBB->getParent();
34139 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34140 MachineRegisterInfo &MRI = MF->getRegInfo();
34141 MachineInstrBuilder MIB;
34142
34143 // Memory Reference.
34144 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34145 MI.memoperands_end());
34146
34147 // Initialize a register with zero.
34148 MVT PVT = getPointerTy(MF->getDataLayout());
34149 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34150 Register ZReg = MRI.createVirtualRegister(PtrRC);
34151 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
34152 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
34153 .addDef(ZReg)
34154 .addReg(ZReg, RegState::Undef)
34155 .addReg(ZReg, RegState::Undef);
34156
34157 // Read the current SSP Register value to the zeroed register.
34158 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
34159 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
34160 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
34161
34162 // Write the SSP register value to offset 3 in input memory buffer.
34163 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34164 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
34165 const int64_t SSPOffset = 3 * PVT.getStoreSize();
34166 const unsigned MemOpndSlot = 1;
34167 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34168 if (i == X86::AddrDisp)
34169 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
34170 else
34171 MIB.add(MI.getOperand(MemOpndSlot + i));
34172 }
34173 MIB.addReg(SSPCopyReg);
34174 MIB.setMemRefs(MMOs);
34175}
34176
34177MachineBasicBlock *
34178X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
34179 MachineBasicBlock *MBB) const {
34180 const DebugLoc &DL = MI.getDebugLoc();
34181 MachineFunction *MF = MBB->getParent();
34182 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34183 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34184 MachineRegisterInfo &MRI = MF->getRegInfo();
34185
34186 const BasicBlock *BB = MBB->getBasicBlock();
34187 MachineFunction::iterator I = ++MBB->getIterator();
34188
34189 // Memory Reference
34190 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34191 MI.memoperands_end());
34192
34193 unsigned DstReg;
34194 unsigned MemOpndSlot = 0;
34195
34196 unsigned CurOp = 0;
34197
34198 DstReg = MI.getOperand(CurOp++).getReg();
34199 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34200 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
34201 (void)TRI;
34202 Register mainDstReg = MRI.createVirtualRegister(RC);
34203 Register restoreDstReg = MRI.createVirtualRegister(RC);
34204
34205 MemOpndSlot = CurOp;
34206
34207 MVT PVT = getPointerTy(MF->getDataLayout());
34208 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
34209 "Invalid Pointer Size!");
34210
34211 // For v = setjmp(buf), we generate
34212 //
34213 // thisMBB:
34214 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
34215 // SjLjSetup restoreMBB
34216 //
34217 // mainMBB:
34218 // v_main = 0
34219 //
34220 // sinkMBB:
34221 // v = phi(main, restore)
34222 //
34223 // restoreMBB:
34224 // if base pointer being used, load it from frame
34225 // v_restore = 1
34226
34227 MachineBasicBlock *thisMBB = MBB;
34228 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34229 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34230 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
34231 MF->insert(I, mainMBB);
34232 MF->insert(I, sinkMBB);
34233 MF->push_back(restoreMBB);
34234 restoreMBB->setHasAddressTaken();
34235
34236 MachineInstrBuilder MIB;
34237
34238 // Transfer the remainder of BB and its successor edges to sinkMBB.
34239 sinkMBB->splice(sinkMBB->begin(), MBB,
34240 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34241 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34242
34243 // thisMBB:
34244 unsigned PtrStoreOpc = 0;
34245 unsigned LabelReg = 0;
34246 const int64_t LabelOffset = 1 * PVT.getStoreSize();
34247 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
34248 !isPositionIndependent();
34249
34250 // Prepare IP either in reg or imm.
34251 if (!UseImmLabel) {
34252 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34253 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34254 LabelReg = MRI.createVirtualRegister(PtrRC);
34255 if (Subtarget.is64Bit()) {
34256 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
34257 .addReg(X86::RIP)
34258 .addImm(0)
34259 .addReg(0)
34260 .addMBB(restoreMBB)
34261 .addReg(0);
34262 } else {
34263 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
34264 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
34265 .addReg(XII->getGlobalBaseReg(MF))
34266 .addImm(0)
34267 .addReg(0)
34268 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
34269 .addReg(0);
34270 }
34271 } else
34272 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
34273 // Store IP
34274 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
34275 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34276 if (i == X86::AddrDisp)
34277 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
34278 else
34279 MIB.add(MI.getOperand(MemOpndSlot + i));
34280 }
34281 if (!UseImmLabel)
34282 MIB.addReg(LabelReg);
34283 else
34284 MIB.addMBB(restoreMBB);
34285 MIB.setMemRefs(MMOs);
34286
34287 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
34288 emitSetJmpShadowStackFix(MI, thisMBB);
34289 }
34290
34291 // Setup
34292 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
34293 .addMBB(restoreMBB);
34294
34295 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
34296 MIB.addRegMask(RegInfo->getNoPreservedMask());
34297 thisMBB->addSuccessor(mainMBB);
34298 thisMBB->addSuccessor(restoreMBB);
34299
34300 // mainMBB:
34301 // EAX = 0
34302 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
34303 mainMBB->addSuccessor(sinkMBB);
34304
34305 // sinkMBB:
34306 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
34307 TII->get(X86::PHI), DstReg)
34308 .addReg(mainDstReg).addMBB(mainMBB)
34309 .addReg(restoreDstReg).addMBB(restoreMBB);
34310
34311 // restoreMBB:
34312 if (RegInfo->hasBasePointer(*MF)) {
34313 const bool Uses64BitFramePtr =
34314 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34315 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
34316 X86FI->setRestoreBasePointer(MF);
34317 Register FramePtr = RegInfo->getFrameRegister(*MF);
34318 Register BasePtr = RegInfo->getBaseRegister();
34319 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
34320 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
34321 FramePtr, true, X86FI->getRestoreBasePointerOffset())
34322 .setMIFlag(MachineInstr::FrameSetup);
34323 }
34324 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
34325 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34326 restoreMBB->addSuccessor(sinkMBB);
34327
34328 MI.eraseFromParent();
34329 return sinkMBB;
34330}
34331
34332/// Fix the shadow stack using the previously saved SSP pointer.
34333/// \sa emitSetJmpShadowStackFix
34334/// \param [in] MI The temporary Machine Instruction for the builtin.
34335/// \param [in] MBB The Machine Basic Block that will be modified.
34336/// \return The sink MBB that will perform the future indirect branch.
34337MachineBasicBlock *
34338X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
34339 MachineBasicBlock *MBB) const {
34340 const DebugLoc &DL = MI.getDebugLoc();
34341 MachineFunction *MF = MBB->getParent();
34342 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34343 MachineRegisterInfo &MRI = MF->getRegInfo();
34344
34345 // Memory Reference
34346 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34347 MI.memoperands_end());
34348
34349 MVT PVT = getPointerTy(MF->getDataLayout());
34350 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34351
34352 // checkSspMBB:
34353 // xor vreg1, vreg1
34354 // rdssp vreg1
34355 // test vreg1, vreg1
34356 // je sinkMBB # Jump if Shadow Stack is not supported
34357 // fallMBB:
34358 // mov buf+24/12(%rip), vreg2
34359 // sub vreg1, vreg2
34360 // jbe sinkMBB # No need to fix the Shadow Stack
34361 // fixShadowMBB:
34362 // shr 3/2, vreg2
34363 // incssp vreg2 # fix the SSP according to the lower 8 bits
34364 // shr 8, vreg2
34365 // je sinkMBB
34366 // fixShadowLoopPrepareMBB:
34367 // shl vreg2
34368 // mov 128, vreg3
34369 // fixShadowLoopMBB:
34370 // incssp vreg3
34371 // dec vreg2
34372 // jne fixShadowLoopMBB # Iterate until you finish fixing
34373 // # the Shadow Stack
34374 // sinkMBB:
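 //
 // Worked example (64-bit, just tracing the steps above): a delta of 0x1828
 // bytes is 0x305 shadow-stack entries after the `shr 3`. incssp consumes the
 // low 8 bits (5 entries), `shr 8` leaves 3, the `shl` doubles it to 6, and the
 // loop then runs 6 iterations of incssp 128 to cover the remaining 768
 // entries (5 + 768 = 773 = 0x305).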
34375
34376 MachineFunction::iterator I = ++MBB->getIterator();
34377 const BasicBlock *BB = MBB->getBasicBlock();
34378
34379 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
34380 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34381 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
34382 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
34383 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
34384 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34385 MF->insert(I, checkSspMBB);
34386 MF->insert(I, fallMBB);
34387 MF->insert(I, fixShadowMBB);
34388 MF->insert(I, fixShadowLoopPrepareMBB);
34389 MF->insert(I, fixShadowLoopMBB);
34390 MF->insert(I, sinkMBB);
34391
34392 // Transfer the remainder of BB and its successor edges to sinkMBB.
34393 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
34394 MBB->end());
34395 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34396
34397 MBB->addSuccessor(checkSspMBB);
34398
34399 // Initialize a register with zero.
34400 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
34401 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
34402
34403 if (PVT == MVT::i64) {
34404 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
34405 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
34406 .addImm(0)
34407 .addReg(ZReg)
34408 .addImm(X86::sub_32bit);
34409 ZReg = TmpZReg;
34410 }
34411
34412 // Read the current SSP Register value to the zeroed register.
34413 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
34414 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
34415 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
34416
34417 // Check whether the result of the SSP register is zero and jump directly
34418 // to the sink.
34419 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
34420 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
34421 .addReg(SSPCopyReg)
34422 .addReg(SSPCopyReg);
34423 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
34424 checkSspMBB->addSuccessor(sinkMBB);
34425 checkSspMBB->addSuccessor(fallMBB);
34426
34427 // Reload the previously saved SSP register value.
34428 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
34429 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
34430 const int64_t SPPOffset = 3 * PVT.getStoreSize();
34431 MachineInstrBuilder MIB =
34432 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
34433 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34434 const MachineOperand &MO = MI.getOperand(i);
34435 if (i == X86::AddrDisp)
34436 MIB.addDisp(MO, SPPOffset);
34437 else if (MO.isReg()) // Don't add the whole operand, we don't want to
34438 // preserve kill flags.
34439 MIB.addReg(MO.getReg());
34440 else
34441 MIB.add(MO);
34442 }
34443 MIB.setMemRefs(MMOs);
34444
34445 // Subtract the current SSP from the previous SSP.
34446 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
34447 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
34448 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
34449 .addReg(PrevSSPReg)
34450 .addReg(SSPCopyReg);
34451
34452 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
34453 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
34454 fallMBB->addSuccessor(sinkMBB);
34455 fallMBB->addSuccessor(fixShadowMBB);
34456
34457 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
34458 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
34459 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
34460 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
34461 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
34462 .addReg(SspSubReg)
34463 .addImm(Offset);
34464
34465 // Increase the SSP, looking only at the lower 8 bits of the delta.
34466 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
34467 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
34468
34469 // Reset the lower 8 bits.
34470 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
34471 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
34472 .addReg(SspFirstShrReg)
34473 .addImm(8);
34474
34475 // Jump if the result of the shift is zero.
34476 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
34477 fixShadowMBB->addSuccessor(sinkMBB);
34478 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
34479
34480 // Do a single shift left.
34481 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
34482 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
34483 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
34484 .addReg(SspSecondShrReg);
34485
34486 // Save the value 128 to a register (will be used next with incssp).
34487 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
34488 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
34489 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
34490 .addImm(128);
34491 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
34492
34493 // Since incssp only looks at the lower 8 bits, we might need to do several
34494 // iterations of incssp until we finish fixing the shadow stack.
34495 Register DecReg = MRI.createVirtualRegister(PtrRC);
34496 Register CounterReg = MRI.createVirtualRegister(PtrRC);
34497 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
34498 .addReg(SspAfterShlReg)
34499 .addMBB(fixShadowLoopPrepareMBB)
34500 .addReg(DecReg)
34501 .addMBB(fixShadowLoopMBB);
34502
34503 // Every iteration we increase the SSP by 128.
34504 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
34505
34506 // Every iteration we decrement the counter by 1.
34507 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
34508 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
34509
34510 // Jump if the counter is not zero yet.
34511 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
34512 fixShadowLoopMBB->addSuccessor(sinkMBB);
34513 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
34514
34515 return sinkMBB;
34516}
34517
34518MachineBasicBlock *
34519X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
34520 MachineBasicBlock *MBB) const {
34521 const DebugLoc &DL = MI.getDebugLoc();
34522 MachineFunction *MF = MBB->getParent();
34523 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34524 MachineRegisterInfo &MRI = MF->getRegInfo();
34525
34526 // Memory Reference
34527 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34528 MI.memoperands_end());
34529
34530 MVT PVT = getPointerTy(MF->getDataLayout());
34531 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
34532 "Invalid Pointer Size!");
34533
34534 const TargetRegisterClass *RC =
34535 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
34536 Register Tmp = MRI.createVirtualRegister(RC);
34537 // Since FP is only updated here but NOT referenced, it's treated as GPR.
34538 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
34539 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
34540 Register SP = RegInfo->getStackRegister();
34541
34542 MachineInstrBuilder MIB;
34543
34544 const int64_t LabelOffset = 1 * PVT.getStoreSize();
34545 const int64_t SPOffset = 2 * PVT.getStoreSize();
34546
34547 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
34548 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
34549
34550 MachineBasicBlock *thisMBB = MBB;
34551
34552 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
34553 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
34554 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
34555 }
34556
34557 // Reload FP
34558 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
34559 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34560 const MachineOperand &MO = MI.getOperand(i);
34561 if (MO.isReg()) // Don't add the whole operand, we don't want to
34562 // preserve kill flags.
34563 MIB.addReg(MO.getReg());
34564 else
34565 MIB.add(MO);
34566 }
34567 MIB.setMemRefs(MMOs);
34568
34569 // Reload IP
34570 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
34571 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34572 const MachineOperand &MO = MI.getOperand(i);
34573 if (i == X86::AddrDisp)
34574 MIB.addDisp(MO, LabelOffset);
34575 else if (MO.isReg()) // Don't add the whole operand, we don't want to
34576 // preserve kill flags.
34577 MIB.addReg(MO.getReg());
34578 else
34579 MIB.add(MO);
34580 }
34581 MIB.setMemRefs(MMOs);
34582
34583 // Reload SP
34584 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
34585 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34586 if (i == X86::AddrDisp)
34587 MIB.addDisp(MI.getOperand(i), SPOffset);
34588 else
34589 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
34590 // the last instruction of the expansion.
34591 }
34592 MIB.setMemRefs(MMOs);
34593
34594 // Jump
34595 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
34596
34597 MI.eraseFromParent();
34598 return thisMBB;
34599}
34600
34601void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
34602 MachineBasicBlock *MBB,
34603 MachineBasicBlock *DispatchBB,
34604 int FI) const {
34605 const DebugLoc &DL = MI.getDebugLoc();
34606 MachineFunction *MF = MBB->getParent();
34607 MachineRegisterInfo *MRI = &MF->getRegInfo();
34608 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34609
34610 MVT PVT = getPointerTy(MF->getDataLayout());
34611 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
34612
34613 unsigned Op = 0;
34614 unsigned VR = 0;
34615
34616 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
34617 !isPositionIndependent();
34618
34619 if (UseImmLabel) {
34620 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
34621 } else {
34622 const TargetRegisterClass *TRC =
34623 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
34624 VR = MRI->createVirtualRegister(TRC);
34625 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34626
34627 if (Subtarget.is64Bit())
34628 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
34629 .addReg(X86::RIP)
34630 .addImm(1)
34631 .addReg(0)
34632 .addMBB(DispatchBB)
34633 .addReg(0);
34634 else
34635 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
34636 .addReg(0) /* TII->getGlobalBaseReg(MF) */
34637 .addImm(1)
34638 .addReg(0)
34639 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
34640 .addReg(0);
34641 }
34642
34643 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
34644 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
34645 if (UseImmLabel)
34646 MIB.addMBB(DispatchBB);
34647 else
34648 MIB.addReg(VR);
34649}
34650
34651MachineBasicBlock *
34652X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
34653 MachineBasicBlock *BB) const {
34654 const DebugLoc &DL = MI.getDebugLoc();
34655 MachineFunction *MF = BB->getParent();
34656 MachineRegisterInfo *MRI = &MF->getRegInfo();
34657 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34658 int FI = MF->getFrameInfo().getFunctionContextIndex();
34659
34660 // Get a mapping of the call site numbers to all of the landing pads they're
34661 // associated with.
34662 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
34663 unsigned MaxCSNum = 0;
34664 for (auto &MBB : *MF) {
34665 if (!MBB.isEHPad())
34666 continue;
34667
34668 MCSymbol *Sym = nullptr;
34669 for (const auto &MI : MBB) {
34670 if (MI.isDebugInstr())
34671 continue;
34672
34673 assert(MI.isEHLabel() && "expected EH_LABEL");
34674 Sym = MI.getOperand(0).getMCSymbol();
34675 break;
34676 }
34677
34678 if (!MF->hasCallSiteLandingPad(Sym))
34679 continue;
34680
34681 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
34682 CallSiteNumToLPad[CSI].push_back(&MBB);
34683 MaxCSNum = std::max(MaxCSNum, CSI);
34684 }
34685 }
34686
34687 // Get an ordered list of the machine basic blocks for the jump table.
34688 std::vector<MachineBasicBlock *> LPadList;
34689 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
34690 LPadList.reserve(CallSiteNumToLPad.size());
34691
34692 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
34693 for (auto &LP : CallSiteNumToLPad[CSI]) {
34694 LPadList.push_back(LP);
34695 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
34696 }
34697 }
34698
34699 assert(!LPadList.empty() &&
34700 "No landing pad destinations for the dispatch jump table!");
34701
34702 // Create the MBBs for the dispatch code.
34703
34704 // Shove the dispatch's address into the return slot in the function context.
34705 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
34706 DispatchBB->setIsEHPad(true);
34707
34708 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
34709 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
34710 DispatchBB->addSuccessor(TrapBB);
34711
34712 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
34713 DispatchBB->addSuccessor(DispContBB);
34714
34715 // Insert MBBs.
34716 MF->push_back(DispatchBB);
34717 MF->push_back(DispContBB);
34718 MF->push_back(TrapBB);
34719
34720 // Insert code into the entry block that creates and registers the function
34721 // context.
34722 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
34723
34724 // Create the jump table and associated information
34725 unsigned JTE = getJumpTableEncoding();
34726 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
34727 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
34728
34729 const X86RegisterInfo &RI = TII->getRegisterInfo();
34730 // Add a register mask with no preserved registers. This results in all
34731 // registers being marked as clobbered.
34732 if (RI.hasBasePointer(*MF)) {
34733 const bool FPIs64Bit =
34734 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34735 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
34736 MFI->setRestoreBasePointer(MF);
34737
34738 Register FP = RI.getFrameRegister(*MF);
34739 Register BP = RI.getBaseRegister();
34740 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
34741 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
34742 MFI->getRestoreBasePointerOffset())
34743 .addRegMask(RI.getNoPreservedMask());
34744 } else {
34745 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
34746 .addRegMask(RI.getNoPreservedMask());
34747 }
34748
34749 // IReg is used as an index in a memory operand and therefore can't be SP
34750 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
34751 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
34752 Subtarget.is64Bit() ? 8 : 4);
34753 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
34754 .addReg(IReg)
34755 .addImm(LPadList.size());
34756 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
34757
34758 if (Subtarget.is64Bit()) {
34759 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34760 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
34761
34762 // leaq .LJTI0_0(%rip), BReg
34763 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
34764 .addReg(X86::RIP)
34765 .addImm(1)
34766 .addReg(0)
34767 .addJumpTableIndex(MJTI)
34768 .addReg(0);
34769 // movzx IReg64, IReg
34770 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
34771 .addImm(0)
34772 .addReg(IReg)
34773 .addImm(X86::sub_32bit);
34774
34775 switch (JTE) {
34776 case MachineJumpTableInfo::EK_BlockAddress:
34777 // jmpq *(BReg,IReg64,8)
34778 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
34779 .addReg(BReg)
34780 .addImm(8)
34781 .addReg(IReg64)
34782 .addImm(0)
34783 .addReg(0);
34784 break;
34785 case MachineJumpTableInfo::EK_LabelDifference32: {
34786 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
34787 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
34788 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34789
34790 // movl (BReg,IReg64,4), OReg
34791 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
34792 .addReg(BReg)
34793 .addImm(4)
34794 .addReg(IReg64)
34795 .addImm(0)
34796 .addReg(0);
34797 // movsx OReg64, OReg
34798 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
34799 // addq BReg, OReg64, TReg
34800 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
34801 .addReg(OReg64)
34802 .addReg(BReg);
34803 // jmpq *TReg
34804 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
34805 break;
34806 }
34807 default:
34808 llvm_unreachable("Unexpected jump table encoding")__builtin_unreachable();
34809 }
34810 } else {
34811 // jmpl *.LJTI0_0(,IReg,4)
34812 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
34813 .addReg(0)
34814 .addImm(4)
34815 .addReg(IReg)
34816 .addJumpTableIndex(MJTI)
34817 .addReg(0);
34818 }
34819
34820 // Add the jump table entries as successors to the MBB.
34821 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
34822 for (auto &LP : LPadList)
34823 if (SeenMBBs.insert(LP).second)
34824 DispContBB->addSuccessor(LP);
34825
34826 // N.B. the order the invoke BBs are processed in doesn't matter here.
34827 SmallVector<MachineBasicBlock *, 64> MBBLPads;
34828 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
34829 for (MachineBasicBlock *MBB : InvokeBBs) {
34830 // Remove the landing pad successor from the invoke block and replace it
34831 // with the new dispatch block.
34832 // Keep a copy of Successors since it's modified inside the loop.
34833 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
34834 MBB->succ_rend());
34835 // FIXME: Avoid quadratic complexity.
34836 for (auto MBBS : Successors) {
34837 if (MBBS->isEHPad()) {
34838 MBB->removeSuccessor(MBBS);
34839 MBBLPads.push_back(MBBS);
34840 }
34841 }
34842
34843 MBB->addSuccessor(DispatchBB);
34844
34845 // Find the invoke call and mark all of the callee-saved registers as
34846 // 'implicit defined' so that they're spilled. This prevents code from
34847 // moving instructions to before the EH block, where they will never be
34848 // executed.
34849 for (auto &II : reverse(*MBB)) {
34850 if (!II.isCall())
34851 continue;
34852
34853 DenseMap<unsigned, bool> DefRegs;
34854 for (auto &MOp : II.operands())
34855 if (MOp.isReg())
34856 DefRegs[MOp.getReg()] = true;
34857
34858 MachineInstrBuilder MIB(*MF, &II);
34859 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34860 unsigned Reg = SavedRegs[RegIdx];
34861 if (!DefRegs[Reg])
34862 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34863 }
34864
34865 break;
34866 }
34867 }
34868
34869 // Mark all former landing pads as non-landing pads. The dispatch is the only
34870 // landing pad now.
34871 for (auto &LP : MBBLPads)
34872 LP->setIsEHPad(false);
34873
34874 // The instruction is gone now.
34875 MI.eraseFromParent();
34876 return BB;
34877}
34878
34879MachineBasicBlock *
34880X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34881 MachineBasicBlock *BB) const {
34882 MachineFunction *MF = BB->getParent();
34883 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34884 const DebugLoc &DL = MI.getDebugLoc();
34885
34886 auto TMMImmToTMMReg = [](unsigned Imm) {
34887 assert (Imm < 8 && "Illegal tmm index");
34888 return X86::TMM0 + Imm;
34889 };
34890 switch (MI.getOpcode()) {
34891 default: llvm_unreachable("Unexpected instr type to insert")__builtin_unreachable();
34892 case X86::TLS_addr32:
34893 case X86::TLS_addr64:
34894 case X86::TLS_addrX32:
34895 case X86::TLS_base_addr32:
34896 case X86::TLS_base_addr64:
34897 case X86::TLS_base_addrX32:
34898 return EmitLoweredTLSAddr(MI, BB);
34899 case X86::INDIRECT_THUNK_CALL32:
34900 case X86::INDIRECT_THUNK_CALL64:
34901 case X86::INDIRECT_THUNK_TCRETURN32:
34902 case X86::INDIRECT_THUNK_TCRETURN64:
34903 return EmitLoweredIndirectThunk(MI, BB);
34904 case X86::CATCHRET:
34905 return EmitLoweredCatchRet(MI, BB);
34906 case X86::SEG_ALLOCA_32:
34907 case X86::SEG_ALLOCA_64:
34908 return EmitLoweredSegAlloca(MI, BB);
34909 case X86::PROBED_ALLOCA_32:
34910 case X86::PROBED_ALLOCA_64:
34911 return EmitLoweredProbedAlloca(MI, BB);
34912 case X86::TLSCall_32:
34913 case X86::TLSCall_64:
34914 return EmitLoweredTLSCall(MI, BB);
34915 case X86::CMOV_FR32:
34916 case X86::CMOV_FR32X:
34917 case X86::CMOV_FR64:
34918 case X86::CMOV_FR64X:
34919 case X86::CMOV_GR8:
34920 case X86::CMOV_GR16:
34921 case X86::CMOV_GR32:
34922 case X86::CMOV_RFP32:
34923 case X86::CMOV_RFP64:
34924 case X86::CMOV_RFP80:
34925 case X86::CMOV_VR64:
34926 case X86::CMOV_VR128:
34927 case X86::CMOV_VR128X:
34928 case X86::CMOV_VR256:
34929 case X86::CMOV_VR256X:
34930 case X86::CMOV_VR512:
34931 case X86::CMOV_VK1:
34932 case X86::CMOV_VK2:
34933 case X86::CMOV_VK4:
34934 case X86::CMOV_VK8:
34935 case X86::CMOV_VK16:
34936 case X86::CMOV_VK32:
34937 case X86::CMOV_VK64:
34938 return EmitLoweredSelect(MI, BB);
34939
34940 case X86::RDFLAGS32:
34941 case X86::RDFLAGS64: {
34942 unsigned PushF =
34943 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34944 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34945 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34946 // Permit reads of the EFLAGS and DF registers without them being defined.
34947 // This intrinsic exists to read external processor state in flags, such as
34948 // the trap flag, interrupt flag, and direction flag, none of which are
34949 // modeled by the backend.
34950 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34951 "Unexpected register in operand!");
34952 Push->getOperand(2).setIsUndef();
34953 assert(Push->getOperand(3).getReg() == X86::DF &&
34954 "Unexpected register in operand!");
34955 Push->getOperand(3).setIsUndef();
34956 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
34957
34958 MI.eraseFromParent(); // The pseudo is gone now.
34959 return BB;
34960 }
34961
34962 case X86::WRFLAGS32:
34963 case X86::WRFLAGS64: {
34964 unsigned Push =
34965 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34966 unsigned PopF =
34967 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
34968 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34969 BuildMI(*BB, MI, DL, TII->get(PopF));
34970
34971 MI.eraseFromParent(); // The pseudo is gone now.
34972 return BB;
34973 }
34974
34975 case X86::FP32_TO_INT16_IN_MEM:
34976 case X86::FP32_TO_INT32_IN_MEM:
34977 case X86::FP32_TO_INT64_IN_MEM:
34978 case X86::FP64_TO_INT16_IN_MEM:
34979 case X86::FP64_TO_INT32_IN_MEM:
34980 case X86::FP64_TO_INT64_IN_MEM:
34981 case X86::FP80_TO_INT16_IN_MEM:
34982 case X86::FP80_TO_INT32_IN_MEM:
34983 case X86::FP80_TO_INT64_IN_MEM: {
34984 // Change the floating point control register to use "round towards zero"
34985 // mode when truncating to an integer value.
34986 int OrigCWFrameIdx =
34987 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34988 addFrameReference(BuildMI(*BB, MI, DL,
34989 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34990
34991 // Load the old value of the control word...
34992 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34993 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34994 OrigCWFrameIdx);
34995
34996 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
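 // For example, assuming the default x87 control word 0x037F, this yields
 // 0x0F7F, i.e. the rounding-control field set to 11b (truncate).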
34997 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34998 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
34999 .addReg(OldCW, RegState::Kill).addImm(0xC00);
35000
35001 // Extract to 16 bits.
35002 Register NewCW16 =
35003 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35004 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
35005 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35006
35007 // Prepare memory for FLDCW.
35008 int NewCWFrameIdx =
35009 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35010 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
35011 NewCWFrameIdx)
35012 .addReg(NewCW16, RegState::Kill);
35013
35014 // Reload the modified control word now...
35015 addFrameReference(BuildMI(*BB, MI, DL,
35016 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
35017
35018 // Get the X86 opcode to use.
35019 unsigned Opc;
35020 switch (MI.getOpcode()) {
35021 default: llvm_unreachable("illegal opcode!")__builtin_unreachable();
35022 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
35023 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
35024 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
35025 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
35026 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
35027 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
35028 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
35029 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
35030 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
35031 }
35032
35033 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35034 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
35035 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
35036
35037 // Reload the original control word now.
35038 addFrameReference(BuildMI(*BB, MI, DL,
35039 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
35040
35041 MI.eraseFromParent(); // The pseudo instruction is gone now.
35042 return BB;
35043 }
35044
35045 // xbegin
35046 case X86::XBEGIN:
35047 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
35048
35049 case X86::VAARG_64:
35050 case X86::VAARG_X32:
35051 return EmitVAARGWithCustomInserter(MI, BB);
35052
35053 case X86::EH_SjLj_SetJmp32:
35054 case X86::EH_SjLj_SetJmp64:
35055 return emitEHSjLjSetJmp(MI, BB);
35056
35057 case X86::EH_SjLj_LongJmp32:
35058 case X86::EH_SjLj_LongJmp64:
35059 return emitEHSjLjLongJmp(MI, BB);
35060
35061 case X86::Int_eh_sjlj_setup_dispatch:
35062 return EmitSjLjDispatchBlock(MI, BB);
35063
35064 case TargetOpcode::STATEPOINT:
35065 // As an implementation detail, STATEPOINT shares the STACKMAP format at
35066 // this point in the process. We diverge later.
35067 return emitPatchPoint(MI, BB);
35068
35069 case TargetOpcode::STACKMAP:
35070 case TargetOpcode::PATCHPOINT:
35071 return emitPatchPoint(MI, BB);
35072
35073 case TargetOpcode::PATCHABLE_EVENT_CALL:
35074 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
35075 return BB;
35076
35077 case X86::LCMPXCHG8B: {
35078 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35079 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
35080 // requires a memory operand. If the current architecture is i686 and the
35081 // current function needs a base pointer
35082 // - which is ESI on i686 - the register allocator would not be able to
35083 // allocate registers for an address of the form X(%reg, %reg, Y):
35084 // there would never be enough unreserved registers during regalloc
35085 // (without the base pointer the only option would be X(%edi, %esi, Y)).
35086 // We give the register allocator a hand by precomputing the address in
35087 // a new vreg using LEA.
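 // In effect, `cmpxchg8b disp(%base,%index,scale)` becomes
 // `leal disp(%base,%index,scale), %vreg` followed by `cmpxchg8b (%vreg)`.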
35088
35089 // If it is not i686 or there is no base pointer - nothing to do here.
35090 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
35091 return BB;
35092
35093 // Even though this code does not necessarily need the base pointer to
35094 // be ESI, we check for that. The reason: if this assert fails, something
35095 // has changed in the compiler's base pointer handling, which most
35096 // probably has to be addressed somehow here.
35097 assert(TRI->getBaseRegister() == X86::ESI &&
35098 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
35099 "base pointer in mind");
35100
35101 MachineRegisterInfo &MRI = MF->getRegInfo();
35102 MVT SPTy = getPointerTy(MF->getDataLayout());
35103 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
35104 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
35105
35106 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35107 // Regalloc does not need any help when the memory operand of CMPXCHG8B
35108 // does not use index register.
35109 if (AM.IndexReg == X86::NoRegister)
35110 return BB;
35111
35112 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
35113 // four operand definitions that are E[ABCD] registers. We skip them and
35114 // then insert the LEA.
35115 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
35116 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
35117 RMBBI->definesRegister(X86::EBX) ||
35118 RMBBI->definesRegister(X86::ECX) ||
35119 RMBBI->definesRegister(X86::EDX))) {
35120 ++RMBBI;
35121 }
35122 MachineBasicBlock::iterator MBBI(RMBBI);
35123 addFullAddress(
35124 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
35125
35126 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
35127
35128 return BB;
35129 }
35130 case X86::LCMPXCHG16B_NO_RBX: {
35131 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35132 Register BasePtr = TRI->getBaseRegister();
35133 if (TRI->hasBasePointer(*MF) &&
35134 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
35135 if (!BB->isLiveIn(BasePtr))
35136 BB->addLiveIn(BasePtr);
35137 // Save RBX into a virtual register.
35138 Register SaveRBX =
35139 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35140 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
35141 .addReg(X86::RBX);
35142 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35143 MachineInstrBuilder MIB =
35144 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
35145 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
35146 MIB.add(MI.getOperand(Idx));
35147 MIB.add(MI.getOperand(X86::AddrNumOperands));
35148 MIB.addReg(SaveRBX);
35149 } else {
35150 // Simple case, just copy the virtual register to RBX.
35151 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
35152 .add(MI.getOperand(X86::AddrNumOperands));
35153 MachineInstrBuilder MIB =
35154 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
35155 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
35156 MIB.add(MI.getOperand(Idx));
35157 }
35158 MI.eraseFromParent();
35159 return BB;
35160 }
35161 case X86::MWAITX: {
35162 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35163 Register BasePtr = TRI->getBaseRegister();
35164 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
35165 // If there is no need to save the base pointer, we generate MWAITXrrr;
35166 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
35167 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
35168 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
35169 .addReg(MI.getOperand(0).getReg());
35170 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
35171 .addReg(MI.getOperand(1).getReg());
35172 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
35173 .addReg(MI.getOperand(2).getReg());
35174 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
35175 MI.eraseFromParent();
35176 } else {
35177 if (!BB->isLiveIn(BasePtr)) {
35178 BB->addLiveIn(BasePtr);
35179 }
35180 // Parameters can be copied into ECX and EAX but not EBX yet.
35181 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
35182 .addReg(MI.getOperand(0).getReg());
35183 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
35184 .addReg(MI.getOperand(1).getReg());
35185 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
35186 // Save RBX into a virtual register.
35187 Register SaveRBX =
35188 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35189 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
35190 .addReg(X86::RBX);
35191 // Generate mwaitx pseudo.
35192 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35193 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
35194 .addDef(Dst) // Destination tied in with SaveRBX.
35195 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
35196 .addUse(SaveRBX); // Save of base pointer.
35197 MI.eraseFromParent();
35198 }
35199 return BB;
35200 }
35201 case TargetOpcode::PREALLOCATED_SETUP: {
35202 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
35203 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
35204 MFI->setHasPreallocatedCall(true);
35205 int64_t PreallocatedId = MI.getOperand(0).getImm();
35206 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
35207 assert(StackAdjustment != 0 && "0 stack adjustment");
35208 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
35209 << StackAdjustment << "\n");
35210 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
35211 .addReg(X86::ESP)
35212 .addImm(StackAdjustment);
35213 MI.eraseFromParent();
35214 return BB;
35215 }
35216 case TargetOpcode::PREALLOCATED_ARG: {
35217 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
35218 int64_t PreallocatedId = MI.getOperand(1).getImm();
35219 int64_t ArgIdx = MI.getOperand(2).getImm();
35220 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
35221 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
35222 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
35223 << ", arg offset " << ArgOffset << "\n");
35224 // stack pointer + offset
35225 addRegOffset(
35226 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
35227 X86::ESP, false, ArgOffset);
35228 MI.eraseFromParent();
35229 return BB;
35230 }
35231 case X86::PTDPBSSD:
35232 case X86::PTDPBSUD:
35233 case X86::PTDPBUSD:
35234 case X86::PTDPBUUD:
35235 case X86::PTDPBF16PS: {
35236 unsigned Opc;
35237 switch (MI.getOpcode()) {
35238 default: llvm_unreachable("illegal opcode!")__builtin_unreachable();
35239 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
35240 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
35241 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
35242 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
35243 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
35244 }
35245
35246 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
35247 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
35248 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
35249 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
35250 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
35251
35252 MI.eraseFromParent(); // The pseudo is gone now.
35253 return BB;
35254 }
35255 case X86::PTILEZERO: {
35256 unsigned Imm = MI.getOperand(0).getImm();
35257 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
35258 MI.eraseFromParent(); // The pseudo is gone now.
35259 return BB;
35260 }
35261 case X86::PTILELOADD:
35262 case X86::PTILELOADDT1:
35263 case X86::PTILESTORED: {
35264 unsigned Opc;
35265 switch (MI.getOpcode()) {
35266 default: llvm_unreachable("illegal opcode!")__builtin_unreachable();
35267 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
35268 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
35269 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
35270 }
35271
35272 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
35273 unsigned CurOp = 0;
35274 if (Opc != X86::TILESTORED)
35275 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
35276 RegState::Define);
35277
35278 MIB.add(MI.getOperand(CurOp++)); // base
35279 MIB.add(MI.getOperand(CurOp++)); // scale
35280 MIB.add(MI.getOperand(CurOp++)); // index -- stride
35281 MIB.add(MI.getOperand(CurOp++)); // displacement
35282 MIB.add(MI.getOperand(CurOp++)); // segment
35283
35284 if (Opc == X86::TILESTORED)
35285 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
35286 RegState::Undef);
35287
35288 MI.eraseFromParent(); // The pseudo is gone now.
35289 return BB;
35290 }
35291 }
35292}
35293
35294//===----------------------------------------------------------------------===//
35295// X86 Optimization Hooks
35296//===----------------------------------------------------------------------===//
35297
35298bool
35299X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
35300 const APInt &DemandedBits,
35301 const APInt &DemandedElts,
35302 TargetLoweringOpt &TLO) const {
35303 EVT VT = Op.getValueType();
35304 unsigned Opcode = Op.getOpcode();
35305 unsigned EltSize = VT.getScalarSizeInBits();
35306
35307 if (VT.isVector()) {
35308 // If the constant is all sign bits within the active bits, then we should
35309 // extend it to the entire constant to allow it to act as a boolean constant
35310 // vector.
35311 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
35312 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
35313 return false;
35314 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
35315 if (!DemandedElts[i] || V.getOperand(i).isUndef())
35316 continue;
35317 const APInt &Val = V.getConstantOperandAPInt(i);
35318 if (Val.getBitWidth() > Val.getNumSignBits() &&
35319 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
35320 return true;
35321 }
35322 return false;
35323 };
35324 // For vectors - if we have a constant, then try to sign extend.
35325 // TODO: Handle AND/ANDN cases.
35326 unsigned ActiveBits = DemandedBits.getActiveBits();
35327 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
35328 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
35329 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
35330 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
35331 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
35332 VT.getVectorNumElements());
35333 SDValue NewC =
35334 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
35335 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
35336 SDValue NewOp =
35337 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
35338 return TLO.CombineTo(Op, NewOp);
35339 }
35340 return false;
35341 }
35342
35343 // Only optimize Ands to prevent shrinking a constant that could be
35344 // matched by movzx.
35345 if (Opcode != ISD::AND)
35346 return false;
35347
35348 // Make sure the RHS really is a constant.
35349 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
35350 if (!C)
35351 return false;
35352
35353 const APInt &Mask = C->getAPIntValue();
35354
35355 // Clear all non-demanded bits initially.
35356 APInt ShrunkMask = Mask & DemandedBits;
35357
35358 // Find the width of the shrunk mask.
35359 unsigned Width = ShrunkMask.getActiveBits();
35360
35361 // If the mask is all 0s there's nothing to do here.
35362 if (Width == 0)
35363 return false;
35364
35365 // Find the next power of 2 width, rounding up to a byte.
35366 Width = PowerOf2Ceil(std::max(Width, 8U));
35367 // Truncate the width to size to handle illegal types.
35368 Width = std::min(Width, EltSize);
35369
35370 // Calculate a possible zero extend mask for this constant.
35371 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
35372
35373 // If we aren't changing the mask, just return true to keep it and prevent
35374 // the caller from optimizing.
35375 if (ZeroExtendMask == Mask)
35376 return true;
35377
35378 // Make sure the new mask can be represented by a combination of mask bits
35379 // and non-demanded bits.
35380 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
35381 return false;
35382
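 // Worked example (not from the source, just tracing the steps above): for an
 // i32 AND with Mask = 0xFFFF and DemandedBits = 0x0FFF, ShrunkMask = 0x0FFF,
 // Width rounds up to 16, and ZeroExtendMask == Mask, so we return true and
 // keep the movzwl-friendly 0xFFFF. With Mask = 0x1FFFF instead, ZeroExtendMask
 // is 0xFFFF, which is covered by Mask | ~DemandedBits, so the constant is
 // replaced below.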
35383 // Replace the constant with the zero extend mask.
35384 SDLoc DL(Op);
35385 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
35386 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
35387 return TLO.CombineTo(Op, NewOp);
35388}
35389
35390void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
35391 KnownBits &Known,
35392 const APInt &DemandedElts,
35393 const SelectionDAG &DAG,
35394 unsigned Depth) const {
35395 unsigned BitWidth = Known.getBitWidth();
35396 unsigned NumElts = DemandedElts.getBitWidth();
35397 unsigned Opc = Op.getOpcode();
35398 EVT VT = Op.getValueType();
35399 assert((Opc >= ISD::BUILTIN_OP_END ||
35400 Opc == ISD::INTRINSIC_WO_CHAIN ||
35401 Opc == ISD::INTRINSIC_W_CHAIN ||
35402 Opc == ISD::INTRINSIC_VOID) &&
35403 "Should use MaskedValueIsZero if you don't know whether Op"
35404 " is a target node!");
35405
35406 Known.resetAll();
35407 switch (Opc) {
35408 default: break;
35409 case X86ISD::SETCC:
35410 Known.Zero.setBitsFrom(1);
35411 break;
35412 case X86ISD::MOVMSK: {
35413 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
35414 Known.Zero.setBitsFrom(NumLoBits);
35415 break;
35416 }
35417 case X86ISD::PEXTRB:
35418 case X86ISD::PEXTRW: {
35419 SDValue Src = Op.getOperand(0);
35420 EVT SrcVT = Src.getValueType();
35421 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
35422 Op.getConstantOperandVal(1));
35423 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
35424 Known = Known.anyextOrTrunc(BitWidth);
35425 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
35426 break;
35427 }
35428 case X86ISD::VSRAI:
35429 case X86ISD::VSHLI:
35430 case X86ISD::VSRLI: {
35431 unsigned ShAmt = Op.getConstantOperandVal(1);
35432 if (ShAmt >= VT.getScalarSizeInBits()) {
35433 Known.setAllZero();
35434 break;
35435 }
35436
35437 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35438 if (Opc == X86ISD::VSHLI) {
35439 Known.Zero <<= ShAmt;
35440 Known.One <<= ShAmt;
35441 // Low bits are known zero.
35442 Known.Zero.setLowBits(ShAmt);
35443 } else if (Opc == X86ISD::VSRLI) {
35444 Known.Zero.lshrInPlace(ShAmt);
35445 Known.One.lshrInPlace(ShAmt);
35446 // High bits are known zero.
35447 Known.Zero.setHighBits(ShAmt);
35448 } else {
35449 Known.Zero.ashrInPlace(ShAmt);
35450 Known.One.ashrInPlace(ShAmt);
35451 }
35452 break;
35453 }
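// [Editorial worked example] For an 8-bit element whose value is known to be
// 0x0F (Known.One = 0x0F, Known.Zero = 0xF0), a VSHLI by 4 shifts both sets
// left (Known.One becomes 0xF0, Known.Zero becomes 0x00) and then marks the
// low 4 bits as zero, so the element is known to be exactly 0xF0. A VSRLI by
// 4 on the same input gives a value known to be exactly 0x00, and a VSRAI by
// 4 likewise gives exactly 0x00, since the sign bit is known zero.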
35454 case X86ISD::PACKUS: {
35455 // PACKUS is just a truncation if the upper half is zero.
35456 APInt DemandedLHS, DemandedRHS;
35457 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
35458
35459 Known.One = APInt::getAllOnesValue(BitWidth * 2);
35460 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
35461
35462 KnownBits Known2;
35463 if (!!DemandedLHS) {
35464 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35465 Known = KnownBits::commonBits(Known, Known2);
35466 }
35467 if (!!DemandedRHS) {
35468 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
35469 Known = KnownBits::commonBits(Known, Known2);
35470 }
35471
35472 if (Known.countMinLeadingZeros() < BitWidth)
35473 Known.resetAll();
35474 Known = Known.trunc(BitWidth);
35475 break;
35476 }
35477 case X86ISD::VBROADCAST: {
35478 SDValue Src = Op.getOperand(0);
35479 if (!Src.getSimpleValueType().isVector()) {
35480 Known = DAG.computeKnownBits(Src, Depth + 1);
35481 return;
35482 }
35483 break;
35484 }
35485 case X86ISD::ANDNP: {
35486 KnownBits Known2;
35487 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35488 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35489
35490 // ANDNP = (~X & Y);
35491 Known.One &= Known2.Zero;
35492 Known.Zero |= Known2.One;
35493 break;
35494 }
35495 case X86ISD::FOR: {
35496 KnownBits Known2;
35497 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35498 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35499
35500 Known |= Known2;
35501 break;
35502 }
35503 case X86ISD::PSADBW: {
35504 assert(VT.getScalarType() == MVT::i64 &&
35505 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
35506 "Unexpected PSADBW types");
35507
35508 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
35509 Known.Zero.setBitsFrom(16);
35510 break;
35511 }
35512 case X86ISD::PMULUDQ: {
35513 KnownBits Known2;
35514 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35515 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35516
35517 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
35518 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
35519 Known = KnownBits::mul(Known, Known2);
35520 break;
35521 }
35522 case X86ISD::CMOV: {
35523 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
35524 // If we don't know any bits, early out.
35525 if (Known.isUnknown())
35526 break;
35527 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
35528
35529 // Only known if known in both the LHS and RHS.
35530 Known = KnownBits::commonBits(Known, Known2);
35531 break;
35532 }
35533 case X86ISD::BEXTR:
35534 case X86ISD::BEXTRI: {
35535 SDValue Op0 = Op.getOperand(0);
35536 SDValue Op1 = Op.getOperand(1);
35537
35538 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
35539 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
35540 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
35541
35542 // If the length is 0, the result is 0.
35543 if (Length == 0) {
35544 Known.setAllZero();
35545 break;
35546 }
35547
35548 if ((Shift + Length) <= BitWidth) {
35549 Known = DAG.computeKnownBits(Op0, Depth + 1);
35550 Known = Known.extractBits(Length, Shift);
35551 Known = Known.zextOrTrunc(BitWidth);
35552 }
35553 }
35554 break;
35555 }
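// [Editorial worked example] The BEXTR control operand packs the start bit in
// bits [7:0] and the field length in bits [15:8]. A constant control of
// 0x0408 therefore extracts 4 bits starting at bit 8: the known bits of
// source bits [11:8] move down to bits [3:0], and everything above is known
// zero.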
35556 case X86ISD::PDEP: {
35557 KnownBits Known2;
35558 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35559 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35560 // Zeros are retained from the mask operand. But not ones.
35561 Known.One.clearAllBits();
35562 // The result will have at least as many trailing zeros as the non-mask
35563 // operand since bits can only map to the same or higher bit position.
35564 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
35565 break;
35566 }
35567 case X86ISD::PEXT: {
35568 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35569 // The result has as many leading zeros as the number of zeroes in the mask.
35570 unsigned Count = Known.Zero.countPopulation();
35571 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
35572 Known.One.clearAllBits();
35573 break;
35574 }
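// [Editorial worked examples] PDEP deposits source bit j at the position of
// the j-th set mask bit, which is always >= j, so if the source operand is
// known to be a multiple of 8 (three trailing zeros) the PDEP result is too.
// PEXT can set at most popcount(mask) result bits; if the mask of an 8-bit
// element is known to have its top four bits clear, the result fits in the
// low four bits and the top four bits are known zero, which is exactly what
// the countPopulation of the mask's known-zero bits computes above.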
35575 case X86ISD::VTRUNC:
35576 case X86ISD::VTRUNCS:
35577 case X86ISD::VTRUNCUS:
35578 case X86ISD::CVTSI2P:
35579 case X86ISD::CVTUI2P:
35580 case X86ISD::CVTP2SI:
35581 case X86ISD::CVTP2UI:
35582 case X86ISD::MCVTP2SI:
35583 case X86ISD::MCVTP2UI:
35584 case X86ISD::CVTTP2SI:
35585 case X86ISD::CVTTP2UI:
35586 case X86ISD::MCVTTP2SI:
35587 case X86ISD::MCVTTP2UI:
35588 case X86ISD::MCVTSI2P:
35589 case X86ISD::MCVTUI2P:
35590 case X86ISD::VFPROUND:
35591 case X86ISD::VMFPROUND:
35592 case X86ISD::CVTPS2PH:
35593 case X86ISD::MCVTPS2PH: {
35594 // Truncations/Conversions - upper elements are known zero.
35595 EVT SrcVT = Op.getOperand(0).getValueType();
35596 if (SrcVT.isVector()) {
35597 unsigned NumSrcElts = SrcVT.getVectorNumElements();
35598 if (NumElts > NumSrcElts &&
35599 DemandedElts.countTrailingZeros() >= NumSrcElts)
35600 Known.setAllZero();
35601 }
35602 break;
35603 }
35604 case X86ISD::STRICT_CVTTP2SI:
35605 case X86ISD::STRICT_CVTTP2UI:
35606 case X86ISD::STRICT_CVTSI2P:
35607 case X86ISD::STRICT_CVTUI2P:
35608 case X86ISD::STRICT_VFPROUND:
35609 case X86ISD::STRICT_CVTPS2PH: {
35610 // Strict Conversions - upper elements are known zero.
35611 EVT SrcVT = Op.getOperand(1).getValueType();
35612 if (SrcVT.isVector()) {
35613 unsigned NumSrcElts = SrcVT.getVectorNumElements();
35614 if (NumElts > NumSrcElts &&
35615 DemandedElts.countTrailingZeros() >= NumSrcElts)
35616 Known.setAllZero();
35617 }
35618 break;
35619 }
35620 case X86ISD::MOVQ2DQ: {
35621 // Move from MMX to XMM. Upper half of XMM should be 0.
35622 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
35623 Known.setAllZero();
35624 break;
35625 }
35626 }
35627
35628 // Handle target shuffles.
35629 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
35630 if (isTargetShuffle(Opc)) {
35631 SmallVector<int, 64> Mask;
35632 SmallVector<SDValue, 2> Ops;
35633 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35634 unsigned NumOps = Ops.size();
35635 unsigned NumElts = VT.getVectorNumElements();
35636 if (Mask.size() == NumElts) {
35637 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35638 Known.Zero.setAllBits(); Known.One.setAllBits();
35639 for (unsigned i = 0; i != NumElts; ++i) {
35640 if (!DemandedElts[i])
35641 continue;
35642 int M = Mask[i];
35643 if (M == SM_SentinelUndef) {
35644 // For UNDEF elements, we don't know anything about the common state
35645 // of the shuffle result.
35646 Known.resetAll();
35647 break;
35648 }
35649 if (M == SM_SentinelZero) {
35650 Known.One.clearAllBits();
35651 continue;
35652 }
35653 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35654 "Shuffle index out of range");
35655
35656 unsigned OpIdx = (unsigned)M / NumElts;
35657 unsigned EltIdx = (unsigned)M % NumElts;
35658 if (Ops[OpIdx].getValueType() != VT) {
35659 // TODO - handle target shuffle ops with different value types.
35660 Known.resetAll();
35661 break;
35662 }
35663 DemandedOps[OpIdx].setBit(EltIdx);
35664 }
35665 // Known bits are the values that are shared by every demanded element.
35666 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
35667 if (!DemandedOps[i])
35668 continue;
35669 KnownBits Known2 =
35670 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
35671 Known = KnownBits::commonBits(Known, Known2);
35672 }
35673 }
35674 }
35675 }
35676}
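// [Editorial sketch - not part of X86ISelLowering.cpp] A minimal model of the
// KnownBits merging used by the target-shuffle handling above: a bit is known
// in the result only if it is known, with the same value, in every demanded
// input. MiniKnown and commonKnown are hypothetical stand-ins for
// llvm::KnownBits and KnownBits::commonBits.
#include <cstdint>

struct MiniKnown {
  uint64_t Zero = 0; // bits known to be 0
  uint64_t One = 0;  // bits known to be 1
};

static MiniKnown commonKnown(MiniKnown A, MiniKnown B) {
  // Keep only the knowledge shared by both inputs.
  return {A.Zero & B.Zero, A.One & B.One};
}
// e.g. if one demanded element is known to be 0x00FF and another is known to
// be 0x0FFF (16-bit elements), the merged result is known only in bits [7:0]
// (all ones) and bits [15:12] (all zeros).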
35677
35678unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
35679 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
35680 unsigned Depth) const {
35681 EVT VT = Op.getValueType();
35682 unsigned VTBits = VT.getScalarSizeInBits();
35683 unsigned Opcode = Op.getOpcode();
35684 switch (Opcode) {
35685 case X86ISD::SETCC_CARRY:
35686 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
35687 return VTBits;
35688
35689 case X86ISD::VTRUNC: {
35690 SDValue Src = Op.getOperand(0);
35691 MVT SrcVT = Src.getSimpleValueType();
35692 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
35693 assert(VTBits < NumSrcBits && "Illegal truncation input type");
35694 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
35695 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
35696 if (Tmp > (NumSrcBits - VTBits))
35697 return Tmp - (NumSrcBits - VTBits);
35698 return 1;
35699 }
35700
35701 case X86ISD::PACKSS: {
35702 // PACKSS is just a truncation if the sign bits extend to the packed size.
35703 APInt DemandedLHS, DemandedRHS;
35704 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
35705 DemandedRHS);
35706
35707 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
35708 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
35709 if (!!DemandedLHS)
35710 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35711 if (!!DemandedRHS)
35712 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
35713 unsigned Tmp = std::min(Tmp0, Tmp1);
35714 if (Tmp > (SrcBits - VTBits))
35715 return Tmp - (SrcBits - VTBits);
35716 return 1;
35717 }
35718
35719 case X86ISD::VBROADCAST: {
35720 SDValue Src = Op.getOperand(0);
35721 if (!Src.getSimpleValueType().isVector())
35722 return DAG.ComputeNumSignBits(Src, Depth + 1);
35723 break;
35724 }
35725
35726 case X86ISD::VSHLI: {
35727 SDValue Src = Op.getOperand(0);
35728 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
35729 if (ShiftVal.uge(VTBits))
35730 return VTBits; // Shifted all bits out --> zero.
35731 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35732 if (ShiftVal.uge(Tmp))
35733 return 1; // Shifted all sign bits out --> unknown.
35734 return Tmp - ShiftVal.getZExtValue();
35735 }
35736
35737 case X86ISD::VSRAI: {
35738 SDValue Src = Op.getOperand(0);
35739 APInt ShiftVal = Op.getConstantOperandAPInt(1);
35740 if (ShiftVal.uge(VTBits - 1))
35741 return VTBits; // Sign splat.
35742 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35743 ShiftVal += Tmp;
35744 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
35745 }
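// [Editorial worked example] For a v8i16 VSRAI by 3 where each source element
// is already known to have 5 sign bits, every result element has
// min(16, 5 + 3) = 8 sign bits. Conversely, a VSHLI by 3 on the same input
// leaves only 5 - 3 = 2 known sign bits, since shifting left discards copies
// of the sign bit.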
35746
35747 case X86ISD::FSETCC:
35748 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
35749 if (VT == MVT::f32 || VT == MVT::f64 ||
35750 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
35751 return VTBits;
35752 break;
35753
35754 case X86ISD::PCMPGT:
35755 case X86ISD::PCMPEQ:
35756 case X86ISD::CMPP:
35757 case X86ISD::VPCOM:
35758 case X86ISD::VPCOMU:
35759 // Vector compares return zero/all-bits result values.
35760 return VTBits;
35761
35762 case X86ISD::ANDNP: {
35763 unsigned Tmp0 =
35764 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
35765 if (Tmp0 == 1) return 1; // Early out.
35766 unsigned Tmp1 =
35767 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
35768 return std::min(Tmp0, Tmp1);
35769 }
35770
35771 case X86ISD::CMOV: {
35772 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
35773 if (Tmp0 == 1) return 1; // Early out.
35774 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
35775 return std::min(Tmp0, Tmp1);
35776 }
35777 }
35778
35779 // Handle target shuffles.
35780 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
35781 if (isTargetShuffle(Opcode)) {
35782 SmallVector<int, 64> Mask;
35783 SmallVector<SDValue, 2> Ops;
35784 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35785 unsigned NumOps = Ops.size();
35786 unsigned NumElts = VT.getVectorNumElements();
35787 if (Mask.size() == NumElts) {
35788 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35789 for (unsigned i = 0; i != NumElts; ++i) {
35790 if (!DemandedElts[i])
35791 continue;
35792 int M = Mask[i];
35793 if (M == SM_SentinelUndef) {
35794 // For UNDEF elements, we don't know anything about the common state
35795 // of the shuffle result.
35796 return 1;
35797 } else if (M == SM_SentinelZero) {
35798 // Zero = all sign bits.
35799 continue;
35800 }
35801 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35802 "Shuffle index out of range");
35803
35804 unsigned OpIdx = (unsigned)M / NumElts;
35805 unsigned EltIdx = (unsigned)M % NumElts;
35806 if (Ops[OpIdx].getValueType() != VT) {
35807 // TODO - handle target shuffle ops with different value types.
35808 return 1;
35809 }
35810 DemandedOps[OpIdx].setBit(EltIdx);
35811 }
35812 unsigned Tmp0 = VTBits;
35813 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
35814 if (!DemandedOps[i])
35815 continue;
35816 unsigned Tmp1 =
35817 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
35818 Tmp0 = std::min(Tmp0, Tmp1);
35819 }
35820 return Tmp0;
35821 }
35822 }
35823 }
35824
35825 // Fallback case.
35826 return 1;
35827}
35828
35829SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
35830 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
35831 return N->getOperand(0);
35832 return N;
35833}
35834
35835// Helper to look for a normal load that can be narrowed into a vzload with the
35836// specified VT and memory VT. Returns SDValue() on failure.
35837static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
35838 SelectionDAG &DAG) {
35839 // Can't if the load is volatile or atomic.
35840 if (!LN->isSimple())
35841 return SDValue();
35842
35843 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35844 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
35845 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
35846 LN->getPointerInfo(), LN->getOriginalAlign(),
35847 LN->getMemOperand()->getFlags());
35848}
35849
35850// Attempt to match a combined shuffle mask against supported unary shuffle
35851// instructions.
35852// TODO: Investigate sharing more of this with shuffle lowering.
35853static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35854 bool AllowFloatDomain, bool AllowIntDomain,
35855 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
35856 const X86Subtarget &Subtarget, unsigned &Shuffle,
35857 MVT &SrcVT, MVT &DstVT) {
35858 unsigned NumMaskElts = Mask.size();
35859 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35860
35861 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
35862 if (Mask[0] == 0 &&
35863 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
35864 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
35865 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35866 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
35867 Shuffle = X86ISD::VZEXT_MOVL;
35868 SrcVT = DstVT =
35869 !Subtarget.hasSSE2() && MaskEltSize == 32 ? MVT::v4f32 : MaskVT;
35870 return true;
35871 }
35872 }
35873
35874 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35875 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
35876 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35877 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35878 unsigned MaxScale = 64 / MaskEltSize;
35879 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35880 bool MatchAny = true;
35881 bool MatchZero = true;
35882 unsigned NumDstElts = NumMaskElts / Scale;
35883 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35884 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35885 MatchAny = MatchZero = false;
35886 break;
35887 }
35888 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35889 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35890 }
35891 if (MatchAny || MatchZero) {
35892 assert(MatchZero && "Failed to match zext but matched aext?");
35893 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35894 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35895 MVT::getIntegerVT(MaskEltSize);
35896 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35897
35898 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35899 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35900
35901 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35902 if (SrcVT.getVectorNumElements() != NumDstElts)
35903 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35904
35905 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35906 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35907 return true;
35908 }
35909 }
35910 }
35911
35912 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
35913 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35914 isUndefOrEqual(Mask[0], 0) &&
35915 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35916 Shuffle = X86ISD::VZEXT_MOVL;
35917 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35918 return true;
35919 }
35920
35921 // Check if we have SSE3, which will let us use MOVDDUP etc. These
35922 // instructions are no slower than UNPCKLPD but have the option to
35923 // fold the input operand into even an unaligned memory load.
35924 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35925 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35926 Shuffle = X86ISD::MOVDDUP;
35927 SrcVT = DstVT = MVT::v2f64;
35928 return true;
35929 }
35930 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35931 Shuffle = X86ISD::MOVSLDUP;
35932 SrcVT = DstVT = MVT::v4f32;
35933 return true;
35934 }
35935 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35936 Shuffle = X86ISD::MOVSHDUP;
35937 SrcVT = DstVT = MVT::v4f32;
35938 return true;
35939 }
35940 }
35941
35942 if (MaskVT.is256BitVector() && AllowFloatDomain) {
35943 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35944 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35945 Shuffle = X86ISD::MOVDDUP;
35946 SrcVT = DstVT = MVT::v4f64;
35947 return true;
35948 }
35949 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35950 Shuffle = X86ISD::MOVSLDUP;
35951 SrcVT = DstVT = MVT::v8f32;
35952 return true;
35953 }
35954 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35955 Shuffle = X86ISD::MOVSHDUP;
35956 SrcVT = DstVT = MVT::v8f32;
35957 return true;
35958 }
35959 }
35960
35961 if (MaskVT.is512BitVector() && AllowFloatDomain) {
35962 assert(Subtarget.hasAVX512() &&
35963 "AVX512 required for 512-bit vector shuffles");
35964 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35965 Shuffle = X86ISD::MOVDDUP;
35966 SrcVT = DstVT = MVT::v8f64;
35967 return true;
35968 }
35969 if (isTargetShuffleEquivalent(
35970 MaskVT, Mask,
35971 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35972 Shuffle = X86ISD::MOVSLDUP;
35973 SrcVT = DstVT = MVT::v16f32;
35974 return true;
35975 }
35976 if (isTargetShuffleEquivalent(
35977 MaskVT, Mask,
35978 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35979 Shuffle = X86ISD::MOVSHDUP;
35980 SrcVT = DstVT = MVT::v16f32;
35981 return true;
35982 }
35983 }
35984
35985 return false;
35986}
35987
35988// Attempt to match a combined shuffle mask against supported unary immediate
35989// permute instructions.
35990// TODO: Investigate sharing more of this with shuffle lowering.
35991static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35992 const APInt &Zeroable,
35993 bool AllowFloatDomain, bool AllowIntDomain,
35994 const X86Subtarget &Subtarget,
35995 unsigned &Shuffle, MVT &ShuffleVT,
35996 unsigned &PermuteImm) {
35997 unsigned NumMaskElts = Mask.size();
35998 unsigned InputSizeInBits = MaskVT.getSizeInBits();
35999 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
36000 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
36001 bool ContainsZeros = isAnyZero(Mask);
36002
36003 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
36004 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
36005 // Check for lane crossing permutes.
36006 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
36007 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
36008 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
36009 Shuffle = X86ISD::VPERMI;
36010 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
36011 PermuteImm = getV4X86ShuffleImm(Mask);
36012 return true;
36013 }
36014 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
36015 SmallVector<int, 4> RepeatedMask;
36016 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
36017 Shuffle = X86ISD::VPERMI;
36018 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
36019 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
36020 return true;
36021 }
36022 }
36023 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
36024 // VPERMILPD can permute with a non-repeating shuffle.
36025 Shuffle = X86ISD::VPERMILPI;
36026 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
36027 PermuteImm = 0;
36028 for (int i = 0, e = Mask.size(); i != e; ++i) {
36029 int M = Mask[i];
36030 if (M == SM_SentinelUndef)
36031 continue;
36032 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
36033 PermuteImm |= (M & 1) << i;
36034 }
36035 return true;
36036 }
36037 }
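// [Editorial worked example] Each mask element contributes one bit to the
// VPERMILPD immediate: bit i selects the even or odd f64 within element i's
// 128-bit lane. A v4f64 mask of {1, 0, 3, 2} therefore encodes as
// (1<<0) | (0<<1) | (1<<2) | (0<<3) = 0x5, which swaps the two doubles in
// each lane.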
36038
36039 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
36040 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
36041 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
36042 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
36043 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
36044 SmallVector<int, 4> RepeatedMask;
36045 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36046 // Narrow the repeated mask to create 32-bit element permutes.
36047 SmallVector<int, 4> WordMask = RepeatedMask;
36048 if (MaskScalarSizeInBits == 64)
36049 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
36050
36051 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
36052 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
36053 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
36054 PermuteImm = getV4X86ShuffleImm(WordMask);
36055 return true;
36056 }
36057 }
36058
36059 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
36060 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
36061 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36062 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36063 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36064 SmallVector<int, 4> RepeatedMask;
36065 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36066 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
36067 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
36068
36069 // PSHUFLW: permute lower 4 elements only.
36070 if (isUndefOrInRange(LoMask, 0, 4) &&
36071 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
36072 Shuffle = X86ISD::PSHUFLW;
36073 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36074 PermuteImm = getV4X86ShuffleImm(LoMask);
36075 return true;
36076 }
36077
36078 // PSHUFHW: permute upper 4 elements only.
36079 if (isUndefOrInRange(HiMask, 4, 8) &&
36080 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
36081 // Offset the HiMask so that we can create the shuffle immediate.
36082 int OffsetHiMask[4];
36083 for (int i = 0; i != 4; ++i)
36084 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
36085
36086 Shuffle = X86ISD::PSHUFHW;
36087 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36088 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
36089 return true;
36090 }
36091 }
36092 }
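// [Editorial note] getV4X86ShuffleImm packs four 2-bit selectors into the
// classic x86 shuffle immediate, low element first, so the PSHUFLW/PSHUFHW
// masks above become imm8 values directly; for example the reversing mask
// {3, 2, 1, 0} encodes as 0b00011011 = 0x1B. (The exact handling of undef
// lanes here is an assumption; the helper may default them to any value.)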
36093
36094 // Attempt to match against byte/bit shifts.
36095 if (AllowIntDomain &&
36096 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36097 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36098 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36099 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
36100 Mask, 0, Zeroable, Subtarget);
36101 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
36102 32 <= ShuffleVT.getScalarSizeInBits())) {
36103 PermuteImm = (unsigned)ShiftAmt;
36104 return true;
36105 }
36106 }
36107
36108 // Attempt to match against bit rotates.
36109 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
36110 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
36111 Subtarget.hasAVX512())) {
36112 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
36113 Subtarget, Mask);
36114 if (0 < RotateAmt) {
36115 Shuffle = X86ISD::VROTLI;
36116 PermuteImm = (unsigned)RotateAmt;
36117 return true;
36118 }
36119 }
36120
36121 return false;
36122}
36123
36124 // Attempt to match a combined shuffle mask against supported binary
36125 // shuffle instructions.
36126// TODO: Investigate sharing more of this with shuffle lowering.
36127static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36128 bool AllowFloatDomain, bool AllowIntDomain,
36129 SDValue &V1, SDValue &V2, const SDLoc &DL,
36130 SelectionDAG &DAG, const X86Subtarget &Subtarget,
36131 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
36132 bool IsUnary) {
36133 unsigned NumMaskElts = Mask.size();
36134 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
36135
36136 if (MaskVT.is128BitVector()) {
36137 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
36138 V2 = V1;
36139 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
36140 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
36141 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
36142 return true;
36143 }
36144 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
36145 V2 = V1;
36146 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
36147 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
36148 return true;
36149 }
36150 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
36151 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
36152 std::swap(V1, V2);
36153 Shuffle = X86ISD::MOVSD;
36154 SrcVT = DstVT = MVT::v2f64;
36155 return true;
36156 }
36157 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
36158 (AllowFloatDomain || !Subtarget.hasSSE41())) {
36159 Shuffle = X86ISD::MOVSS;
36160 SrcVT = DstVT = MVT::v4f32;
36161 return true;
36162 }
36163 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
36164 Subtarget.hasFP16()) {
36165 Shuffle = X86ISD::MOVSH;
36166 SrcVT = DstVT = MVT::v8f16;
36167 return true;
36168 }
36169 }
36170
36171 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
36172 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
36173 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
36174 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
36175 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
36176 Subtarget)) {
36177 DstVT = MaskVT;
36178 return true;
36179 }
36180 }
36181
36182 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
36183 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
36184 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36185 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
36186 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36187 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
36188 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
36189 Subtarget)) {
36190 SrcVT = DstVT = MaskVT;
36191 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
36192 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
36193 return true;
36194 }
36195 }
36196
36197 // Attempt to match against an OR if we're performing a blend shuffle and the
36198 // non-blended source element is zero in each case.
36199 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
36200 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
36201 bool IsBlend = true;
36202 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
36203 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
36204 unsigned Scale1 = NumV1Elts / NumMaskElts;
36205 unsigned Scale2 = NumV2Elts / NumMaskElts;
36206 APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
36207 APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
36208 for (unsigned i = 0; i != NumMaskElts; ++i) {
36209 int M = Mask[i];
36210 if (M == SM_SentinelUndef)
36211 continue;
36212 if (M == SM_SentinelZero) {
36213 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
36214 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
36215 continue;
36216 }
36217 if (M == (int)i) {
36218 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
36219 continue;
36220 }
36221 if (M == (int)(i + NumMaskElts)) {
36222 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
36223 continue;
36224 }
36225 IsBlend = false;
36226 break;
36227 }
36228 if (IsBlend &&
36229 DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
36230 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
36231 Shuffle = ISD::OR;
36232 SrcVT = DstVT = MaskVT.changeTypeToInteger();
36233 return true;
36234 }
36235 }
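// [Editorial worked example] For a v4i32 mask of {0, 5, 2, 7}, elements 0 and
// 2 come from V1 and elements 1 and 3 come from V2. The blend can be emitted
// as a plain OR only if V1 is known zero in elements 1 and 3 and V2 is known
// zero in elements 0 and 2, which is exactly what the two computeKnownBits
// queries above verify.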
36236
36237 return false;
36238}
36239
36240static bool matchBinaryPermuteShuffle(
36241 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
36242 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
36243 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
36244 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
36245 unsigned NumMaskElts = Mask.size();
36246 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
36247
36248 // Attempt to match against VALIGND/VALIGNQ rotate.
36249 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
36250 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
36251 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
36252 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36253 if (!isAnyZero(Mask)) {
36254 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
36255 if (0 < Rotation) {
36256 Shuffle = X86ISD::VALIGN;
36257 if (EltSizeInBits == 64)
36258 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
36259 else
36260 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
36261 PermuteImm = Rotation;
36262 return true;
36263 }
36264 }
36265 }
36266
36267 // Attempt to match against PALIGNR byte rotate.
36268 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36269 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36270 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36271 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
36272 if (0 < ByteRotation) {
36273 Shuffle = X86ISD::PALIGNR;
36274 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
36275 PermuteImm = ByteRotation;
36276 return true;
36277 }
36278 }
36279
36280 // Attempt to combine to X86ISD::BLENDI.
36281 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
36282 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
36283 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
36284 uint64_t BlendMask = 0;
36285 bool ForceV1Zero = false, ForceV2Zero = false;
36286 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
36287 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
36288 ForceV2Zero, BlendMask)) {
36289 if (MaskVT == MVT::v16i16) {
36290 // We can only use v16i16 PBLENDW if the lanes are repeated.
36291 SmallVector<int, 8> RepeatedMask;
36292 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
36293 RepeatedMask)) {
36294 assert(RepeatedMask.size() == 8 &&
36295 "Repeated mask size doesn't match!");
36296 PermuteImm = 0;
36297 for (int i = 0; i < 8; ++i)
36298 if (RepeatedMask[i] >= 8)
36299 PermuteImm |= 1 << i;
36300 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
36301 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
36302 Shuffle = X86ISD::BLENDI;
36303 ShuffleVT = MaskVT;
36304 return true;
36305 }
36306 } else {
36307 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
36308 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
36309 PermuteImm = (unsigned)BlendMask;
36310 Shuffle = X86ISD::BLENDI;
36311 ShuffleVT = MaskVT;
36312 return true;
36313 }
36314 }
36315 }
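// [Editorial note] For v16i16 the 8-bit PBLENDW immediate is applied to each
// 128-bit lane separately, which is why the mask must repeat across lanes
// before it can be encoded; bit i of PermuteImm selects the second source for
// word i of every lane.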
36316
36317 // Attempt to combine to INSERTPS, but only if it has elements that need to
36318 // be set to zero.
36319 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
36320 MaskVT.is128BitVector() && isAnyZero(Mask) &&
36321 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
36322 Shuffle = X86ISD::INSERTPS;
36323 ShuffleVT = MVT::v4f32;
36324 return true;
36325 }
36326
36327 // Attempt to combine to SHUFPD.
36328 if (AllowFloatDomain && EltSizeInBits == 64 &&
36329 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36330 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
36331 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36332 bool ForceV1Zero = false, ForceV2Zero = false;
36333 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
36334 PermuteImm, Mask, Zeroable)) {
36335 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
36336 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
36337 Shuffle = X86ISD::SHUFP;
36338 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
36339 return true;
36340 }
36341 }
36342
36343 // Attempt to combine to SHUFPS.
36344 if (AllowFloatDomain && EltSizeInBits == 32 &&
36345 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
36346 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
36347 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36348 SmallVector<int, 4> RepeatedMask;
36349 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
36350 // Match each half of the repeated mask to determine if it's just
36351 // referencing one of the vectors, is zeroable, or is entirely undef.
36352 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
36353 int M0 = RepeatedMask[Offset];
36354 int M1 = RepeatedMask[Offset + 1];
36355
36356 if (isUndefInRange(RepeatedMask, Offset, 2)) {
36357 return DAG.getUNDEF(MaskVT);
36358 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
36359 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
36360 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
36361 return getZeroVector(MaskVT, Subtarget, DAG, DL);
36362 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
36363 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
36364 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
36365 return V1;
36366 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
36367 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
36368 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
36369 return V2;
36370 }
36371
36372 return SDValue();
36373 };
36374
36375 int ShufMask[4] = {-1, -1, -1, -1};
36376 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
36377 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
36378
36379 if (Lo && Hi) {
36380 V1 = Lo;
36381 V2 = Hi;
36382 Shuffle = X86ISD::SHUFP;
36383 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
36384 PermuteImm = getV4X86ShuffleImm(ShufMask);
36385 return true;
36386 }
36387 }
36388 }
36389
36390 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
36391 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
36392 MaskVT.is128BitVector() &&
36393 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
36394 Shuffle = X86ISD::INSERTPS;
36395 ShuffleVT = MVT::v4f32;
36396 return true;
36397 }
36398
36399 return false;
36400}
36401
36402static SDValue combineX86ShuffleChainWithExtract(
36403 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36404 bool HasVariableMask, bool AllowVariableCrossLaneMask,
36405 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36406 const X86Subtarget &Subtarget);
36407
36408/// Combine an arbitrary chain of shuffles into a single instruction if
36409/// possible.
36410///
36411/// This is the leaf of the recursive combine below. When we have found some
36412/// chain of single-use x86 shuffle instructions and accumulated the combined
36413/// shuffle mask represented by them, this will try to pattern match that mask
36414/// into either a single instruction if there is a special purpose instruction
36415/// for this operation, or into a PSHUFB instruction which is a fully general
36416/// instruction but should only be used to replace chains over a certain depth.
36417static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
36418 ArrayRef<int> BaseMask, int Depth,
36419 bool HasVariableMask,
36420 bool AllowVariableCrossLaneMask,
36421 bool AllowVariablePerLaneMask,
36422 SelectionDAG &DAG,
36423 const X86Subtarget &Subtarget) {
36424 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
36425 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
36426 "Unexpected number of shuffle inputs!");
36427
36428 MVT RootVT = Root.getSimpleValueType();
36429 unsigned RootSizeInBits = RootVT.getSizeInBits();
36430 unsigned NumRootElts = RootVT.getVectorNumElements();
36431
36432 // Canonicalize shuffle input op to the requested type.
36433 // TODO: Support cases where Op is smaller than VT.
36434 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
36435 return DAG.getBitcast(VT, Op);
36436 };
36437
36438 // Find the inputs that enter the chain. Note that multiple uses are OK
36439 // here; we're not going to remove the operands we find.
36440 bool UnaryShuffle = (Inputs.size() == 1);
36441 SDValue V1 = peekThroughBitcasts(Inputs[0]);
36442 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
36443 : peekThroughBitcasts(Inputs[1]));
36444
36445 MVT VT1 = V1.getSimpleValueType();
36446 MVT VT2 = V2.getSimpleValueType();
36447 assert(VT1.getSizeInBits() == RootSizeInBits &&
36448 VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
36449
36450 SDLoc DL(Root);
36451 SDValue Res;
36452
36453 unsigned NumBaseMaskElts = BaseMask.size();
36454 if (NumBaseMaskElts == 1) {
36455 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
36456 return CanonicalizeShuffleInput(RootVT, V1);
36457 }
36458
36459 bool OptForSize = DAG.shouldOptForSize();
36460 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
36461 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
36462 (RootVT.isFloatingPoint() && Depth >= 1) ||
36463 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
36464
36465 // Don't combine if we are an AVX512/EVEX target and the mask element size
36466 // is different from the root element size - this would prevent writemasks
36467 // from being reused.
36468 bool IsMaskedShuffle = false;
36469 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
36470 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
36471 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
36472 IsMaskedShuffle = true;
36473 }
36474 }
36475
36476 // If we are shuffling a broadcast (and not introducing zeros) then
36477 // we can just use the broadcast directly. This works for smaller broadcast
36478 // elements as well, as they already repeat across each mask element.
36479 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
36480 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
36481 V1.getValueSizeInBits() >= RootSizeInBits) {
36482 return CanonicalizeShuffleInput(RootVT, V1);
36483 }
36484
36485 SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
36486
36487 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
36488 // etc. can be simplified.
36489 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
36490 SmallVector<int> ScaledMask, IdentityMask;
36491 unsigned NumElts = VT1.getVectorNumElements();
36492 if (Mask.size() <= NumElts &&
36493 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
36494 for (unsigned i = 0; i != NumElts; ++i)
36495 IdentityMask.push_back(i);
36496 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
36497 return CanonicalizeShuffleInput(RootVT, V1);
36498 }
36499 }
36500
36501 // Handle 128/256-bit lane shuffles of 512-bit vectors.
36502 if (RootVT.is512BitVector() &&
36503 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
36504 // If the upper subvectors are zeroable, then an extract+insert is more
36505 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
36506 // to zero the upper subvectors.
36507 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
36508 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
36509 return SDValue(); // Nothing to do!
36510 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
36511 "Unexpected lane shuffle");
36512 Res = CanonicalizeShuffleInput(RootVT, V1);
36513 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
36514 bool UseZero = isAnyZero(Mask);
36515 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
36516 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
36517 }
36518
36519 // Narrow shuffle mask to v4x128.
36520 SmallVector<int, 4> ScaledMask;
36521 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
36522 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
36523
36524 // Try to lower to vshuf64x2/vshuf32x4.
36525 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
36526 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
36527 SelectionDAG &DAG) {
36528 unsigned PermMask = 0;
36529 // Ensure elements came from the same Op.
36530 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
36531 for (int i = 0; i < 4; ++i) {
36532 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
36533 if (ScaledMask[i] < 0)
36534 continue;
36535
36536 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
36537 unsigned OpIndex = i / 2;
36538 if (Ops[OpIndex].isUndef())
36539 Ops[OpIndex] = Op;
36540 else if (Ops[OpIndex] != Op)
36541 return SDValue();
36542
36543 // Convert the 128-bit shuffle mask selection values into 128-bit
36544 // selection bits defined by a vshuf64x2 instruction's immediate control
36545 // byte.
36546 PermMask |= (ScaledMask[i] % 4) << (i * 2);
36547 }
36548
36549 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
36550 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
36551 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
36552 DAG.getTargetConstant(PermMask, DL, MVT::i8));
36553 };
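// [Editorial worked example] With the lambda above, a scaled mask of
// {0, 1, 4, 5} places the low two 128-bit lanes of V1 in the result's low
// half and the low two lanes of V2 in its high half; the immediate becomes
// (0<<0) | (1<<2) | (0<<4) | (1<<6) = 0x44.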
36554
36555 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
36556 // doesn't work because our mask is for 128 bits and we don't have an MVT
36557 // to match that.
36558 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
36559 isUndefOrInRange(ScaledMask[1], 0, 2) &&
36560 isUndefOrInRange(ScaledMask[2], 2, 4) &&
36561 isUndefOrInRange(ScaledMask[3], 2, 4) &&
36562 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
36563 ScaledMask[0] == (ScaledMask[2] % 2)) &&
36564 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
36565 ScaledMask[1] == (ScaledMask[3] % 2));
36566
36567 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
36568 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
36569 return SDValue(); // Nothing to do!
36570 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
36571 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
36572 return DAG.getBitcast(RootVT, V);
36573 }
36574 }
36575
36576 // Handle 128-bit lane shuffles of 256-bit vectors.
36577 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
36578 // If the upper half is zeroable, then an extract+insert is more optimal
36579 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
36580 // zero the upper half.
36581 if (isUndefOrZero(Mask[1])) {
36582 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
36583 return SDValue(); // Nothing to do!
36584 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
36585 Res = CanonicalizeShuffleInput(RootVT, V1);
36586 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
36587 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
36588 256);
36589 }
36590
36591 // If we're splatting the low subvector, an insert-subvector 'concat'
36592 // pattern is quicker than VPERM2X128.
36593 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
36594 if (Mask[0] == 0 && Mask[1] == 0 && !Subtarget.hasAVX2()) {
36595 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
36596 return SDValue(); // Nothing to do!
36597 Res = CanonicalizeShuffleInput(RootVT, V1);
36598 Res = extractSubVector(Res, 0, DAG, DL, 128);
36599 return concatSubVectors(Res, Res, DAG, DL);
36600 }
36601
36602 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
36603 return SDValue(); // Nothing to do!
36604
36605 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
36606 // we need to use the zeroing feature.
36607 // Prefer blends for sequential shuffles unless we are optimizing for size.
36608 if (UnaryShuffle &&
36609 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
36610 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
36611 unsigned PermMask = 0;
36612 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
36613 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
36614 return DAG.getNode(
36615 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
36616 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
36617 }
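// [Editorial worked example] In the unary VPERM2X128 immediate above, each
// nibble describes one 128-bit half of the result: the low bit picks V1's low
// or high half, and the 0x8 bit zeroes that half when the mask element is a
// sentinel. A mask of {1, 1} would encode as 0x11 (splat V1's upper half),
// and {SM_SentinelZero, 1} would encode as 0x18 (zero the low half, V1's
// upper half on top).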
36618
36619 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
36620 return SDValue(); // Nothing to do!
36621
36622 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
36623 if (!UnaryShuffle && !IsMaskedShuffle) {
36624 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
36625 "Unexpected shuffle sentinel value");
36626 // Prefer blends to X86ISD::VPERM2X128.
36627 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
36628 unsigned PermMask = 0;
36629 PermMask |= ((Mask[0] & 3) << 0);
36630 PermMask |= ((Mask[1] & 3) << 4);
36631 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
36632 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
36633 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
36634 CanonicalizeShuffleInput(RootVT, LHS),
36635 CanonicalizeShuffleInput(RootVT, RHS),
36636 DAG.getTargetConstant(PermMask, DL, MVT::i8));
36637 }
36638 }
36639 }
36640
36641 // For masks that have been widened to 128-bit elements or more,
36642 // narrow back down to 64-bit elements.
36643 if (BaseMaskEltSizeInBits > 64) {
36644 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
36645 int MaskScale = BaseMaskEltSizeInBits / 64;
36646 SmallVector<int, 64> ScaledMask;
36647 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
36648 Mask = std::move(ScaledMask);
36649 }
36650
36651 // For masked shuffles, we're trying to match the root width for better
36652 // writemask folding, attempt to scale the mask.
36653 // TODO - variable shuffles might need this to be widened again.
36654 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
36655 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
36656 int MaskScale = NumRootElts / Mask.size();
36657 SmallVector<int, 64> ScaledMask;
36658 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
36659 Mask = std::move(ScaledMask);
36660 }
36661
36662 unsigned NumMaskElts = Mask.size();
36663 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
36664
36665 // Determine the effective mask value type.
36666 FloatDomain &= (32 <= MaskEltSizeInBits);
36667 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
36668 : MVT::getIntegerVT(MaskEltSizeInBits);
36669 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
36670
36671 // Only allow legal mask types.
36672 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36673 return SDValue();
36674
36675 // Attempt to match the mask against known shuffle patterns.
36676 MVT ShuffleSrcVT, ShuffleVT;
36677 unsigned Shuffle, PermuteImm;
36678
36679 // Which shuffle domains are permitted?
36680 // Permit domain crossing at higher combine depths.
36681 // TODO: Should we indicate which domain is preferred if both are allowed?
36682 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
36683 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
36684 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
36685
36686 // Determine zeroable mask elements.
36687 APInt KnownUndef, KnownZero;
36688 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
36689 APInt Zeroable = KnownUndef | KnownZero;
36690
36691 if (UnaryShuffle) {
36692 // Attempt to match against broadcast-from-vector.
36693 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
36694 if ((Subtarget.hasAVX2() ||
36695 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
36696 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
36697 if (isUndefOrEqual(Mask, 0)) {
36698 if (V1.getValueType() == MaskVT &&
36699 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36700 MayFoldLoad(V1.getOperand(0))) {
36701 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36702 return SDValue(); // Nothing to do!
36703 Res = V1.getOperand(0);
36704 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36705 return DAG.getBitcast(RootVT, Res);
36706 }
36707 if (Subtarget.hasAVX2()) {
36708 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36709 return SDValue(); // Nothing to do!
36710 Res = CanonicalizeShuffleInput(MaskVT, V1);
36711 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36712 return DAG.getBitcast(RootVT, Res);
36713 }
36714 }
36715 }
36716
36717 SDValue NewV1 = V1; // Save operand in case early exit happens.
36718 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36719 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36720 ShuffleVT) &&
36721 (!IsMaskedShuffle ||
36722 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36723 if (Depth == 0 && Root.getOpcode() == Shuffle)
36724 return SDValue(); // Nothing to do!
36725 Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36726 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
36727 return DAG.getBitcast(RootVT, Res);
36728 }
36729
36730 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36731 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
36732 PermuteImm) &&
36733 (!IsMaskedShuffle ||
36734 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36735 if (Depth == 0 && Root.getOpcode() == Shuffle)
36736 return SDValue(); // Nothing to do!
36737 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
36738 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
36739 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36740 return DAG.getBitcast(RootVT, Res);
36741 }
36742 }
36743
36744 // Attempt to combine to INSERTPS, but only if the inserted element has come
36745 // from a scalar.
36746 // TODO: Handle other insertions here as well?
36747 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
36748 Subtarget.hasSSE41() &&
36749 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
36750 if (MaskEltSizeInBits == 32) {
36751 SDValue SrcV1 = V1, SrcV2 = V2;
36752 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
36753 DAG) &&
36754 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
36755 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36756 return SDValue(); // Nothing to do!
36757 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36758 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
36759 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
36760 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36761 return DAG.getBitcast(RootVT, Res);
36762 }
36763 }
36764 if (MaskEltSizeInBits == 64 &&
36765 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
36766 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36767 V2.getScalarValueSizeInBits() <= 32) {
36768 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36769 return SDValue(); // Nothing to do!
36770 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
36771 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36772 CanonicalizeShuffleInput(MVT::v4f32, V1),
36773 CanonicalizeShuffleInput(MVT::v4f32, V2),
36774 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36775 return DAG.getBitcast(RootVT, Res);
36776 }
36777 }
36778
36779 SDValue NewV1 = V1; // Save operands in case early exit happens.
36780 SDValue NewV2 = V2;
36781 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36782 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36783 ShuffleVT, UnaryShuffle) &&
36784 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36785 if (Depth == 0 && Root.getOpcode() == Shuffle)
36786 return SDValue(); // Nothing to do!
36787 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36788 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
36789 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
36790 return DAG.getBitcast(RootVT, Res);
36791 }
36792
36793 NewV1 = V1; // Save operands in case early exit happens.
36794 NewV2 = V2;
36795 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36796 AllowIntDomain, NewV1, NewV2, DL, DAG,
36797 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
36798 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36799 if (Depth == 0 && Root.getOpcode() == Shuffle)
36800 return SDValue(); // Nothing to do!
36801 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
36802 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
36803 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
36804 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36805 return DAG.getBitcast(RootVT, Res);
36806 }
36807
36808 // Typically from here on, we need an integer version of MaskVT.
36809 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
36810 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
36811
36812 // Annoyingly, SSE4A instructions don't map into the above match helpers.
36813 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
36814 uint64_t BitLen, BitIdx;
36815 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
36816 Zeroable)) {
36817 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
36818 return SDValue(); // Nothing to do!
36819 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36820 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
36821 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36822 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36823 return DAG.getBitcast(RootVT, Res);
36824 }
36825
36826 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
36827 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
36828 return SDValue(); // Nothing to do!
36829 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36830 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
36831 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
36832 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36833 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36834 return DAG.getBitcast(RootVT, Res);
36835 }
36836 }
36837
36838 // Match shuffle against TRUNCATE patterns.
36839 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
36840 // Match against a VTRUNC instruction, accounting for src/dst sizes.
36841 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
36842 Subtarget)) {
36843 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
36844 ShuffleSrcVT.getVectorNumElements();
36845 unsigned Opc =
36846 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
36847 if (Depth == 0 && Root.getOpcode() == Opc)
36848 return SDValue(); // Nothing to do!
36849 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36850 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
36851 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
36852 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
36853 return DAG.getBitcast(RootVT, Res);
36854 }
36855
36856 // Do we need a more general binary truncation pattern?
36857 if (RootSizeInBits < 512 &&
36858 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
36859 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
36860 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
36861 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
36862 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
36863 return SDValue(); // Nothing to do!
36864 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36865 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
36866 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36867 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
36868 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36869 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36870 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36871 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36872 return DAG.getBitcast(RootVT, Res);
36873 }
36874 }
36875
36876 // Don't try to re-form single instruction chains under any circumstances now
36877 // that we've done encoding canonicalization for them.
36878 if (Depth < 1)
36879 return SDValue();
36880
36881 // Depth threshold above which we can efficiently use variable mask shuffles.
36882 int VariableCrossLaneShuffleDepth =
36883 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
36884 int VariablePerLaneShuffleDepth =
36885 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
36886 AllowVariableCrossLaneMask &=
36887 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
36888 AllowVariablePerLaneMask &=
36889 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
36890 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
36891 // higher depth before combining them.
36892 bool AllowBWIVPERMV3 =
36893 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
36894
36895 bool MaskContainsZeros = isAnyZero(Mask);
36896
36897 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36898 // If we have a single input lane-crossing shuffle then lower to VPERMV.
36899 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
36900 if (Subtarget.hasAVX2() &&
36901 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36902 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36903 Res = CanonicalizeShuffleInput(MaskVT, V1);
36904 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36905 return DAG.getBitcast(RootVT, Res);
36906 }
36907 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36908 if ((Subtarget.hasAVX512() &&
36909 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36910 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36911 (Subtarget.hasBWI() &&
36912 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36913 (Subtarget.hasVBMI() &&
36914 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36915 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36916 V2 = DAG.getUNDEF(MaskVT);
36917 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36918 return DAG.getBitcast(RootVT, Res);
36919 }
36920 }
36921
36922 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36923 // vector as the second source (non-VLX will pad to 512-bit shuffles).
36924 if (UnaryShuffle && AllowVariableCrossLaneMask &&
36925 ((Subtarget.hasAVX512() &&
36926 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36927 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36928 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36929 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36930 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36931 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36932 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36933 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36934 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
36935 for (unsigned i = 0; i != NumMaskElts; ++i)
36936 if (Mask[i] == SM_SentinelZero)
36937 Mask[i] = NumMaskElts + i;
36938 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36939 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36940 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36941 return DAG.getBitcast(RootVT, Res);
36942 }
36943
36944 // If that failed and either input is extracted then try to combine as a
36945 // shuffle with the larger type.
36946 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36947 Inputs, Root, BaseMask, Depth, HasVariableMask,
36948 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
36949 Subtarget))
36950 return WideShuffle;
36951
36952     // If we have a dual input lane-crossing shuffle then lower to VPERMV3
36953     // (non-VLX will pad to 512-bit shuffles).
36954 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
36955 ((Subtarget.hasAVX512() &&
36956 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36957 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36958 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36959 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36960 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36961 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36962 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36963 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36964 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36965 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36966 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36967 return DAG.getBitcast(RootVT, Res);
36968 }
36969 return SDValue();
36970 }
36971
36972 // See if we can combine a single input shuffle with zeros to a bit-mask,
36973 // which is much simpler than any shuffle.
36974 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
36975 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36976 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
36977 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36978 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36979 APInt UndefElts(NumMaskElts, 0);
36980 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36981 for (unsigned i = 0; i != NumMaskElts; ++i) {
36982 int M = Mask[i];
36983 if (M == SM_SentinelUndef) {
36984 UndefElts.setBit(i);
36985 continue;
36986 }
36987 if (M == SM_SentinelZero)
36988 continue;
36989 EltBits[i] = AllOnes;
36990 }
36991 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36992 Res = CanonicalizeShuffleInput(MaskVT, V1);
36993 unsigned AndOpcode =
36994 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36995 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36996 return DAG.getBitcast(RootVT, Res);
36997 }
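// Illustrative aside (not in the original file): for a unary shuffle mask such
// as { 0, SM_SentinelZero, 2, SM_SentinelUndef } on v4i32, the loop above
// builds the constant { -1, 0, -1, undef }, so the whole shuffle reduces to a
// single AND (or FAND for float types) of V1 with that bit mask.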
36998
36999   // If we have a single input shuffle with different shuffle patterns in the
37000   // 128-bit lanes, use the variable mask to VPERMILPS.
37001   // TODO: Combine other mask types at higher depths.
37002 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37003 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
37004 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
37005 SmallVector<SDValue, 16> VPermIdx;
37006 for (int M : Mask) {
37007 SDValue Idx =
37008 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
37009 VPermIdx.push_back(Idx);
37010 }
37011 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
37012 Res = CanonicalizeShuffleInput(MaskVT, V1);
37013 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
37014 return DAG.getBitcast(RootVT, Res);
37015 }
37016
37017 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
37018 // to VPERMIL2PD/VPERMIL2PS.
37019 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
37020 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
37021 MaskVT == MVT::v8f32)) {
37022 // VPERMIL2 Operation.
37023 // Bits[3] - Match Bit.
37024 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
37025 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
37026 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
37027 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
37028 SmallVector<int, 8> VPerm2Idx;
37029 unsigned M2ZImm = 0;
37030 for (int M : Mask) {
37031 if (M == SM_SentinelUndef) {
37032 VPerm2Idx.push_back(-1);
37033 continue;
37034 }
37035 if (M == SM_SentinelZero) {
37036 M2ZImm = 2;
37037 VPerm2Idx.push_back(8);
37038 continue;
37039 }
37040 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
37041 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
37042 VPerm2Idx.push_back(Index);
37043 }
37044 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37045 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37046 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
37047 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
37048 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
37049 return DAG.getBitcast(RootVT, Res);
37050 }
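// Illustrative aside (not in the original file): with MaskVT == v4f32 there is
// a single 128-bit lane (NumEltsPerLane == 4), so a mask element M == 5
// (element 1 of V2) becomes Index = (5 % 4) + ((5 / 4) * 4) = 5, i.e. selector
// bits[1:0] = 1 (element) and bit[2] = 1 (second source). A zero element pushes
// selector 8 (bit[3]) and sets M2ZImm = 2 so VPERMIL2 writes zero for it. For
// the PD variants the index is shifted left by one to line up with the wider
// per-qword selector fields.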
37051
37052 // If we have 3 or more shuffle instructions or a chain involving a variable
37053 // mask, we can replace them with a single PSHUFB instruction profitably.
37054   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
37055 // instructions, but in practice PSHUFB tends to be *very* fast so we're
37056 // more aggressive.
37057 if (UnaryShuffle && AllowVariablePerLaneMask &&
37058 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37059 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
37060 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
37061 SmallVector<SDValue, 16> PSHUFBMask;
37062 int NumBytes = RootVT.getSizeInBits() / 8;
37063 int Ratio = NumBytes / NumMaskElts;
37064 for (int i = 0; i < NumBytes; ++i) {
37065 int M = Mask[i / Ratio];
37066 if (M == SM_SentinelUndef) {
37067 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
37068 continue;
37069 }
37070 if (M == SM_SentinelZero) {
37071 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
37072 continue;
37073 }
37074 M = Ratio * M + i % Ratio;
37075       assert((M / 16) == (i / 16) && "Lane crossing detected");
37076 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
37077 }
37078 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
37079 Res = CanonicalizeShuffleInput(ByteVT, V1);
37080 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
37081 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
37082 return DAG.getBitcast(RootVT, Res);
37083 }
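// Illustrative aside (not in the original file): each PSHUFB control byte with
// bit 7 set (0x80 above) zeroes the destination byte, otherwise its low four
// bits index a byte within the same 16-byte lane. E.g. for a v4i32 mask on a
// 128-bit root, Ratio == 4, so mask element 1 expands to the control bytes
// { 4, 5, 6, 7 } covering the four bytes of dword 1.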
37084
37085 // With XOP, if we have a 128-bit binary input shuffle we can always combine
37086 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
37087 // slower than PSHUFB on targets that support both.
37088 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
37089 Subtarget.hasXOP()) {
37090 // VPPERM Mask Operation
37091 // Bits[4:0] - Byte Index (0 - 31)
37092 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
37093 SmallVector<SDValue, 16> VPPERMMask;
37094 int NumBytes = 16;
37095 int Ratio = NumBytes / NumMaskElts;
37096 for (int i = 0; i < NumBytes; ++i) {
37097 int M = Mask[i / Ratio];
37098 if (M == SM_SentinelUndef) {
37099 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
37100 continue;
37101 }
37102 if (M == SM_SentinelZero) {
37103 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
37104 continue;
37105 }
37106 M = Ratio * M + i % Ratio;
37107 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
37108 }
37109 MVT ByteVT = MVT::v16i8;
37110 V1 = CanonicalizeShuffleInput(ByteVT, V1);
37111 V2 = CanonicalizeShuffleInput(ByteVT, V2);
37112 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
37113 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
37114 return DAG.getBitcast(RootVT, Res);
37115 }
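// Illustrative aside (not in the original file): VPPERM indexes the 32 bytes of
// the concatenated sources as built above, so a control byte of 17 selects
// byte 1 of V2, while 0x80 (op field 4 in bits[7:5]) writes a zero byte,
// matching the sentinel handling in the loop.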
37116
37117 // If that failed and either input is extracted then try to combine as a
37118 // shuffle with the larger type.
37119 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
37120 Inputs, Root, BaseMask, Depth, HasVariableMask,
37121 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
37122 return WideShuffle;
37123
37124   // If we have a dual input shuffle then lower to VPERMV3
37125   // (non-VLX will pad to 512-bit shuffles).
37126 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37127 ((Subtarget.hasAVX512() &&
37128 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
37129 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
37130 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
37131 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
37132 MaskVT == MVT::v16i32)) ||
37133 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37134 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
37135 MaskVT == MVT::v32i16)) ||
37136 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37137 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
37138 MaskVT == MVT::v64i8)))) {
37139 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37140 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37141 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37142 return DAG.getBitcast(RootVT, Res);
37143 }
37144
37145 // Failed to find any combines.
37146 return SDValue();
37147}
37148
37149// Combine an arbitrary chain of shuffles + extract_subvectors into a single
37150// instruction if possible.
37151//
37152// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
37153// type size to attempt to combine:
37154// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
37155// -->
37156// extract_subvector(shuffle(x,y,m2),0)
37157static SDValue combineX86ShuffleChainWithExtract(
37158 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37159 bool HasVariableMask, bool AllowVariableCrossLaneMask,
37160 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37161 const X86Subtarget &Subtarget) {
37162 unsigned NumMaskElts = BaseMask.size();
37163 unsigned NumInputs = Inputs.size();
37164 if (NumInputs == 0)
37165 return SDValue();
37166
37167 EVT RootVT = Root.getValueType();
37168 unsigned RootSizeInBits = RootVT.getSizeInBits();
37169   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
37170
37171 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
37172 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
37173
37174 // Peek through subvectors.
37175 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
37176 unsigned WideSizeInBits = RootSizeInBits;
37177 for (unsigned i = 0; i != NumInputs; ++i) {
37178 SDValue &Src = WideInputs[i];
37179 unsigned &Offset = Offsets[i];
37180 Src = peekThroughBitcasts(Src);
37181 EVT BaseVT = Src.getValueType();
37182 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
37183 Offset += Src.getConstantOperandVal(1);
37184 Src = Src.getOperand(0);
37185 }
37186 WideSizeInBits = std::max(WideSizeInBits,
37187 (unsigned)Src.getValueSizeInBits());
37188     assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
37189            "Unexpected subvector extraction");
37190 Offset /= BaseVT.getVectorNumElements();
37191 Offset *= NumMaskElts;
37192 }
37193
37194   // Bail if we're always extracting from the lowest subvectors;
37195 // combineX86ShuffleChain should match this for the current width.
37196 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
37197 return SDValue();
37198
37199 unsigned Scale = WideSizeInBits / RootSizeInBits;
37200   assert((WideSizeInBits % RootSizeInBits) == 0 &&
37201          "Unexpected subvector extraction");
37202
37203 // If the src vector types aren't the same, see if we can extend
37204 // them to match each other.
37205 // TODO: Support different scalar types?
37206 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
37207 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
37208 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
37209 Op.getValueType().getScalarType() != WideSVT;
37210 }))
37211 return SDValue();
37212
37213 for (SDValue &NewInput : WideInputs) {
37214     assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
37215            "Shuffle vector size mismatch");
37216 if (WideSizeInBits > NewInput.getValueSizeInBits())
37217 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
37218 SDLoc(NewInput), WideSizeInBits);
37219     assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
37220            "Unexpected subvector extraction");
37221 }
37222
37223 // Create new mask for larger type.
37224 for (unsigned i = 1; i != NumInputs; ++i)
37225 Offsets[i] += i * Scale * NumMaskElts;
37226
37227 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
37228 for (int &M : WideMask) {
37229 if (M < 0)
37230 continue;
37231 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
37232 }
37233 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
37234
37235 // Remove unused/repeated shuffle source ops.
37236 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
37237   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
37238
37239 if (WideInputs.size() > 2)
37240 return SDValue();
37241
37242 // Increase depth for every upper subvector we've peeked through.
37243 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
37244
37245 // Attempt to combine wider chain.
37246 // TODO: Can we use a better Root?
37247 SDValue WideRoot = WideInputs[0];
37248 if (SDValue WideShuffle =
37249 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
37250 HasVariableMask, AllowVariableCrossLaneMask,
37251 AllowVariablePerLaneMask, DAG, Subtarget)) {
37252 WideShuffle =
37253 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
37254 return DAG.getBitcast(RootVT, WideShuffle);
37255 }
37256 return SDValue();
37257}
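// Illustrative aside (not in the original file): for two v4i32 operands that
// are both extracted from the upper half of v8i32 sources, e.g.
//   shuffle(extract_subvector(x,4), extract_subvector(y,4), <0,1,4,5>)
// the helper above widens the mask to the v8i32 domain, giving
//   extract_subvector(shuffle(x, y, <4,5,12,13,u,u,u,u>), 0)
// so the whole chain can be matched at the wider width.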
37258
37259// Canonicalize the combined shuffle mask chain with horizontal ops.
37260// NOTE: This may update the Ops and Mask.
37261static SDValue canonicalizeShuffleMaskWithHorizOp(
37262 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
37263 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
37264 const X86Subtarget &Subtarget) {
37265 if (Mask.empty() || Ops.empty())
37266 return SDValue();
37267
37268 SmallVector<SDValue> BC;
37269 for (SDValue Op : Ops)
37270 BC.push_back(peekThroughBitcasts(Op));
37271
37272 // All ops must be the same horizop + type.
37273 SDValue BC0 = BC[0];
37274 EVT VT0 = BC0.getValueType();
37275 unsigned Opcode0 = BC0.getOpcode();
37276 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
37277 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
37278 }))
37279 return SDValue();
37280
37281 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
37282 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
37283 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
37284 if (!isHoriz && !isPack)
37285 return SDValue();
37286
37287 // Do all ops have a single use?
37288 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
37289 return Op.hasOneUse() &&
37290 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
37291 });
37292
37293 int NumElts = VT0.getVectorNumElements();
37294 int NumLanes = VT0.getSizeInBits() / 128;
37295 int NumEltsPerLane = NumElts / NumLanes;
37296 int NumHalfEltsPerLane = NumEltsPerLane / 2;
37297 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
37298 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37299
37300 if (NumEltsPerLane >= 4 &&
37301 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
37302 SmallVector<int> LaneMask, ScaledMask;
37303 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
37304 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
37305       // See if we can remove the shuffle by re-sorting the HOP chain so that
37306 // the HOP args are pre-shuffled.
37307 // TODO: Generalize to any sized/depth chain.
37308 // TODO: Add support for PACKSS/PACKUS.
37309 if (isHoriz) {
37310 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
37311 auto GetHOpSrc = [&](int M) {
37312 if (M == SM_SentinelUndef)
37313 return DAG.getUNDEF(VT0);
37314 if (M == SM_SentinelZero)
37315 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
37316 SDValue Src0 = BC[M / 4];
37317 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
37318 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
37319 return Src1.getOperand(M % 2);
37320 return SDValue();
37321 };
37322 SDValue M0 = GetHOpSrc(ScaledMask[0]);
37323 SDValue M1 = GetHOpSrc(ScaledMask[1]);
37324 SDValue M2 = GetHOpSrc(ScaledMask[2]);
37325 SDValue M3 = GetHOpSrc(ScaledMask[3]);
37326 if (M0 && M1 && M2 && M3) {
37327 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
37328 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
37329 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
37330 }
37331 }
37332 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
37333 if (Ops.size() >= 2) {
37334 SDValue LHS, RHS;
37335 auto GetHOpSrc = [&](int M, int &OutM) {
37336 // TODO: Support SM_SentinelZero
37337 if (M < 0)
37338 return M == SM_SentinelUndef;
37339 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
37340 if (!LHS || LHS == Src) {
37341 LHS = Src;
37342 OutM = (M % 2);
37343 return true;
37344 }
37345 if (!RHS || RHS == Src) {
37346 RHS = Src;
37347 OutM = (M % 2) + 2;
37348 return true;
37349 }
37350 return false;
37351 };
37352 int PostMask[4] = {-1, -1, -1, -1};
37353 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
37354 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
37355 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
37356 GetHOpSrc(ScaledMask[3], PostMask[3])) {
37357 LHS = DAG.getBitcast(SrcVT, LHS);
37358 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
37359 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
37360 // Use SHUFPS for the permute so this will work on SSE3 targets,
37361 // shuffle combining and domain handling will simplify this later on.
37362 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
37363 Res = DAG.getBitcast(ShuffleVT, Res);
37364 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
37365 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
37366 }
37367 }
37368 }
37369 }
37370
37371 if (2 < Ops.size())
37372 return SDValue();
37373
37374 SDValue BC1 = BC[BC.size() - 1];
37375 if (Mask.size() == VT0.getVectorNumElements()) {
37376 // Canonicalize binary shuffles of horizontal ops that use the
37377     // same sources to a unary shuffle.
37378 // TODO: Try to perform this fold even if the shuffle remains.
37379 if (Ops.size() == 2) {
37380 auto ContainsOps = [](SDValue HOp, SDValue Op) {
37381 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
37382 };
37383 // Commute if all BC0's ops are contained in BC1.
37384 if (ContainsOps(BC1, BC0.getOperand(0)) &&
37385 ContainsOps(BC1, BC0.getOperand(1))) {
37386 ShuffleVectorSDNode::commuteMask(Mask);
37387 std::swap(Ops[0], Ops[1]);
37388 std::swap(BC0, BC1);
37389 }
37390
37391 // If BC1 can be represented by BC0, then convert to unary shuffle.
37392 if (ContainsOps(BC0, BC1.getOperand(0)) &&
37393 ContainsOps(BC0, BC1.getOperand(1))) {
37394 for (int &M : Mask) {
37395 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
37396 continue;
37397 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
37398 M -= NumElts + (SubLane * NumHalfEltsPerLane);
37399 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
37400 M += NumHalfEltsPerLane;
37401 }
37402 }
37403 }
37404
37405 // Canonicalize unary horizontal ops to only refer to lower halves.
37406 for (int i = 0; i != NumElts; ++i) {
37407 int &M = Mask[i];
37408 if (isUndefOrZero(M))
37409 continue;
37410 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
37411 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
37412 M -= NumHalfEltsPerLane;
37413 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
37414 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
37415 M -= NumHalfEltsPerLane;
37416 }
37417 }
37418
37419 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
37420 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
37421 // represents the LHS/RHS inputs for the lower/upper halves.
37422 SmallVector<int, 16> TargetMask128, WideMask128;
37423 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
37424 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
37425     assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
37426 bool SingleOp = (Ops.size() == 1);
37427 if (isPack || OneUseOps ||
37428 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
37429 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
37430 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
37431 Lo = Lo.getOperand(WideMask128[0] & 1);
37432 Hi = Hi.getOperand(WideMask128[1] & 1);
37433 if (SingleOp) {
37434 SDValue Undef = DAG.getUNDEF(SrcVT);
37435 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
37436 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
37437 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
37438 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
37439 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
37440 }
37441 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
37442 }
37443 }
37444
37445 return SDValue();
37446}
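// Illustrative aside (not in the original file): for v4f32, HADD(x,y) produces
// { x0+x1, x2+x3, y0+y1, y2+y3 }, so a shuffle such as
//   shuffle(HADD(x,y), HADD(z,w), <0,1,4,5>)
// only uses the x and z halves and can be re-formed as HADD(x,z), which is the
// kind of fold the WideMask128 matching above performs.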
37447
37448// Attempt to constant fold all of the constant source ops.
37449 // Returns the folded constant vector if the entire shuffle folds to a constant.
37450// TODO: Extend this to merge multiple constant Ops and update the mask.
37451static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
37452 ArrayRef<int> Mask, SDValue Root,
37453 bool HasVariableMask,
37454 SelectionDAG &DAG,
37455 const X86Subtarget &Subtarget) {
37456 MVT VT = Root.getSimpleValueType();
37457
37458 unsigned SizeInBits = VT.getSizeInBits();
37459 unsigned NumMaskElts = Mask.size();
37460 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
37461 unsigned NumOps = Ops.size();
37462
37463 // Extract constant bits from each source op.
37464 bool OneUseConstantOp = false;
37465 SmallVector<APInt, 16> UndefEltsOps(NumOps);
37466 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
37467 for (unsigned i = 0; i != NumOps; ++i) {
37468 SDValue SrcOp = Ops[i];
37469 OneUseConstantOp |= SrcOp.hasOneUse();
37470 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
37471 RawBitsOps[i]))
37472 return SDValue();
37473 }
37474
37475 // Only fold if at least one of the constants is only used once or
37476   // the combined shuffle has included a variable mask shuffle; this
37477 // is to avoid constant pool bloat.
37478 if (!OneUseConstantOp && !HasVariableMask)
37479 return SDValue();
37480
37481 // Shuffle the constant bits according to the mask.
37482 SDLoc DL(Root);
37483 APInt UndefElts(NumMaskElts, 0);
37484 APInt ZeroElts(NumMaskElts, 0);
37485 APInt ConstantElts(NumMaskElts, 0);
37486 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
37487 APInt::getNullValue(MaskSizeInBits));
37488 for (unsigned i = 0; i != NumMaskElts; ++i) {
37489 int M = Mask[i];
37490 if (M == SM_SentinelUndef) {
37491 UndefElts.setBit(i);
37492 continue;
37493 } else if (M == SM_SentinelZero) {
37494 ZeroElts.setBit(i);
37495 continue;
37496 }
37497     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
37498
37499 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
37500 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
37501
37502 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
37503 if (SrcUndefElts[SrcMaskIdx]) {
37504 UndefElts.setBit(i);
37505 continue;
37506 }
37507
37508 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
37509 APInt &Bits = SrcEltBits[SrcMaskIdx];
37510 if (!Bits) {
37511 ZeroElts.setBit(i);
37512 continue;
37513 }
37514
37515 ConstantElts.setBit(i);
37516 ConstantBitData[i] = Bits;
37517 }
37518   assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
37519
37520 // Attempt to create a zero vector.
37521 if ((UndefElts | ZeroElts).isAllOnesValue())
37522 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
37523
37524 // Create the constant data.
37525 MVT MaskSVT;
37526 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
37527 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
37528 else
37529 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
37530
37531 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
37532 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37533 return SDValue();
37534
37535 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
37536 return DAG.getBitcast(VT, CstOp);
37537}
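// Illustrative aside (not in the original file): if both source ops are
// constant build vectors, e.g. v4i32 {1,2,3,4} and {5,6,7,8} with mask
// <0,5,2,7>, the helper above re-arranges the raw element bits directly and
// returns the constant vector {1,6,3,8}, removing the shuffle entirely.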
37538
37539namespace llvm {
37540 namespace X86 {
37541 enum {
37542 MaxShuffleCombineDepth = 8
37543 };
37544 }
37545} // namespace llvm
37546
37547/// Fully generic combining of x86 shuffle instructions.
37548///
37549/// This should be the last combine run over the x86 shuffle instructions. Once
37550/// they have been fully optimized, this will recursively consider all chains
37551/// of single-use shuffle instructions, build a generic model of the cumulative
37552/// shuffle operation, and check for simpler instructions which implement this
37553/// operation. We use this primarily for two purposes:
37554///
37555/// 1) Collapse generic shuffles to specialized single instructions when
37556/// equivalent. In most cases, this is just an encoding size win, but
37557/// sometimes we will collapse multiple generic shuffles into a single
37558/// special-purpose shuffle.
37559/// 2) Look for sequences of shuffle instructions with 3 or more total
37560/// instructions, and replace them with the slightly more expensive SSSE3
37561/// PSHUFB instruction if available. We do this as the last combining step
37562/// to ensure we avoid using PSHUFB if we can implement the shuffle with
37563/// a suitable short sequence of other instructions. The PSHUFB will either
37564/// use a register or have to read from memory and so is slightly (but only
37565/// slightly) more expensive than the other shuffle instructions.
37566///
37567/// Because this is inherently a quadratic operation (for each shuffle in
37568/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
37569/// This should never be an issue in practice as the shuffle lowering doesn't
37570/// produce sequences of more than 8 instructions.
37571///
37572/// FIXME: We will currently miss some cases where the redundant shuffling
37573/// would simplify under the threshold for PSHUFB formation because of
37574/// combine-ordering. To fix this, we should do the redundant instruction
37575/// combining in this recursive walk.
37576static SDValue combineX86ShufflesRecursively(
37577 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
37578 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
37579 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
37580 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37581 const X86Subtarget &Subtarget) {
37582   assert(RootMask.size() > 0 &&
37583          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
37584          "Illegal shuffle root mask");
37585   assert(Root.getSimpleValueType().isVector() &&
37586          "Shuffles operate on vector types!");
37587 unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
37588
37589 // Bound the depth of our recursive combine because this is ultimately
37590 // quadratic in nature.
37591 if (Depth >= MaxDepth)
37592 return SDValue();
37593
37594 // Directly rip through bitcasts to find the underlying operand.
37595 SDValue Op = SrcOps[SrcOpIndex];
37596 Op = peekThroughOneUseBitcasts(Op);
37597
37598 EVT VT = Op.getValueType();
37599 if (!VT.isVector() || !VT.isSimple())
37600 return SDValue(); // Bail if we hit a non-simple non-vector.
37601
37602 // FIXME: Just bail on f16 for now.
37603 if (VT.getVectorElementType() == MVT::f16)
37604 return SDValue();
37605
37606   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
37607          "Can only combine shuffles up to the size of the root op.");
37608
37609 // Extract target shuffle mask and resolve sentinels and inputs.
37610 // TODO - determine Op's demanded elts from RootMask.
37611 SmallVector<int, 64> OpMask;
37612 SmallVector<SDValue, 2> OpInputs;
37613 APInt OpUndef, OpZero;
37614 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
37615 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
37616 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
37617 OpZero, DAG, Depth, false))
37618 return SDValue();
37619
37620 // Shuffle inputs must not be larger than the shuffle result.
37621 // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
37622 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
37623 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
37624 }))
37625 return SDValue();
37626
37627 // If the shuffle result was smaller than the root, we need to adjust the
37628 // mask indices and pad the mask with undefs.
37629 if (RootSizeInBits > VT.getSizeInBits()) {
37630 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
37631 unsigned OpMaskSize = OpMask.size();
37632 if (OpInputs.size() > 1) {
37633 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
37634 for (int &M : OpMask) {
37635 if (M < 0)
37636 continue;
37637 int EltIdx = M % OpMaskSize;
37638 int OpIdx = M / OpMaskSize;
37639 M = (PaddedMaskSize * OpIdx) + EltIdx;
37640 }
37641 }
37642 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
37643 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
37644 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
37645 }
37646
37647 SmallVector<int, 64> Mask;
37648 SmallVector<SDValue, 16> Ops;
37649
37650 // We don't need to merge masks if the root is empty.
37651 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
37652 if (EmptyRoot) {
37653 // Only resolve zeros if it will remove an input, otherwise we might end
37654 // up in an infinite loop.
37655 bool ResolveKnownZeros = true;
37656 if (!OpZero.isNullValue()) {
37657 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
37658 for (int i = 0, e = OpMask.size(); i != e; ++i) {
37659 int M = OpMask[i];
37660 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
37661 continue;
37662 UsedInputs.setBit(M / OpMask.size());
37663 if (UsedInputs.isAllOnesValue()) {
37664 ResolveKnownZeros = false;
37665 break;
37666 }
37667 }
37668 }
37669 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
37670 ResolveKnownZeros);
37671
37672 Mask = OpMask;
37673 Ops.append(OpInputs.begin(), OpInputs.end());
37674 } else {
37675 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
37676
37677 // Add the inputs to the Ops list, avoiding duplicates.
37678 Ops.append(SrcOps.begin(), SrcOps.end());
37679
37680 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
37681 // Attempt to find an existing match.
37682 SDValue InputBC = peekThroughBitcasts(Input);
37683 for (int i = 0, e = Ops.size(); i < e; ++i)
37684 if (InputBC == peekThroughBitcasts(Ops[i]))
37685 return i;
37686 // Match failed - should we replace an existing Op?
37687 if (InsertionPoint >= 0) {
37688 Ops[InsertionPoint] = Input;
37689 return InsertionPoint;
37690 }
37691 // Add to the end of the Ops list.
37692 Ops.push_back(Input);
37693 return Ops.size() - 1;
37694 };
37695
37696 SmallVector<int, 2> OpInputIdx;
37697 for (SDValue OpInput : OpInputs)
37698 OpInputIdx.push_back(
37699 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
37700
37701     assert(((RootMask.size() > OpMask.size() &&
37702              RootMask.size() % OpMask.size() == 0) ||
37703             (OpMask.size() > RootMask.size() &&
37704              OpMask.size() % RootMask.size() == 0) ||
37705             OpMask.size() == RootMask.size()) &&
37706            "The smaller number of elements must divide the larger.");
37707
37708 // This function can be performance-critical, so we rely on the power-of-2
37709 // knowledge that we have about the mask sizes to replace div/rem ops with
37710 // bit-masks and shifts.
37711     assert(isPowerOf2_32(RootMask.size()) &&
37712            "Non-power-of-2 shuffle mask sizes");
37713     assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
37714 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
37715 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
37716
37717 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
37718 unsigned RootRatio =
37719 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
37720 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
37721     assert((RootRatio == 1 || OpRatio == 1) &&
37722            "Must not have a ratio for both incoming and op masks!");
37723
37724     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
37725     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
37726     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
37727 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
37728 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
37729
37730 Mask.resize(MaskWidth, SM_SentinelUndef);
37731
37732 // Merge this shuffle operation's mask into our accumulated mask. Note that
37733 // this shuffle's mask will be the first applied to the input, followed by
37734 // the root mask to get us all the way to the root value arrangement. The
37735 // reason for this order is that we are recursing up the operation chain.
37736 for (unsigned i = 0; i < MaskWidth; ++i) {
37737 unsigned RootIdx = i >> RootRatioLog2;
37738 if (RootMask[RootIdx] < 0) {
37739 // This is a zero or undef lane, we're done.
37740 Mask[i] = RootMask[RootIdx];
37741 continue;
37742 }
37743
37744 unsigned RootMaskedIdx =
37745 RootRatio == 1
37746 ? RootMask[RootIdx]
37747 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
37748
37749 // Just insert the scaled root mask value if it references an input other
37750 // than the SrcOp we're currently inserting.
37751 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
37752 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
37753 Mask[i] = RootMaskedIdx;
37754 continue;
37755 }
37756
37757 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
37758 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
37759 if (OpMask[OpIdx] < 0) {
37760 // The incoming lanes are zero or undef, it doesn't matter which ones we
37761 // are using.
37762 Mask[i] = OpMask[OpIdx];
37763 continue;
37764 }
37765
37766 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
37767 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
37768 : (OpMask[OpIdx] << OpRatioLog2) +
37769 (RootMaskedIdx & (OpRatio - 1));
37770
37771 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
37772 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
37773       assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
37774 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
37775
37776 Mask[i] = OpMaskedIdx;
37777 }
37778 }
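// Illustrative aside (not in the original file): suppose RootMask is the
// v4i32 mask {2,3,0,1} and OpMask is the v8i16 mask {1,0,3,2,5,4,7,6}. Then
// MaskWidth == 8, RootRatio == 2 and OpRatio == 1, and (ignoring the
// multi-input base adjustments) the loop above composes them as
// Mask[i] = OpMask[(RootMask[i >> 1] << 1) + (i & 1)], giving
// {5,4,7,6,1,0,3,2} - the op's shuffle applied first, then the root's.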
37779
37780 // Remove unused/repeated shuffle source ops.
37781 resolveTargetShuffleInputsAndMask(Ops, Mask);
37782
37783 // Handle the all undef/zero/ones cases early.
37784 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
37785 return DAG.getUNDEF(Root.getValueType());
37786 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
37787 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
37788 SDLoc(Root));
37789 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
37790 none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
37791 return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
37792
37793   assert(!Ops.empty() && "Shuffle with no inputs detected");
37794 HasVariableMask |= IsOpVariableMask;
37795
37796 // Update the list of shuffle nodes that have been combined so far.
37797 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
37798 SrcNodes.end());
37799 CombinedNodes.push_back(Op.getNode());
37800
37801 // See if we can recurse into each shuffle source op (if it's a target
37802 // shuffle). The source op should only be generally combined if it either has
37803   // a single use (i.e. current Op) or all its users have already been combined;
37804   // if not, then we can still combine but should prevent generation of variable
37805 // shuffles to avoid constant pool bloat.
37806 // Don't recurse if we already have more source ops than we can combine in
37807 // the remaining recursion depth.
37808 if (Ops.size() < (MaxDepth - Depth)) {
37809 for (int i = 0, e = Ops.size(); i < e; ++i) {
37810 // For empty roots, we need to resolve zeroable elements before combining
37811 // them with other shuffles.
37812 SmallVector<int, 64> ResolvedMask = Mask;
37813 if (EmptyRoot)
37814 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
37815 bool AllowCrossLaneVar = false;
37816 bool AllowPerLaneVar = false;
37817 if (Ops[i].getNode()->hasOneUse() ||
37818 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
37819 AllowCrossLaneVar = AllowVariableCrossLaneMask;
37820 AllowPerLaneVar = AllowVariablePerLaneMask;
37821 }
37822 if (SDValue Res = combineX86ShufflesRecursively(
37823 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
37824 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
37825 Subtarget))
37826 return Res;
37827 }
37828 }
37829
37830 // Attempt to constant fold all of the constant source ops.
37831 if (SDValue Cst = combineX86ShufflesConstants(
37832 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
37833 return Cst;
37834
37835   // If constant folding failed and we only have constants, then we have
37836   // multiple uses by a single non-variable shuffle; just bail.
37837 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
37838 APInt UndefElts;
37839 SmallVector<APInt> RawBits;
37840 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37841 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
37842 RawBits);
37843 })) {
37844 return SDValue();
37845 }
37846
37847 // Canonicalize the combined shuffle mask chain with horizontal ops.
37848 // NOTE: This will update the Ops and Mask.
37849 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
37850 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
37851 return DAG.getBitcast(Root.getValueType(), HOp);
37852
37853 // Widen any subvector shuffle inputs we've collected.
37854 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
37855 return Op.getValueSizeInBits() < RootSizeInBits;
37856 })) {
37857 for (SDValue &Op : Ops)
37858 if (Op.getValueSizeInBits() < RootSizeInBits)
37859 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
37860 RootSizeInBits);
37861 // Reresolve - we might have repeated subvector sources.
37862 resolveTargetShuffleInputsAndMask(Ops, Mask);
37863 }
37864
37865 // We can only combine unary and binary shuffle mask cases.
37866 if (Ops.size() <= 2) {
37867 // Minor canonicalization of the accumulated shuffle mask to make it easier
37868 // to match below. All this does is detect masks with sequential pairs of
37869 // elements, and shrink them to the half-width mask. It does this in a loop
37870 // so it will reduce the size of the mask to the minimal width mask which
37871 // performs an equivalent shuffle.
37872 while (Mask.size() > 1) {
37873 SmallVector<int, 64> WidenedMask;
37874 if (!canWidenShuffleElements(Mask, WidenedMask))
37875 break;
37876 Mask = std::move(WidenedMask);
37877 }
37878
37879 // Canonicalization of binary shuffle masks to improve pattern matching by
37880 // commuting the inputs.
37881 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
37882 ShuffleVectorSDNode::commuteMask(Mask);
37883 std::swap(Ops[0], Ops[1]);
37884 }
37885
37886 // Finally, try to combine into a single shuffle instruction.
37887 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
37888 AllowVariableCrossLaneMask,
37889 AllowVariablePerLaneMask, DAG, Subtarget);
37890 }
37891
37892 // If that failed and any input is extracted then try to combine as a
37893 // shuffle with the larger type.
37894 return combineX86ShuffleChainWithExtract(
37895 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
37896 AllowVariablePerLaneMask, DAG, Subtarget);
37897}
37898
37899/// Helper entry wrapper to combineX86ShufflesRecursively.
37900static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
37901 const X86Subtarget &Subtarget) {
37902 return combineX86ShufflesRecursively(
37903 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
37904 /*HasVarMask*/ false,
37905 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
37906 Subtarget);
37907}
37908
37909/// Get the PSHUF-style mask from PSHUF node.
37910///
37911 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
37912/// PSHUF-style masks that can be reused with such instructions.
37913static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37914 MVT VT = N.getSimpleValueType();
37915 SmallVector<int, 4> Mask;
37916 SmallVector<SDValue, 2> Ops;
37917 bool HaveMask =
37918 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
37919 (void)HaveMask;
37920   assert(HaveMask);
37921
37922 // If we have more than 128-bits, only the low 128-bits of shuffle mask
37923 // matter. Check that the upper masks are repeats and remove them.
37924 if (VT.getSizeInBits() > 128) {
37925 int LaneElts = 128 / VT.getScalarSizeInBits();
37926#ifndef NDEBUG
37927 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37928 for (int j = 0; j < LaneElts; ++j)
37929         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37930                "Mask doesn't repeat in high 128-bit lanes!");
37931#endif
37932 Mask.resize(LaneElts);
37933 }
37934
37935 switch (N.getOpcode()) {
37936 case X86ISD::PSHUFD:
37937 return Mask;
37938 case X86ISD::PSHUFLW:
37939 Mask.resize(4);
37940 return Mask;
37941 case X86ISD::PSHUFHW:
37942 Mask.erase(Mask.begin(), Mask.begin() + 4);
37943 for (int &M : Mask)
37944 M -= 4;
37945 return Mask;
37946 default:
37947     llvm_unreachable("No valid shuffle instruction found!");
37948 }
37949}
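// Illustrative aside (not in the original file): for a PSHUFHW whose full
// v8i16 mask is {0,1,2,3,5,4,7,6}, the helper above drops the identity low
// half and rebases the high half, returning the 4-element mask {1,0,3,2};
// PSHUFLW conversely keeps only the low four entries unchanged.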
37950
37951/// Search for a combinable shuffle across a chain ending in pshufd.
37952///
37953/// We walk up the chain and look for a combinable shuffle, skipping over
37954/// shuffles that we could hoist this shuffle's transformation past without
37955/// altering anything.
37956static SDValue
37957combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37958 SelectionDAG &DAG) {
37959   assert(N.getOpcode() == X86ISD::PSHUFD &&
37960          "Called with something other than an x86 128-bit half shuffle!");
37961 SDLoc DL(N);
37962
37963 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37964 // of the shuffles in the chain so that we can form a fresh chain to replace
37965 // this one.
37966 SmallVector<SDValue, 8> Chain;
37967 SDValue V = N.getOperand(0);
37968 for (; V.hasOneUse(); V = V.getOperand(0)) {
37969 switch (V.getOpcode()) {
37970 default:
37971 return SDValue(); // Nothing combined!
37972
37973 case ISD::BITCAST:
37974 // Skip bitcasts as we always know the type for the target specific
37975 // instructions.
37976 continue;
37977
37978 case X86ISD::PSHUFD:
37979 // Found another dword shuffle.
37980 break;
37981
37982 case X86ISD::PSHUFLW:
37983 // Check that the low words (being shuffled) are the identity in the
37984 // dword shuffle, and the high words are self-contained.
37985 if (Mask[0] != 0 || Mask[1] != 1 ||
37986 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37987 return SDValue();
37988
37989 Chain.push_back(V);
37990 continue;
37991
37992 case X86ISD::PSHUFHW:
37993 // Check that the high words (being shuffled) are the identity in the
37994 // dword shuffle, and the low words are self-contained.
37995 if (Mask[2] != 2 || Mask[3] != 3 ||
37996 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37997 return SDValue();
37998
37999 Chain.push_back(V);
38000 continue;
38001
38002 case X86ISD::UNPCKL:
38003 case X86ISD::UNPCKH:
38004 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
38005 // shuffle into a preceding word shuffle.
38006 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
38007 V.getSimpleValueType().getVectorElementType() != MVT::i16)
38008 return SDValue();
38009
38010 // Search for a half-shuffle which we can combine with.
38011 unsigned CombineOp =
38012 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
38013 if (V.getOperand(0) != V.getOperand(1) ||
38014 !V->isOnlyUserOf(V.getOperand(0).getNode()))
38015 return SDValue();
38016 Chain.push_back(V);
38017 V = V.getOperand(0);
38018 do {
38019 switch (V.getOpcode()) {
38020 default:
38021 return SDValue(); // Nothing to combine.
38022
38023 case X86ISD::PSHUFLW:
38024 case X86ISD::PSHUFHW:
38025 if (V.getOpcode() == CombineOp)
38026 break;
38027
38028 Chain.push_back(V);
38029
38030         LLVM_FALLTHROUGH;
38031 case ISD::BITCAST:
38032 V = V.getOperand(0);
38033 continue;
38034 }
38035 break;
38036 } while (V.hasOneUse());
38037 break;
38038 }
38039 // Break out of the loop if we break out of the switch.
38040 break;
38041 }
38042
38043 if (!V.hasOneUse())
38044 // We fell out of the loop without finding a viable combining instruction.
38045 return SDValue();
38046
38047 // Merge this node's mask and our incoming mask.
38048 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38049 for (int &M : Mask)
38050 M = VMask[M];
38051 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
38052 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
38053
38054 // Rebuild the chain around this new shuffle.
38055 while (!Chain.empty()) {
38056 SDValue W = Chain.pop_back_val();
38057
38058 if (V.getValueType() != W.getOperand(0).getValueType())
38059 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
38060
38061 switch (W.getOpcode()) {
38062 default:
38063     llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
38064
38065 case X86ISD::UNPCKL:
38066 case X86ISD::UNPCKH:
38067 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
38068 break;
38069
38070 case X86ISD::PSHUFD:
38071 case X86ISD::PSHUFLW:
38072 case X86ISD::PSHUFHW:
38073 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
38074 break;
38075 }
38076 }
38077 if (V.getValueType() != N.getValueType())
38078 V = DAG.getBitcast(N.getValueType(), V);
38079
38080 // Return the new chain to replace N.
38081 return V;
38082}
38083
38084// Attempt to commute shufps LHS loads:
38085// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
38086static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
38087 SelectionDAG &DAG) {
38088 // TODO: Add vXf64 support.
38089 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
38090 return SDValue();
38091
38092 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
38093 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
38094 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
38095 return SDValue();
38096 SDValue N0 = V.getOperand(0);
38097 SDValue N1 = V.getOperand(1);
38098 unsigned Imm = V.getConstantOperandVal(2);
38099 if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
38100 MayFoldLoad(peekThroughOneUseBitcasts(N1)))
38101 return SDValue();
38102 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
38103 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
38104 DAG.getTargetConstant(Imm, DL, MVT::i8));
38105 };
38106
38107 switch (N.getOpcode()) {
38108 case X86ISD::VPERMILPI:
38109 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
38110 unsigned Imm = N.getConstantOperandVal(1);
38111 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
38112 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
38113 }
38114 break;
38115 case X86ISD::SHUFP: {
38116 SDValue N0 = N.getOperand(0);
38117 SDValue N1 = N.getOperand(1);
38118 unsigned Imm = N.getConstantOperandVal(2);
38119 if (N0 == N1) {
38120 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
38121 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
38122 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
38123 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
38124 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
38125 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
38126 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
38127 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
38128 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
38129 }
38130 break;
38131 }
38132 }
38133
38134 return SDValue();
38135}
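// Illustrative aside (not in the original file): a minimal sketch of the
// immediate rewrites used above, assuming the usual SHUFPS encoding where the
// low nibble picks the two result elements taken from the first source and the
// high nibble the two taken from the second. Commuting the sources therefore
// swaps the nibbles, and the outer VPERMILPI/SHUFP immediate is XORed with
// 0xAA (or 0x0A/0xA0 when only one operand was commuted) to flip bit 1 of each
// affected 2-bit selector, compensating for the swapped 64-bit halves. The
// helper name below is hypothetical and purely for illustration.
static unsigned commuteShufpsImmSketch(unsigned Imm) {
  // Swap which nibble selects from which source operand.
  return ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4); // e.g. 0x1B -> 0xB1
}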
38136
38137// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
38138static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
38139 const SDLoc &DL) {
38140 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38141 EVT ShuffleVT = N.getValueType();
38142
38143 auto IsMergeableWithShuffle = [](SDValue Op) {
38144 // AllZeros/AllOnes constants are freely shuffled and will peek through
38145 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
38146 // merge with target shuffles if it has one use so shuffle combining is
38147 // likely to kick in.
38148 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
38149 ISD::isBuildVectorAllZeros(Op.getNode()) ||
38150 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
38151 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
38152 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
38153 };
38154 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
38155     // Ensure we only shuffle whole vector src elements, unless it's a logical
38156     // binop where we can more aggressively move shuffles from dst to src.
38157 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
38158 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
38159 };
38160
38161 unsigned Opc = N.getOpcode();
38162 switch (Opc) {
38163 // Unary and Unary+Permute Shuffles.
38164 case X86ISD::PSHUFB: {
38165 // Don't merge PSHUFB if it contains zero'd elements.
38166 SmallVector<int> Mask;
38167 SmallVector<SDValue> Ops;
38168 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
38169 Mask))
38170 break;
38171    LLVM_FALLTHROUGH;
38172 }
38173 case X86ISD::VBROADCAST:
38174 case X86ISD::MOVDDUP:
38175 case X86ISD::PSHUFD:
38176 case X86ISD::VPERMI:
38177 case X86ISD::VPERMILPI: {
38178 if (N.getOperand(0).getValueType() == ShuffleVT &&
38179 N->isOnlyUserOf(N.getOperand(0).getNode())) {
38180 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
38181 unsigned SrcOpcode = N0.getOpcode();
38182 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
38183 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
38184 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
38185 if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
38186 SDValue LHS, RHS;
38187 Op00 = DAG.getBitcast(ShuffleVT, Op00);
38188 Op01 = DAG.getBitcast(ShuffleVT, Op01);
38189 if (N.getNumOperands() == 2) {
38190 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
38191 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
38192 } else {
38193 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
38194 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
38195 }
38196 EVT OpVT = N0.getValueType();
38197 return DAG.getBitcast(ShuffleVT,
38198 DAG.getNode(SrcOpcode, DL, OpVT,
38199 DAG.getBitcast(OpVT, LHS),
38200 DAG.getBitcast(OpVT, RHS)));
38201 }
38202 }
38203 }
38204 break;
38205 }
38206 // Binary and Binary+Permute Shuffles.
38207 case X86ISD::INSERTPS: {
38208 // Don't merge INSERTPS if it contains zero'd elements.
38209 unsigned InsertPSMask = N.getConstantOperandVal(2);
38210 unsigned ZeroMask = InsertPSMask & 0xF;
38211 if (ZeroMask != 0)
38212 break;
38213    LLVM_FALLTHROUGH;
38214 }
38215 case X86ISD::MOVSD:
38216 case X86ISD::MOVSS:
38217 case X86ISD::BLENDI:
38218 case X86ISD::SHUFP:
38219 case X86ISD::UNPCKH:
38220 case X86ISD::UNPCKL: {
38221 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
38222 N->isOnlyUserOf(N.getOperand(1).getNode())) {
38223 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
38224 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
38225 unsigned SrcOpcode = N0.getOpcode();
38226 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
38227 IsSafeToMoveShuffle(N0, SrcOpcode) &&
38228 IsSafeToMoveShuffle(N1, SrcOpcode)) {
38229 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
38230 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
38231 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
38232 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
38233 // Ensure the total number of shuffles doesn't increase by folding this
38234 // shuffle through to the source ops.
38235 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
38236 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
38237 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
38238 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
38239 SDValue LHS, RHS;
38240 Op00 = DAG.getBitcast(ShuffleVT, Op00);
38241 Op10 = DAG.getBitcast(ShuffleVT, Op10);
38242 Op01 = DAG.getBitcast(ShuffleVT, Op01);
38243 Op11 = DAG.getBitcast(ShuffleVT, Op11);
38244 if (N.getNumOperands() == 3) {
38245 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
38246 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
38247 } else {
38248 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
38249 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
38250 }
38251 EVT OpVT = N0.getValueType();
38252 return DAG.getBitcast(ShuffleVT,
38253 DAG.getNode(SrcOpcode, DL, OpVT,
38254 DAG.getBitcast(OpVT, LHS),
38255 DAG.getBitcast(OpVT, RHS)));
38256 }
38257 }
38258 }
38259 break;
38260 }
38261 }
38262 return SDValue();
38263}
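// A minimal standalone sketch (illustration only, not part of this file) of the
// identity the canonicalization above relies on: for a pure lane permutation P
// with no zeroed elements and an element-wise binop, P(binop(X, Y)) equals
// binop(P(X), P(Y)). The combine only applies it when the operands are
// constants or single-use target shuffles, so the total shuffle count does not
// grow. The helper names below are made up for the sketch.
#include <array>
#include <cassert>

using Arr4 = std::array<int, 4>;

static Arr4 permute(const Arr4 &X, const Arr4 &P) {    // R[i] = X[P[i]]
  return { X[P[0]], X[P[1]], X[P[2]], X[P[3]] };
}
static Arr4 addEltwise(const Arr4 &X, const Arr4 &Y) { // element-wise binop
  return { X[0] + Y[0], X[1] + Y[1], X[2] + Y[2], X[3] + Y[3] };
}

static void checkShuffleOfBinop() {
  Arr4 X = {1, 2, 3, 4}, Y = {10, 20, 30, 40}, P = {2, 0, 3, 1};
  // SHUFFLE(BINOP(X, Y)) == BINOP(SHUFFLE(X), SHUFFLE(Y)).
  assert(permute(addEltwise(X, Y), P) ==
         addEltwise(permute(X, P), permute(Y, P)));
}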
38264
38265/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
38266static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
38267 SelectionDAG &DAG,
38268 const SDLoc &DL) {
38269  assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
38270
38271 MVT VT = V.getSimpleValueType();
38272 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
38273 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
38274 unsigned SrcOpc0 = Src0.getOpcode();
38275 unsigned SrcOpc1 = Src1.getOpcode();
38276 EVT SrcVT0 = Src0.getValueType();
38277 EVT SrcVT1 = Src1.getValueType();
38278
38279 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
38280 return SDValue();
38281
38282 switch (SrcOpc0) {
38283 case X86ISD::MOVDDUP: {
38284 SDValue LHS = Src0.getOperand(0);
38285 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
38286 SDValue Res =
38287 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
38288 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
38289 return DAG.getBitcast(VT, Res);
38290 }
38291 case X86ISD::VPERMILPI:
38292 // TODO: Handle v4f64 permutes with different low/high lane masks.
38293 if (SrcVT0 == MVT::v4f64) {
38294 uint64_t Mask = Src0.getConstantOperandVal(1);
38295 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
38296 break;
38297 }
38298    LLVM_FALLTHROUGH;
38299 case X86ISD::VSHLI:
38300 case X86ISD::VSRLI:
38301 case X86ISD::VSRAI:
38302 case X86ISD::PSHUFD:
38303 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
38304 SDValue LHS = Src0.getOperand(0);
38305 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
38306 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
38307 V.getOperand(2));
38308 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
38309 return DAG.getBitcast(VT, Res);
38310 }
38311 break;
38312 }
38313
38314 return SDValue();
38315}
38316
38317/// Try to combine x86 target specific shuffles.
38318static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
38319 TargetLowering::DAGCombinerInfo &DCI,
38320 const X86Subtarget &Subtarget) {
38321 SDLoc DL(N);
38322 MVT VT = N.getSimpleValueType();
38323 SmallVector<int, 4> Mask;
38324 unsigned Opcode = N.getOpcode();
38325
38326 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
38327 return R;
38328
38329 if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
38330 return R;
38331
38332 // Handle specific target shuffles.
38333 switch (Opcode) {
38334 case X86ISD::MOVDDUP: {
38335 SDValue Src = N.getOperand(0);
38336 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
38337 if (VT == MVT::v2f64 && Src.hasOneUse() &&
38338 ISD::isNormalLoad(Src.getNode())) {
38339 LoadSDNode *LN = cast<LoadSDNode>(Src);
38340 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
38341 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
38342 DCI.CombineTo(N.getNode(), Movddup);
38343 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
38344 DCI.recursivelyDeleteUnusedNodes(LN);
38345 return N; // Return N so it doesn't get rechecked!
38346 }
38347 }
38348
38349 return SDValue();
38350 }
38351 case X86ISD::VBROADCAST: {
38352 SDValue Src = N.getOperand(0);
38353 SDValue BC = peekThroughBitcasts(Src);
38354 EVT SrcVT = Src.getValueType();
38355 EVT BCVT = BC.getValueType();
38356
38357 // If broadcasting from another shuffle, attempt to simplify it.
38358 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
38359 if (isTargetShuffle(BC.getOpcode()) &&
38360 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
38361 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
38362 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
38363 SM_SentinelUndef);
38364 for (unsigned i = 0; i != Scale; ++i)
38365 DemandedMask[i] = i;
38366 if (SDValue Res = combineX86ShufflesRecursively(
38367 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
38368 X86::MaxShuffleCombineDepth,
38369 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
38370 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
38371 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
38372 DAG.getBitcast(SrcVT, Res));
38373 }
38374
38375 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
38376 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
38377 if (Src.getOpcode() == ISD::BITCAST &&
38378 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
38379 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
38380 FixedVectorType::isValidElementType(
38381 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
38382 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
38383 VT.getVectorNumElements());
38384 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
38385 }
38386
38387 // Reduce broadcast source vector to lowest 128-bits.
38388 if (SrcVT.getSizeInBits() > 128)
38389 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
38390 extract128BitVector(Src, 0, DAG, DL));
38391
38392 // broadcast(scalar_to_vector(x)) -> broadcast(x).
38393 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
38394 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
38395
38396 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
38397 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
38398 isNullConstant(Src.getOperand(1)) &&
38399 DAG.getTargetLoweringInfo().isTypeLegal(
38400 Src.getOperand(0).getValueType()))
38401 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
38402
38403 // Share broadcast with the longest vector and extract low subvector (free).
38404 // Ensure the same SDValue from the SDNode use is being used.
38405 for (SDNode *User : Src->uses())
38406 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
38407 Src == User->getOperand(0) &&
38408 User->getValueSizeInBits(0).getFixedSize() >
38409 VT.getFixedSizeInBits()) {
38410 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
38411 VT.getSizeInBits());
38412 }
38413
38414 // vbroadcast(scalarload X) -> vbroadcast_load X
38415 // For float loads, extract other uses of the scalar from the broadcast.
38416 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
38417 ISD::isNormalLoad(Src.getNode())) {
38418 LoadSDNode *LN = cast<LoadSDNode>(Src);
38419 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38420 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38421 SDValue BcastLd =
38422 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
38423 LN->getMemoryVT(), LN->getMemOperand());
38424 // If the load value is used only by N, replace it via CombineTo N.
38425 bool NoReplaceExtract = Src.hasOneUse();
38426 DCI.CombineTo(N.getNode(), BcastLd);
38427 if (NoReplaceExtract) {
38428 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38429 DCI.recursivelyDeleteUnusedNodes(LN);
38430 } else {
38431 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
38432 DAG.getIntPtrConstant(0, DL));
38433 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
38434 }
38435 return N; // Return N so it doesn't get rechecked!
38436 }
38437
38438 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
38439 // i16. So shrink it ourselves if we can make a broadcast_load.
38440 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
38441 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
38442      assert(Subtarget.hasAVX2() && "Expected AVX2");
38443 SDValue TruncIn = Src.getOperand(0);
38444
38445      // If this is a truncate of a non-extending load, we can just narrow it
38446      // to use a broadcast_load.
38447 if (ISD::isNormalLoad(TruncIn.getNode())) {
38448 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
38449        // Unless it's volatile or atomic.
38450 if (LN->isSimple()) {
38451 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38452 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38453 SDValue BcastLd = DAG.getMemIntrinsicNode(
38454 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
38455 LN->getPointerInfo(), LN->getOriginalAlign(),
38456 LN->getMemOperand()->getFlags());
38457 DCI.CombineTo(N.getNode(), BcastLd);
38458 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38459 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
38460 return N; // Return N so it doesn't get rechecked!
38461 }
38462 }
38463
38464 // If this is a truncate of an i16 extload, we can directly replace it.
38465 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
38466 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
38467 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
38468 if (LN->getMemoryVT().getSizeInBits() == 16) {
38469 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38470 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38471 SDValue BcastLd =
38472 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
38473 LN->getMemoryVT(), LN->getMemOperand());
38474 DCI.CombineTo(N.getNode(), BcastLd);
38475 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38476 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
38477 return N; // Return N so it doesn't get rechecked!
38478 }
38479 }
38480
38481      // If this is a truncate of a load that has been shifted right, we can
38482 // offset the pointer and use a narrower load.
38483 if (TruncIn.getOpcode() == ISD::SRL &&
38484 TruncIn.getOperand(0).hasOneUse() &&
38485 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
38486 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
38487 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
38488 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
38489 // Make sure the shift amount and the load size are divisible by 16.
38490 // Don't do this if the load is volatile or atomic.
38491 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
38492 LN->isSimple()) {
38493 unsigned Offset = ShiftAmt / 8;
38494 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38495 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
38496 TypeSize::Fixed(Offset), DL);
38497 SDValue Ops[] = { LN->getChain(), Ptr };
38498 SDValue BcastLd = DAG.getMemIntrinsicNode(
38499 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
38500 LN->getPointerInfo().getWithOffset(Offset),
38501 LN->getOriginalAlign(),
38502 LN->getMemOperand()->getFlags());
38503 DCI.CombineTo(N.getNode(), BcastLd);
38504 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38505 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
38506 return N; // Return N so it doesn't get rechecked!
38507 }
38508 }
38509 }
38510
38511 // vbroadcast(vzload X) -> vbroadcast_load X
38512 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
38513 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
38514 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
38515 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38516 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38517 SDValue BcastLd =
38518 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
38519 LN->getMemoryVT(), LN->getMemOperand());
38520 DCI.CombineTo(N.getNode(), BcastLd);
38521 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38522 DCI.recursivelyDeleteUnusedNodes(LN);
38523 return N; // Return N so it doesn't get rechecked!
38524 }
38525 }
38526
38527 // vbroadcast(vector load X) -> vbroadcast_load
38528 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
38529 SrcVT == MVT::v4i32) &&
38530 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
38531 LoadSDNode *LN = cast<LoadSDNode>(Src);
38532 // Unless the load is volatile or atomic.
38533 if (LN->isSimple()) {
38534 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38535 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38536 SDValue BcastLd = DAG.getMemIntrinsicNode(
38537 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
38538 LN->getPointerInfo(), LN->getOriginalAlign(),
38539 LN->getMemOperand()->getFlags());
38540 DCI.CombineTo(N.getNode(), BcastLd);
38541 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38542 DCI.recursivelyDeleteUnusedNodes(LN);
38543 return N; // Return N so it doesn't get rechecked!
38544 }
38545 }
38546
38547 return SDValue();
38548 }
38549 case X86ISD::VZEXT_MOVL: {
38550 SDValue N0 = N.getOperand(0);
38551
38552    // If this is a vzmovl of a full vector load, replace it with a vzload, unless
38553 // the load is volatile.
38554 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
38555 auto *LN = cast<LoadSDNode>(N0);
38556 if (SDValue VZLoad =
38557 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
38558 DCI.CombineTo(N.getNode(), VZLoad);
38559 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
38560 DCI.recursivelyDeleteUnusedNodes(LN);
38561 return N;
38562 }
38563 }
38564
38565    // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
38566 // and can just use a VZEXT_LOAD.
38567 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
38568 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
38569 auto *LN = cast<MemSDNode>(N0);
38570 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
38571 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38572 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38573 SDValue VZLoad =
38574 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
38575 LN->getMemoryVT(), LN->getMemOperand());
38576 DCI.CombineTo(N.getNode(), VZLoad);
38577 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
38578 DCI.recursivelyDeleteUnusedNodes(LN);
38579 return N;
38580 }
38581 }
38582
38583 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
38584 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
38585 // if the upper bits of the i64 are zero.
38586 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38587 N0.getOperand(0).hasOneUse() &&
38588 N0.getOperand(0).getValueType() == MVT::i64) {
38589 SDValue In = N0.getOperand(0);
38590 APInt Mask = APInt::getHighBitsSet(64, 32);
38591 if (DAG.MaskedValueIsZero(In, Mask)) {
38592 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
38593 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
38594 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
38595 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
38596 return DAG.getBitcast(VT, Movl);
38597 }
38598 }
38599
38600 // Load a scalar integer constant directly to XMM instead of transferring an
38601 // immediate value from GPR.
38602 // vzext_movl (scalar_to_vector C) --> load [C,0...]
38603 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38604 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
38605 // Create a vector constant - scalar constant followed by zeros.
38606 EVT ScalarVT = N0.getOperand(0).getValueType();
38607 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
38608 unsigned NumElts = VT.getVectorNumElements();
38609 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
38610 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
38611 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
38612
38613 // Load the vector constant from constant pool.
38614 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
38615 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
38616 MachinePointerInfo MPI =
38617 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
38618 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
38619 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
38620 MachineMemOperand::MOLoad);
38621 }
38622 }
38623
38624 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
38625 // insert into a zero vector. This helps get VZEXT_MOVL closer to
38626 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
38627 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
38628 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
38629 SDValue V = peekThroughOneUseBitcasts(N0);
38630
38631 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
38632 isNullConstant(V.getOperand(2))) {
38633 SDValue In = V.getOperand(1);
38634 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
38635 In.getValueSizeInBits() /
38636 VT.getScalarSizeInBits());
38637 In = DAG.getBitcast(SubVT, In);
38638 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
38639 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
38640 getZeroVector(VT, Subtarget, DAG, DL), Movl,
38641 V.getOperand(2));
38642 }
38643 }
38644
38645 return SDValue();
38646 }
38647 case X86ISD::BLENDI: {
38648 SDValue N0 = N.getOperand(0);
38649 SDValue N1 = N.getOperand(1);
38650
38651 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
38652 // TODO: Handle MVT::v16i16 repeated blend mask.
38653 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
38654 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
38655 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
38656 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
38657 SrcVT.getScalarSizeInBits() >= 32) {
38658 unsigned BlendMask = N.getConstantOperandVal(2);
38659 unsigned Size = VT.getVectorNumElements();
38660 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
38661 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
38662 return DAG.getBitcast(
38663 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
38664 N1.getOperand(0),
38665 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
38666 }
38667 }
38668 return SDValue();
38669 }
38670 case X86ISD::VPERMI: {
38671 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
38672 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
38673 SDValue N0 = N.getOperand(0);
38674 SDValue N1 = N.getOperand(1);
38675 unsigned EltSizeInBits = VT.getScalarSizeInBits();
38676 if (N0.getOpcode() == ISD::BITCAST &&
38677 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
38678 SDValue Src = N0.getOperand(0);
38679 EVT SrcVT = Src.getValueType();
38680 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
38681 return DAG.getBitcast(VT, Res);
38682 }
38683 return SDValue();
38684 }
38685 case X86ISD::VPERM2X128: {
38686 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
38687 SDValue LHS = N->getOperand(0);
38688 SDValue RHS = N->getOperand(1);
38689 if (LHS.getOpcode() == ISD::BITCAST &&
38690 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
38691 EVT SrcVT = LHS.getOperand(0).getValueType();
38692 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
38693 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
38694 DAG.getBitcast(SrcVT, LHS),
38695 DAG.getBitcast(SrcVT, RHS),
38696 N->getOperand(2)));
38697 }
38698 }
38699
38700 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
38701 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
38702 return Res;
38703
38704 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
38705 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
38706 auto FindSubVector128 = [&](unsigned Idx) {
38707 if (Idx > 3)
38708 return SDValue();
38709 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
38710 SmallVector<SDValue> SubOps;
38711 if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
38712 return SubOps[Idx & 1];
38713 unsigned NumElts = Src.getValueType().getVectorNumElements();
38714 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
38715 Src.getOperand(1).getValueSizeInBits() == 128 &&
38716 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
38717 return Src.getOperand(1);
38718 }
38719 return SDValue();
38720 };
38721 unsigned Imm = N.getConstantOperandVal(2);
38722 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
38723 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
38724 MVT SubVT = VT.getHalfNumVectorElementsVT();
38725 SubLo = DAG.getBitcast(SubVT, SubLo);
38726 SubHi = DAG.getBitcast(SubVT, SubHi);
38727 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
38728 }
38729 }
38730 return SDValue();
38731 }
38732 case X86ISD::PSHUFD:
38733 case X86ISD::PSHUFLW:
38734 case X86ISD::PSHUFHW:
38735 Mask = getPSHUFShuffleMask(N);
38736    assert(Mask.size() == 4);
38737 break;
38738 case X86ISD::MOVSD:
38739 case X86ISD::MOVSH:
38740 case X86ISD::MOVSS: {
38741 SDValue N0 = N.getOperand(0);
38742 SDValue N1 = N.getOperand(1);
38743
38744 // Canonicalize scalar FPOps:
38745 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
38746 // If commutable, allow OP(N1[0], N0[0]).
38747 unsigned Opcode1 = N1.getOpcode();
38748 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
38749 Opcode1 == ISD::FDIV) {
38750 SDValue N10 = N1.getOperand(0);
38751 SDValue N11 = N1.getOperand(1);
38752 if (N10 == N0 ||
38753 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
38754 if (N10 != N0)
38755 std::swap(N10, N11);
38756 MVT SVT = VT.getVectorElementType();
38757 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
38758 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
38759 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
38760 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
38761 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
38762 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
38763 }
38764 }
38765
38766 return SDValue();
38767 }
38768 case X86ISD::INSERTPS: {
38769    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
38770 SDValue Op0 = N.getOperand(0);
38771 SDValue Op1 = N.getOperand(1);
38772 unsigned InsertPSMask = N.getConstantOperandVal(2);
38773 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
38774 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
38775 unsigned ZeroMask = InsertPSMask & 0xF;
38776
38777 // If we zero out all elements from Op0 then we don't need to reference it.
38778 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
38779 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
38780 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38781
38782 // If we zero out the element from Op1 then we don't need to reference it.
38783 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
38784 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38785 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38786
38787 // Attempt to merge insertps Op1 with an inner target shuffle node.
38788 SmallVector<int, 8> TargetMask1;
38789 SmallVector<SDValue, 2> Ops1;
38790 APInt KnownUndef1, KnownZero1;
38791 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
38792 KnownZero1)) {
38793 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
38794 // Zero/UNDEF insertion - zero out element and remove dependency.
38795 InsertPSMask |= (1u << DstIdx);
38796 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38797 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38798 }
38799 // Update insertps mask srcidx and reference the source input directly.
38800 int M = TargetMask1[SrcIdx];
38801      assert(0 <= M && M < 8 && "Shuffle index out of range");
38802 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
38803 Op1 = Ops1[M < 4 ? 0 : 1];
38804 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38805 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38806 }
38807
38808 // Attempt to merge insertps Op0 with an inner target shuffle node.
38809 SmallVector<int, 8> TargetMask0;
38810 SmallVector<SDValue, 2> Ops0;
38811 APInt KnownUndef0, KnownZero0;
38812 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
38813 KnownZero0)) {
38814 bool Updated = false;
38815 bool UseInput00 = false;
38816 bool UseInput01 = false;
38817 for (int i = 0; i != 4; ++i) {
38818 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
38819 // No change if element is already zero or the inserted element.
38820 continue;
38821 } else if (KnownUndef0[i] || KnownZero0[i]) {
38822 // If the target mask is undef/zero then we must zero the element.
38823 InsertPSMask |= (1u << i);
38824 Updated = true;
38825 continue;
38826 }
38827
38828 // The input vector element must be inline.
38829 int M = TargetMask0[i];
38830 if (M != i && M != (i + 4))
38831 return SDValue();
38832
38833 // Determine which inputs of the target shuffle we're using.
38834 UseInput00 |= (0 <= M && M < 4);
38835 UseInput01 |= (4 <= M);
38836 }
38837
38838 // If we're not using both inputs of the target shuffle then use the
38839 // referenced input directly.
38840 if (UseInput00 && !UseInput01) {
38841 Updated = true;
38842 Op0 = Ops0[0];
38843 } else if (!UseInput00 && UseInput01) {
38844 Updated = true;
38845 Op0 = Ops0[1];
38846 }
38847
38848 if (Updated)
38849 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38850 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38851 }
38852
38853 // If we're inserting an element from a vbroadcast load, fold the
38854 // load into the X86insertps instruction. We need to convert the scalar
38855 // load to a vector and clear the source lane of the INSERTPS control.
38856 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
38857 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
38858 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
38859 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
38860 MemIntr->getBasePtr(),
38861 MemIntr->getMemOperand());
38862 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
38863 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
38864 Load),
38865 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
38866 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
38867 return Insert;
38868 }
38869 }
38870
38871 return SDValue();
38872 }
38873 default:
38874 return SDValue();
38875 }
38876
38877 // Nuke no-op shuffles that show up after combining.
38878 if (isNoopShuffleMask(Mask))
38879 return N.getOperand(0);
38880
38881 // Look for simplifications involving one or two shuffle instructions.
38882 SDValue V = N.getOperand(0);
38883 switch (N.getOpcode()) {
38884 default:
38885 break;
38886 case X86ISD::PSHUFLW:
38887 case X86ISD::PSHUFHW:
38888    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
38889
38890 // See if this reduces to a PSHUFD which is no more expensive and can
38891 // combine with more operations. Note that it has to at least flip the
38892 // dwords as otherwise it would have been removed as a no-op.
38893 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
38894 int DMask[] = {0, 1, 2, 3};
38895 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
38896 DMask[DOffset + 0] = DOffset + 1;
38897 DMask[DOffset + 1] = DOffset + 0;
38898 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
38899 V = DAG.getBitcast(DVT, V);
38900 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
38901 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
38902 return DAG.getBitcast(VT, V);
38903 }
38904
38905 // Look for shuffle patterns which can be implemented as a single unpack.
38906 // FIXME: This doesn't handle the location of the PSHUFD generically, and
38907 // only works when we have a PSHUFD followed by two half-shuffles.
38908 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
38909 (V.getOpcode() == X86ISD::PSHUFLW ||
38910 V.getOpcode() == X86ISD::PSHUFHW) &&
38911 V.getOpcode() != N.getOpcode() &&
38912 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
38913 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
38914 if (D.getOpcode() == X86ISD::PSHUFD) {
38915 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38916 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38917 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38918 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38919 int WordMask[8];
38920 for (int i = 0; i < 4; ++i) {
38921 WordMask[i + NOffset] = Mask[i] + NOffset;
38922 WordMask[i + VOffset] = VMask[i] + VOffset;
38923 }
38924 // Map the word mask through the DWord mask.
38925 int MappedMask[8];
38926 for (int i = 0; i < 8; ++i)
38927 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
38928 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38929 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38930 // We can replace all three shuffles with an unpack.
38931 V = DAG.getBitcast(VT, D.getOperand(0));
38932 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38933 : X86ISD::UNPCKH,
38934 DL, VT, V, V);
38935 }
38936 }
38937 }
38938
38939 break;
38940
38941 case X86ISD::PSHUFD:
38942 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38943 return NewN;
38944
38945 break;
38946 }
38947
38948 return SDValue();
38949}
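// A minimal standalone sketch (illustration only, not part of this file) of the
// PSHUFLW/PSHUFHW {2,3,0,1} -> PSHUFD reduction in the code above: swapping the
// two word pairs of one half is the same as swapping the corresponding dwords,
// so the word shuffle can be replaced by a dword shuffle that combines more
// readily. The helper names below are made up for the sketch.
#include <array>
#include <cassert>

using W8 = std::array<int, 8>; // eight i16 lanes modelled as ints

// PSHUFLW: permute the low four words by M, keep the high four words.
static W8 pshuflwModel(const W8 &V, const std::array<int, 4> &M) {
  return { V[M[0]], V[M[1]], V[M[2]], V[M[3]], V[4], V[5], V[6], V[7] };
}

// PSHUFD modelled on words: dword i is the word pair {2*i, 2*i+1}.
static W8 pshufdModel(const W8 &V, const std::array<int, 4> &M) {
  W8 R{};
  for (int i = 0; i != 4; ++i) {
    R[2 * i + 0] = V[2 * M[i] + 0];
    R[2 * i + 1] = V[2 * M[i] + 1];
  }
  return R;
}

static void checkPshuflwAsPshufd() {
  W8 V = {0, 1, 2, 3, 4, 5, 6, 7};
  // pshuflw <2,3,0,1> == pshufd <1,0,2,3> on the same vector.
  assert(pshuflwModel(V, {2, 3, 0, 1}) == pshufdModel(V, {1, 0, 2, 3}));
}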
38950
38951/// Checks if the shuffle mask takes subsequent elements
38952/// alternately from two vectors.
38953/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
38954static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38955
38956 int ParitySrc[2] = {-1, -1};
38957 unsigned Size = Mask.size();
38958 for (unsigned i = 0; i != Size; ++i) {
38959 int M = Mask[i];
38960 if (M < 0)
38961 continue;
38962
38963 // Make sure we are using the matching element from the input.
38964 if ((M % Size) != i)
38965 return false;
38966
38967 // Make sure we use the same input for all elements of the same parity.
38968 int Src = M / Size;
38969 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38970 return false;
38971 ParitySrc[i % 2] = Src;
38972 }
38973
38974 // Make sure each input is used.
38975 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38976 return false;
38977
38978 Op0Even = ParitySrc[0] == 0;
38979 return true;
38980}
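// A minimal standalone sketch (illustration only, not part of this file) showing
// which masks the predicate above accepts: every defined lane must read its own
// index from one of the two inputs, all even lanes must agree on one input and
// all odd lanes on the other. The helper below mirrors that logic with made-up
// names so the examples are runnable.
#include <cassert>
#include <vector>

static bool isAlternatingLaneMask(const std::vector<int> &Mask, bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  const int Size = static_cast<int>(Mask.size());
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // Undef lane: unconstrained.
    if (M % Size != i)
      return false;             // Must read its own lane index.
    int Src = M / Size;
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;             // A parity class switched inputs.
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;               // Both inputs must actually be used.
  Op0Even = ParitySrc[0] == 0;
  return true;
}

static void checkAlternatingMasks() {
  bool Op0Even = false;
  assert(isAlternatingLaneMask({0, 5, 2, 7}, Op0Even) && Op0Even);
  assert(!isAlternatingLaneMask({0, 5, 2, 3}, Op0Even)); // Lane 3 switches input.
}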
38981
38982/// Returns true iff the shuffle node \p N can be replaced with an
38983/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
38984/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
38985///
38986/// We combine the shuffle into ADDSUB(SUBADD) directly on the abstract vector
38987/// shuffle nodes so it is easier to generically match. We also insert dummy
38988/// vector shuffle nodes for the operands which explicitly discard the lanes
38989/// which are unused by this operation, so that the fact that they're unused
38990/// can flow through the rest of the combiner.
38991static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38992 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38993 bool &IsSubAdd) {
38994
38995 EVT VT = N->getValueType(0);
38996 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38997 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38998 !VT.getSimpleVT().isFloatingPoint())
38999 return false;
39000
39001 // We only handle target-independent shuffles.
39002 // FIXME: It would be easy and harmless to use the target shuffle mask
39003 // extraction tool to support more.
39004 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
39005 return false;
39006
39007 SDValue V1 = N->getOperand(0);
39008 SDValue V2 = N->getOperand(1);
39009
39010 // Make sure we have an FADD and an FSUB.
39011 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
39012 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
39013 V1.getOpcode() == V2.getOpcode())
39014 return false;
39015
39016 // If there are other uses of these operations we can't fold them.
39017 if (!V1->hasOneUse() || !V2->hasOneUse())
39018 return false;
39019
39020 // Ensure that both operations have the same operands. Note that we can
39021 // commute the FADD operands.
39022 SDValue LHS, RHS;
39023 if (V1.getOpcode() == ISD::FSUB) {
39024 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
39025 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
39026 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
39027 return false;
39028 } else {
39029    assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
39030 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
39031 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
39032 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
39033 return false;
39034 }
39035
39036 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
39037 bool Op0Even;
39038 if (!isAddSubOrSubAddMask(Mask, Op0Even))
39039 return false;
39040
39041 // It's a subadd if the vector in the even parity is an FADD.
39042 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
39043 : V2->getOpcode() == ISD::FADD;
39044
39045 Opnd0 = LHS;
39046 Opnd1 = RHS;
39047 return true;
39048}
39049
39050/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
39051static SDValue combineShuffleToFMAddSub(SDNode *N,
39052 const X86Subtarget &Subtarget,
39053 SelectionDAG &DAG) {
39054 // We only handle target-independent shuffles.
39055 // FIXME: It would be easy and harmless to use the target shuffle mask
39056 // extraction tool to support more.
39057 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
39058 return SDValue();
39059
39060 MVT VT = N->getSimpleValueType(0);
39061 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39062 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
39063 return SDValue();
39064
39065  // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
39066 SDValue Op0 = N->getOperand(0);
39067 SDValue Op1 = N->getOperand(1);
39068 SDValue FMAdd = Op0, FMSub = Op1;
39069 if (FMSub.getOpcode() != X86ISD::FMSUB)
39070 std::swap(FMAdd, FMSub);
39071
39072 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
39073 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
39074 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
39075 FMAdd.getOperand(2) != FMSub.getOperand(2))
39076 return SDValue();
39077
39078 // Check for correct shuffle mask.
39079 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
39080 bool Op0Even;
39081 if (!isAddSubOrSubAddMask(Mask, Op0Even))
39082 return SDValue();
39083
39084 // FMAddSub takes zeroth operand from FMSub node.
39085 SDLoc DL(N);
39086 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
39087 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
39088 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
39089 FMAdd.getOperand(2));
39090}
39091
39092/// Try to combine a shuffle into a target-specific add-sub or
39093/// mul-add-sub node.
39094static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
39095 const X86Subtarget &Subtarget,
39096 SelectionDAG &DAG) {
39097 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
39098 return V;
39099
39100 SDValue Opnd0, Opnd1;
39101 bool IsSubAdd;
39102 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
39103 return SDValue();
39104
39105 MVT VT = N->getSimpleValueType(0);
39106 SDLoc DL(N);
39107
39108 // Try to generate X86ISD::FMADDSUB node here.
39109 SDValue Opnd2;
39110 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
39111 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
39112 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
39113 }
39114
39115 if (IsSubAdd)
39116 return SDValue();
39117
39118 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
39119 // the ADDSUB idiom has been successfully recognized. There are no known
39120 // X86 targets with 512-bit ADDSUB instructions!
39121 if (VT.is512BitVector())
39122 return SDValue();
39123
39124 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
39125 // the ADDSUB idiom has been successfully recognized. There are no known
39126 // X86 targets with FP16 ADDSUB instructions!
39127 if (VT.getVectorElementType() == MVT::f16)
39128 return SDValue();
39129
39130 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
39131}
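// A minimal standalone sketch (illustration only, not part of this file) of the
// pattern these combines recognize: a shuffle that takes even lanes from
// FSUB(A, B) and odd lanes from FADD(A, B) computes exactly the ADDSUB result
// (subtract in even lanes, add in odd lanes). Names below are made up.
#include <array>
#include <cassert>

using F4 = std::array<float, 4>;

static void checkShuffleIsAddSub() {
  F4 A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  F4 Sub = {A[0] - B[0], A[1] - B[1], A[2] - B[2], A[3] - B[3]}; // FSUB(A, B)
  F4 Add = {A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]}; // FADD(A, B)
  // shuffle <0, 5, 2, 7> over concat(Sub, Add): even lanes from Sub, odd from Add.
  F4 Shuf = {Sub[0], Add[1], Sub[2], Add[3]};
  // ADDSUB semantics: subtract in even lanes, add in odd lanes.
  F4 AddSub = {A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
  assert(Shuf == AddSub);
}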
39132
39133// We are looking for a shuffle where both sources are concatenated with undef
39134// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
39135// if we can express this as a single-source shuffle, that's preferable.
39136static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
39137 const X86Subtarget &Subtarget) {
39138 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
39139 return SDValue();
39140
39141 EVT VT = N->getValueType(0);
39142
39143 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
39144 if (!VT.is128BitVector() && !VT.is256BitVector())
39145 return SDValue();
39146
39147 if (VT.getVectorElementType() != MVT::i32 &&
39148 VT.getVectorElementType() != MVT::i64 &&
39149 VT.getVectorElementType() != MVT::f32 &&
39150 VT.getVectorElementType() != MVT::f64)
39151 return SDValue();
39152
39153 SDValue N0 = N->getOperand(0);
39154 SDValue N1 = N->getOperand(1);
39155
39156 // Check that both sources are concats with undef.
39157 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
39158 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
39159 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
39160 !N1.getOperand(1).isUndef())
39161 return SDValue();
39162
39163 // Construct the new shuffle mask. Elements from the first source retain their
39164 // index, but elements from the second source no longer need to skip an undef.
39165 SmallVector<int, 8> Mask;
39166 int NumElts = VT.getVectorNumElements();
39167
39168 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
39169 for (int Elt : SVOp->getMask())
39170 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
39171
39172 SDLoc DL(N);
39173 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
39174 N1.getOperand(0));
39175 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
39176}
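// A minimal standalone sketch (illustration only, not part of this file) of the
// mask rewrite above for v8i32: with N0 = concat(t1, undef) and
// N1 = concat(t2, undef), the merged single source concat(t1, t2) keeps t1 in
// lanes 0..3 but moves t2 from lanes 8..11 of the two-source numbering down to
// lanes 4..7, hence Elt -> Elt - NumElts/2 for Elt >= NumElts. Names are made up.
#include <array>
#include <cassert>

static void checkConcatUndefMaskRemap() {
  const int NumElts = 8;
  std::array<int, 8> OldMask = {0, 1, 8, 9, 2, 3, 10, 11}; // two-source indices
  std::array<int, 8> NewMask{};
  for (int i = 0; i != NumElts; ++i)
    NewMask[i] = OldMask[i] < NumElts ? OldMask[i] : OldMask[i] - NumElts / 2;
  assert((NewMask == std::array<int, 8>{0, 1, 4, 5, 2, 3, 6, 7}));
}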
39177
39178/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
39179/// low half of each source vector and does not set any high half elements in
39180/// the destination vector, narrow the shuffle to half its original size.
39181static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
39182 if (!Shuf->getValueType(0).isSimple())
39183 return SDValue();
39184 MVT VT = Shuf->getSimpleValueType(0);
39185 if (!VT.is256BitVector() && !VT.is512BitVector())
39186 return SDValue();
39187
39188 // See if we can ignore all of the high elements of the shuffle.
39189 ArrayRef<int> Mask = Shuf->getMask();
39190 if (!isUndefUpperHalf(Mask))
39191 return SDValue();
39192
39193 // Check if the shuffle mask accesses only the low half of each input vector
39194 // (half-index output is 0 or 2).
39195 int HalfIdx1, HalfIdx2;
39196 SmallVector<int, 8> HalfMask(Mask.size() / 2);
39197 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
39198 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
39199 return SDValue();
39200
39201 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
39202 // The trick is knowing that all of the insert/extract are actually free
39203 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
39204 // of narrow inputs into a narrow output, and that is always cheaper than
39205 // the wide shuffle that we started with.
39206 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
39207 Shuf->getOperand(1), HalfMask, HalfIdx1,
39208 HalfIdx2, false, DAG, /*UseConcat*/true);
39209}
39210
39211static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
39212 TargetLowering::DAGCombinerInfo &DCI,
39213 const X86Subtarget &Subtarget) {
39214 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
39215 if (SDValue V = narrowShuffle(Shuf, DAG))
39216 return V;
39217
39218 // If we have legalized the vector types, look for blends of FADD and FSUB
39219 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
39220 SDLoc dl(N);
39221 EVT VT = N->getValueType(0);
39222 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39223 if (TLI.isTypeLegal(VT))
39224 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
39225 return AddSub;
39226
39227 // Attempt to combine into a vector load/broadcast.
39228 if (SDValue LD = combineToConsecutiveLoads(
39229 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
39230 return LD;
39231
39232 // For AVX2, we sometimes want to combine
39233 // (vector_shuffle <mask> (concat_vectors t1, undef)
39234 // (concat_vectors t2, undef))
39235 // Into:
39236 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
39237 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
39238 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
39239 return ShufConcat;
39240
39241 if (isTargetShuffle(N->getOpcode())) {
39242 SDValue Op(N, 0);
39243 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
39244 return Shuffle;
39245
39246 // Try recursively combining arbitrary sequences of x86 shuffle
39247 // instructions into higher-order shuffles. We do this after combining
39248 // specific PSHUF instruction sequences into their minimal form so that we
39249 // can evaluate how many specialized shuffle instructions are involved in
39250 // a particular chain.
39251 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
39252 return Res;
39253
39254 // Simplify source operands based on shuffle mask.
39255 // TODO - merge this into combineX86ShufflesRecursively.
39256 APInt KnownUndef, KnownZero;
39257 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
39258 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
39259 DCI))
39260 return SDValue(N, 0);
39261 }
39262
39263 return SDValue();
39264}
39265
39266// Simplify variable target shuffle masks based on the demanded elements.
39267// TODO: Handle DemandedBits in mask indices as well?
39268bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
39269 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
39270 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
39271  // If we're demanding all elements, don't bother trying to simplify the mask.
39272 unsigned NumElts = DemandedElts.getBitWidth();
39273 if (DemandedElts.isAllOnesValue())
39274 return false;
39275
39276 SDValue Mask = Op.getOperand(MaskIndex);
39277 if (!Mask.hasOneUse())
39278 return false;
39279
39280 // Attempt to generically simplify the variable shuffle mask.
39281 APInt MaskUndef, MaskZero;
39282 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
39283 Depth + 1))
39284 return true;
39285
39286 // Attempt to extract+simplify a (constant pool load) shuffle mask.
39287 // TODO: Support other types from getTargetShuffleMaskIndices?
39288 SDValue BC = peekThroughOneUseBitcasts(Mask);
39289 EVT BCVT = BC.getValueType();
39290 auto *Load = dyn_cast<LoadSDNode>(BC);
39291 if (!Load)
39292 return false;
39293
39294 const Constant *C = getTargetConstantFromNode(Load);
39295 if (!C)
39296 return false;
39297
39298 Type *CTy = C->getType();
39299 if (!CTy->isVectorTy() ||
39300 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
39301 return false;
39302
39303 // Handle scaling for i64 elements on 32-bit targets.
39304 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
39305 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
39306 return false;
39307 unsigned Scale = NumCstElts / NumElts;
39308
39309 // Simplify mask if we have an undemanded element that is not undef.
39310 bool Simplified = false;
39311 SmallVector<Constant *, 32> ConstVecOps;
39312 for (unsigned i = 0; i != NumCstElts; ++i) {
39313 Constant *Elt = C->getAggregateElement(i);
39314 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
39315 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
39316 Simplified = true;
39317 continue;
39318 }
39319 ConstVecOps.push_back(Elt);
39320 }
39321 if (!Simplified)
39322 return false;
39323
39324 // Generate new constant pool entry + legalize immediately for the load.
39325 SDLoc DL(Op);
39326 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
39327 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
39328 SDValue NewMask = TLO.DAG.getLoad(
39329 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
39330 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
39331 Load->getAlign());
39332 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
39333}
39334
39335bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
39336 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
39337 TargetLoweringOpt &TLO, unsigned Depth) const {
39338 int NumElts = DemandedElts.getBitWidth();
39339 unsigned Opc = Op.getOpcode();
39340 EVT VT = Op.getValueType();
39341
39342 // Handle special case opcodes.
39343 switch (Opc) {
39344 case X86ISD::PMULDQ:
39345 case X86ISD::PMULUDQ: {
39346 APInt LHSUndef, LHSZero;
39347 APInt RHSUndef, RHSZero;
39348 SDValue LHS = Op.getOperand(0);
39349 SDValue RHS = Op.getOperand(1);
39350 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
39351 Depth + 1))
39352 return true;
39353 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
39354 Depth + 1))
39355 return true;
39356 // Multiply by zero.
39357 KnownZero = LHSZero | RHSZero;
39358 break;
39359 }
39360 case X86ISD::VSHL:
39361 case X86ISD::VSRL:
39362 case X86ISD::VSRA: {
39363 // We only need the bottom 64-bits of the (128-bit) shift amount.
39364 SDValue Amt = Op.getOperand(1);
39365 MVT AmtVT = Amt.getSimpleValueType();
39366    assert(AmtVT.is128BitVector() && "Unexpected value type");
39367
39368    // If we reuse the shift amount just for SSE shift amounts then we know
39369    // that only the bottom 64 bits are ever used.
39370 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
39371 unsigned UseOpc = Use->getOpcode();
39372 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
39373 UseOpc == X86ISD::VSRA) &&
39374 Use->getOperand(0) != Amt;
39375 });
39376
39377 APInt AmtUndef, AmtZero;
39378 unsigned NumAmtElts = AmtVT.getVectorNumElements();
39379 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
39380 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
39381 Depth + 1, AssumeSingleUse))
39382 return true;
39383    LLVM_FALLTHROUGH;
39384 }
39385 case X86ISD::VSHLI:
39386 case X86ISD::VSRLI:
39387 case X86ISD::VSRAI: {
39388 SDValue Src = Op.getOperand(0);
39389 APInt SrcUndef;
39390 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
39391 Depth + 1))
39392 return true;
39393
39394 // Aggressively peek through ops to get at the demanded elts.
39395 if (!DemandedElts.isAllOnesValue())
39396 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
39397 Src, DemandedElts, TLO.DAG, Depth + 1))
39398 return TLO.CombineTo(
39399 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
39400 break;
39401 }
39402 case X86ISD::KSHIFTL: {
39403 SDValue Src = Op.getOperand(0);
39404 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
39405    assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
39406 unsigned ShiftAmt = Amt->getZExtValue();
39407
39408 if (ShiftAmt == 0)
39409 return TLO.CombineTo(Op, Src);
39410
39411 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39412 // single shift. We can do this if the bottom bits (which are shifted
39413 // out) are never demanded.
39414 if (Src.getOpcode() == X86ISD::KSHIFTR) {
39415 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
39416 unsigned C1 = Src.getConstantOperandVal(1);
39417 unsigned NewOpc = X86ISD::KSHIFTL;
39418 int Diff = ShiftAmt - C1;
39419 if (Diff < 0) {
39420 Diff = -Diff;
39421 NewOpc = X86ISD::KSHIFTR;
39422 }
39423
39424 SDLoc dl(Op);
39425 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
39426 return TLO.CombineTo(
39427 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
39428 }
39429 }
39430
39431 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
39432 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
39433 Depth + 1))
39434 return true;
39435
39436 KnownUndef <<= ShiftAmt;
39437 KnownZero <<= ShiftAmt;
39438 KnownZero.setLowBits(ShiftAmt);
39439 break;
39440 }
39441 case X86ISD::KSHIFTR: {
39442 SDValue Src = Op.getOperand(0);
39443 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
39444    assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
39445 unsigned ShiftAmt = Amt->getZExtValue();
39446
39447 if (ShiftAmt == 0)
39448 return TLO.CombineTo(Op, Src);
39449
39450 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
39451 // single shift. We can do this if the top bits (which are shifted
39452 // out) are never demanded.
39453 if (Src.getOpcode() == X86ISD::KSHIFTL) {
39454 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
39455 unsigned C1 = Src.getConstantOperandVal(1);
39456 unsigned NewOpc = X86ISD::KSHIFTR;
39457 int Diff = ShiftAmt - C1;
39458 if (Diff < 0) {
39459 Diff = -Diff;
39460 NewOpc = X86ISD::KSHIFTL;
39461 }
39462
39463 SDLoc dl(Op);
39464 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
39465 return TLO.CombineTo(
39466 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
39467 }
39468 }
39469
39470 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
39471 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
39472 Depth + 1))
39473 return true;
39474
39475 KnownUndef.lshrInPlace(ShiftAmt);
39476 KnownZero.lshrInPlace(ShiftAmt);
39477 KnownZero.setHighBits(ShiftAmt);
39478 break;
39479 }
39480 case X86ISD::CVTSI2P:
39481 case X86ISD::CVTUI2P: {
39482 SDValue Src = Op.getOperand(0);
39483 MVT SrcVT = Src.getSimpleValueType();
39484 APInt SrcUndef, SrcZero;
39485 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39486 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
39487 Depth + 1))
39488 return true;
39489 break;
39490 }
39491 case X86ISD::PACKSS:
39492 case X86ISD::PACKUS: {
39493 SDValue N0 = Op.getOperand(0);
39494 SDValue N1 = Op.getOperand(1);
39495
39496 APInt DemandedLHS, DemandedRHS;
39497 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
39498
39499 APInt LHSUndef, LHSZero;
39500 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
39501 Depth + 1))
39502 return true;
39503 APInt RHSUndef, RHSZero;
39504 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
39505 Depth + 1))
39506 return true;
39507
39508 // TODO - pass on known zero/undef.
39509
39510 // Aggressively peek through ops to get at the demanded elts.
39511 // TODO - we should do this for all target/faux shuffles ops.
39512 if (!DemandedElts.isAllOnesValue()) {
39513 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
39514 TLO.DAG, Depth + 1);
39515 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
39516 TLO.DAG, Depth + 1);
39517 if (NewN0 || NewN1) {
39518 NewN0 = NewN0 ? NewN0 : N0;
39519 NewN1 = NewN1 ? NewN1 : N1;
39520 return TLO.CombineTo(Op,
39521 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
39522 }
39523 }
39524 break;
39525 }
39526 case X86ISD::HADD:
39527 case X86ISD::HSUB:
39528 case X86ISD::FHADD:
39529 case X86ISD::FHSUB: {
39530 SDValue N0 = Op.getOperand(0);
39531 SDValue N1 = Op.getOperand(1);
39532
39533 APInt DemandedLHS, DemandedRHS;
39534 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
39535
39536 APInt LHSUndef, LHSZero;
39537 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
39538 Depth + 1))
39539 return true;
39540 APInt RHSUndef, RHSZero;
39541 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
39542 Depth + 1))
39543 return true;
39544
39545 // TODO - pass on known zero/undef.
39546
39547 // Aggressively peek through ops to get at the demanded elts.
39548 // TODO: Handle repeated operands.
39549 if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
39550 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
39551 TLO.DAG, Depth + 1);
39552 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
39553 TLO.DAG, Depth + 1);
39554 if (NewN0 || NewN1) {
39555 NewN0 = NewN0 ? NewN0 : N0;
39556 NewN1 = NewN1 ? NewN1 : N1;
39557 return TLO.CombineTo(Op,
39558 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
39559 }
39560 }
39561 break;
39562 }
39563 case X86ISD::VTRUNC:
39564 case X86ISD::VTRUNCS:
39565 case X86ISD::VTRUNCUS: {
39566 SDValue Src = Op.getOperand(0);
39567 MVT SrcVT = Src.getSimpleValueType();
39568 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39569 APInt SrcUndef, SrcZero;
39570 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
39571 Depth + 1))
39572 return true;
39573 KnownZero = SrcZero.zextOrTrunc(NumElts);
39574 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
39575 break;
39576 }
39577 case X86ISD::BLENDV: {
39578 APInt SelUndef, SelZero;
39579 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
39580 SelZero, TLO, Depth + 1))
39581 return true;
39582
39583 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
39584 APInt LHSUndef, LHSZero;
39585 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
39586 LHSZero, TLO, Depth + 1))
39587 return true;
39588
39589 APInt RHSUndef, RHSZero;
39590 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
39591 RHSZero, TLO, Depth + 1))
39592 return true;
39593
39594 KnownZero = LHSZero & RHSZero;
39595 KnownUndef = LHSUndef & RHSUndef;
39596 break;
39597 }
39598 case X86ISD::VZEXT_MOVL: {
39599 // If upper demanded elements are already zero then we have nothing to do.
39600 SDValue Src = Op.getOperand(0);
39601 APInt DemandedUpperElts = DemandedElts;
39602 DemandedUpperElts.clearLowBits(1);
39603 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
39604 return TLO.CombineTo(Op, Src);
39605 break;
39606 }
39607 case X86ISD::VBROADCAST: {
39608 SDValue Src = Op.getOperand(0);
39609 MVT SrcVT = Src.getSimpleValueType();
39610 if (!SrcVT.isVector())
39611 break;
39612 // Don't bother broadcasting if we just need the 0'th element.
39613 if (DemandedElts == 1) {
39614 if (Src.getValueType() != VT)
39615 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
39616 SDLoc(Op));
39617 return TLO.CombineTo(Op, Src);
39618 }
39619 APInt SrcUndef, SrcZero;
39620 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
39621 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
39622 Depth + 1))
39623 return true;
39624 // Aggressively peek through src to get at the demanded elt.
39625 // TODO - we should do this for all target/faux shuffles ops.
39626 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
39627 Src, SrcElts, TLO.DAG, Depth + 1))
39628 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39629 break;
39630 }
39631 case X86ISD::VPERMV:
39632 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
39633 Depth))
39634 return true;
39635 break;
39636 case X86ISD::PSHUFB:
39637 case X86ISD::VPERMV3:
39638 case X86ISD::VPERMILPV:
39639 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
39640 Depth))
39641 return true;
39642 break;
39643 case X86ISD::VPPERM:
39644 case X86ISD::VPERMIL2:
39645 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
39646 Depth))
39647 return true;
39648 break;
39649 }
39650
39651 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
39652 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
39653 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
39654 if ((VT.is256BitVector() || VT.is512BitVector()) &&
39655 DemandedElts.lshr(NumElts / 2) == 0) {
39656 unsigned SizeInBits = VT.getSizeInBits();
39657 unsigned ExtSizeInBits = SizeInBits / 2;
39658
39659 // See if 512-bit ops only use the bottom 128-bits.
39660 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
39661 ExtSizeInBits = SizeInBits / 4;
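          // ExtSizeInBits is the narrowed width (half the vector, or a quarter
          // for 512-bit ops that only use the low 128 bits) that still covers
          // every demanded element; the cases below rebuild the op at that
          // width and reinsert the result into an undef vector of the original
          // type.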
39662
39663 switch (Opc) {
39664 // Scalar broadcast.
39665 case X86ISD::VBROADCAST: {
39666 SDLoc DL(Op);
39667 SDValue Src = Op.getOperand(0);
39668 if (Src.getValueSizeInBits() > ExtSizeInBits)
39669 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
39670 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39671 ExtSizeInBits / VT.getScalarSizeInBits());
39672 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
39673 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39674 TLO.DAG, DL, ExtSizeInBits));
39675 }
39676 case X86ISD::VBROADCAST_LOAD: {
39677 SDLoc DL(Op);
39678 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39679 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39680 ExtSizeInBits / VT.getScalarSizeInBits());
39681 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39682 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39683 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
39684 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
39685 MemIntr->getMemOperand());
39686 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39687 Bcst.getValue(1));
39688 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39689 TLO.DAG, DL, ExtSizeInBits));
39690 }
39691 // Subvector broadcast.
39692 case X86ISD::SUBV_BROADCAST_LOAD: {
39693 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39694 EVT MemVT = MemIntr->getMemoryVT();
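            // If the narrowed width matches the broadcast's memory width, a
            // plain load of the subvector is enough; otherwise rebuild a
            // narrower SUBV_BROADCAST_LOAD below.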
39695 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
39696 SDLoc DL(Op);
39697 SDValue Ld =
39698 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
39699 MemIntr->getBasePtr(), MemIntr->getMemOperand());
39700 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39701 Ld.getValue(1));
39702 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
39703 TLO.DAG, DL, ExtSizeInBits));
39704 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
39705 SDLoc DL(Op);
39706 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39707 ExtSizeInBits / VT.getScalarSizeInBits());
39708 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39709 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39710 SDValue Bcst =
39711 TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
39712 Ops, MemVT, MemIntr->getMemOperand());
39713 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39714 Bcst.getValue(1));
39715 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39716 TLO.DAG, DL, ExtSizeInBits));
39717 }
39718 break;
39719 }
39720 // Byte shifts by immediate.
39721 case X86ISD::VSHLDQ:
39722 case X86ISD::VSRLDQ:
39723 // Shift by uniform.
39724 case X86ISD::VSHL:
39725 case X86ISD::VSRL:
39726 case X86ISD::VSRA:
39727 // Shift by immediate.
39728 case X86ISD::VSHLI:
39729 case X86ISD::VSRLI:
39730 case X86ISD::VSRAI: {
39731 SDLoc DL(Op);
39732 SDValue Ext0 =
39733 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
39734 SDValue ExtOp =
39735 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
39736 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39737 SDValue Insert =
39738 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39739 return TLO.CombineTo(Op, Insert);
39740 }
39741 case X86ISD::VPERMI: {
39742 // Simplify PERMPD/PERMQ to extract_subvector.
39743 // TODO: This should be done in shuffle combining.
39744 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
39745 SmallVector<int, 4> Mask;
39746 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
39747 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
39748 SDLoc DL(Op);
39749 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
39750 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39751 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
39752 return TLO.CombineTo(Op, Insert);
39753 }
39754 }
39755 break;
39756 }
39757 case X86ISD::VPERM2X128: {
39758 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
39759 SDLoc DL(Op);
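          // The low nibble of the immediate controls the low 128-bit half of
          // the result: bit 3 zeroes it, bit 1 selects which source operand is
          // read, and bit 0 selects that operand's low or high 128-bit lane.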
39760 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
39761 if (LoMask & 0x8)
39762 return TLO.CombineTo(
39763 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
39764 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
39765 unsigned SrcIdx = (LoMask & 0x2) >> 1;
39766 SDValue ExtOp =
39767 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
39768 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39769 SDValue Insert =
39770 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39771 return TLO.CombineTo(Op, Insert);
39772 }
39773 // Zero upper elements.
39774 case X86ISD::VZEXT_MOVL:
39775 // Target unary shuffles by immediate:
39776 case X86ISD::PSHUFD:
39777 case X86ISD::PSHUFLW:
39778 case X86ISD::PSHUFHW:
39779 case X86ISD::VPERMILPI:
39780 // (Non-Lane Crossing) Target Shuffles.
39781 case X86ISD::VPERMILPV:
39782 case X86ISD::VPERMIL2:
39783 case X86ISD::PSHUFB:
39784 case X86ISD::UNPCKL:
39785 case X86ISD::UNPCKH:
39786 case X86ISD::BLENDI:
39787 // Integer ops.
39788 case X86ISD::AVG:
39789 case X86ISD::PACKSS:
39790 case X86ISD::PACKUS:
39791 // Horizontal Ops.
39792 case X86ISD::HADD:
39793 case X86ISD::HSUB:
39794 case X86ISD::FHADD:
39795 case X86ISD::FHSUB: {
39796 SDLoc DL(Op);
39797 SmallVector<SDValue, 4> Ops;
39798 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
39799 SDValue SrcOp = Op.getOperand(i);
39800 EVT SrcVT = SrcOp.getValueType();
39801         assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
39802                "Unsupported vector size");
39803 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
39804 ExtSizeInBits)
39805 : SrcOp);
39806 }
39807 MVT ExtVT = VT.getSimpleVT();
39808 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
39809 ExtSizeInBits / ExtVT.getScalarSizeInBits());
39810 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
39811 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39812 SDValue Insert =
39813 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39814 return TLO.CombineTo(Op, Insert);
39815 }
39816 }
39817 }
39818
39819 // Get target/faux shuffle mask.
39820 APInt OpUndef, OpZero;
39821 SmallVector<int, 64> OpMask;
39822 SmallVector<SDValue, 2> OpInputs;
39823 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
39824 OpZero, TLO.DAG, Depth, false))
39825 return false;
39826
39827 // Shuffle inputs must be the same size as the result.
39828 if (OpMask.size() != (unsigned)NumElts ||
39829 llvm::any_of(OpInputs, [VT](SDValue V) {
39830 return VT.getSizeInBits() != V.getValueSizeInBits() ||
39831 !V.getValueType().isVector();
39832 }))
39833 return false;
39834
39835 KnownZero = OpZero;
39836 KnownUndef = OpUndef;
39837
39838 // Check if shuffle mask can be simplified to undef/zero/identity.
39839 int NumSrcs = OpInputs.size();
39840 for (int i = 0; i != NumElts; ++i)
39841 if (!DemandedElts[i])
39842 OpMask[i] = SM_SentinelUndef;
39843
39844 if (isUndefInRange(OpMask, 0, NumElts)) {
39845 KnownUndef.setAllBits();
39846 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
39847 }
39848 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
39849 KnownZero.setAllBits();
39850 return TLO.CombineTo(
39851 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
39852 }
39853 for (int Src = 0; Src != NumSrcs; ++Src)
39854 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
39855 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
39856
39857 // Attempt to simplify inputs.
39858 for (int Src = 0; Src != NumSrcs; ++Src) {
39859 // TODO: Support inputs of different types.
39860 if (OpInputs[Src].getValueType() != VT)
39861 continue;
39862
39863 int Lo = Src * NumElts;
39864 APInt SrcElts = APInt::getNullValue(NumElts);
39865 for (int i = 0; i != NumElts; ++i)
39866 if (DemandedElts[i]) {
39867 int M = OpMask[i] - Lo;
39868 if (0 <= M && M < NumElts)
39869 SrcElts.setBit(M);
39870 }
39871
39872 // TODO - Propagate input undef/zero elts.
39873 APInt SrcUndef, SrcZero;
39874 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
39875 TLO, Depth + 1))
39876 return true;
39877 }
39878
39879 // If we don't demand all elements, then attempt to combine to a simpler
39880 // shuffle.
39881 // We need to convert the depth to something combineX86ShufflesRecursively
39882   // can handle - so pretend it's Depth == 0 again, and reduce the max depth
39883 // to match. This prevents combineX86ShuffleChain from returning a
39884 // combined shuffle that's the same as the original root, causing an
39885 // infinite loop.
39886 if (!DemandedElts.isAllOnesValue()) {
39887     assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
39888
39889 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
39890 for (int i = 0; i != NumElts; ++i)
39891 if (DemandedElts[i])
39892 DemandedMask[i] = i;
39893
39894 SDValue NewShuffle = combineX86ShufflesRecursively(
39895 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
39896 /*HasVarMask*/ false,
39897 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
39898 Subtarget);
39899 if (NewShuffle)
39900 return TLO.CombineTo(Op, NewShuffle);
39901 }
39902
39903 return false;
39904}
39905
39906bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
39907 SDValue Op, const APInt &OriginalDemandedBits,
39908 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
39909 unsigned Depth) const {
39910 EVT VT = Op.getValueType();
39911 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
39912 unsigned Opc = Op.getOpcode();
39913 switch(Opc) {
39914 case X86ISD::VTRUNC: {
39915 KnownBits KnownOp;
39916 SDValue Src = Op.getOperand(0);
39917 MVT SrcVT = Src.getSimpleValueType();
39918
39919 // Simplify the input, using demanded bit information.
39920 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
39921 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
39922 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
39923 return true;
39924 break;
39925 }
39926 case X86ISD::PMULDQ:
39927 case X86ISD::PMULUDQ: {
39928 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
39929 KnownBits KnownOp;
39930 SDValue LHS = Op.getOperand(0);
39931 SDValue RHS = Op.getOperand(1);
39932 // FIXME: Can we bound this better?
39933 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
39934 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
39935 TLO, Depth + 1))
39936 return true;
39937 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
39938 TLO, Depth + 1))
39939 return true;
39940
39941 // Aggressively peek through ops to get at the demanded low bits.
39942 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39943 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39944 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39945 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39946 if (DemandedLHS || DemandedRHS) {
39947 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39948 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39949 return TLO.CombineTo(
39950 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39951 }
39952 break;
39953 }
39954 case X86ISD::VSHLI: {
39955 SDValue Op0 = Op.getOperand(0);
39956
39957 unsigned ShAmt = Op.getConstantOperandVal(1);
39958 if (ShAmt >= BitWidth)
39959 break;
39960
39961 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39962
39963 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39964 // single shift. We can do this if the bottom bits (which are shifted
39965 // out) are never demanded.
39966 if (Op0.getOpcode() == X86ISD::VSRLI &&
39967 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39968 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39969 if (Shift2Amt < BitWidth) {
39970 int Diff = ShAmt - Shift2Amt;
39971 if (Diff == 0)
39972 return TLO.CombineTo(Op, Op0.getOperand(0));
39973
39974 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39975 SDValue NewShift = TLO.DAG.getNode(
39976 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39977 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39978 return TLO.CombineTo(Op, NewShift);
39979 }
39980 }
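          // Hypothetical worked example: (vshli (vsrli X, 3), 5) with the low
          // five result bits undemanded gives Diff = 5 - 3 = 2, so the pair
          // folds to a single (vshli X, 2).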
39981
39982 // If we are only demanding sign bits then we can use the shift source directly.
39983 unsigned NumSignBits =
39984 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39985 unsigned UpperDemandedBits =
39986 BitWidth - OriginalDemandedBits.countTrailingZeros();
39987 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39988 return TLO.CombineTo(Op, Op0);
39989
39990 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39991 TLO, Depth + 1))
39992 return true;
39993
39994     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39995 Known.Zero <<= ShAmt;
39996 Known.One <<= ShAmt;
39997
39998 // Low bits known zero.
39999 Known.Zero.setLowBits(ShAmt);
40000 return false;
40001 }
40002 case X86ISD::VSRLI: {
40003 unsigned ShAmt = Op.getConstantOperandVal(1);
40004 if (ShAmt >= BitWidth)
40005 break;
40006
40007 APInt DemandedMask = OriginalDemandedBits << ShAmt;
40008
40009 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
40010 OriginalDemandedElts, Known, TLO, Depth + 1))
40011 return true;
40012
40013     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
40014 Known.Zero.lshrInPlace(ShAmt);
40015 Known.One.lshrInPlace(ShAmt);
40016
40017 // High bits known zero.
40018 Known.Zero.setHighBits(ShAmt);
40019 return false;
40020 }
40021 case X86ISD::VSRAI: {
40022 SDValue Op0 = Op.getOperand(0);
40023 SDValue Op1 = Op.getOperand(1);
40024
40025 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
40026 if (ShAmt >= BitWidth)
40027 break;
40028
40029 APInt DemandedMask = OriginalDemandedBits << ShAmt;
40030
40031 // If we just want the sign bit then we don't need to shift it.
40032 if (OriginalDemandedBits.isSignMask())
40033 return TLO.CombineTo(Op, Op0);
40034
40035 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
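          // Shifting left by C1 and arithmetic-shifting back by C1 only
          // recreates sign bits X already has, so the round trip is a no-op
          // whenever X has more than C1 sign bits.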
40036 if (Op0.getOpcode() == X86ISD::VSHLI &&
40037 Op.getOperand(1) == Op0.getOperand(1)) {
40038 SDValue Op00 = Op0.getOperand(0);
40039 unsigned NumSignBits =
40040 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
40041 if (ShAmt < NumSignBits)
40042 return TLO.CombineTo(Op, Op00);
40043 }
40044
40045 // If any of the demanded bits are produced by the sign extension, we also
40046 // demand the input sign bit.
40047 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
40048 DemandedMask.setSignBit();
40049
40050 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
40051 TLO, Depth + 1))
40052 return true;
40053
40054     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
40055 Known.Zero.lshrInPlace(ShAmt);
40056 Known.One.lshrInPlace(ShAmt);
40057
40058 // If the input sign bit is known to be zero, or if none of the top bits
40059 // are demanded, turn this into an unsigned shift right.
40060 if (Known.Zero[BitWidth - ShAmt - 1] ||
40061 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
40062 return TLO.CombineTo(
40063 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
40064
40065 // High bits are known one.
40066 if (Known.One[BitWidth - ShAmt - 1])
40067 Known.One.setHighBits(ShAmt);
40068 return false;
40069 }
40070 case X86ISD::PEXTRB:
40071 case X86ISD::PEXTRW: {
40072 SDValue Vec = Op.getOperand(0);
40073 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
40074 MVT VecVT = Vec.getSimpleValueType();
40075 unsigned NumVecElts = VecVT.getVectorNumElements();
40076
40077 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
40078 unsigned Idx = CIdx->getZExtValue();
40079 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
40080
40081 // If we demand no bits from the vector then we must have demanded
40082       // bits from the implicit zext - simplify to zero.
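            // (PEXTRB/PEXTRW zero-extend the extracted element into the 32-bit
            // result, so every demanded bit above the element width comes from
            // that implicit zext and is known to be zero.)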
40083 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
40084 if (DemandedVecBits == 0)
40085 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
40086
40087 APInt KnownUndef, KnownZero;
40088 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
40089 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
40090 KnownZero, TLO, Depth + 1))
40091 return true;
40092
40093 KnownBits KnownVec;
40094 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
40095 KnownVec, TLO, Depth + 1))
40096 return true;
40097
40098 if (SDValue V = SimplifyMultipleUseDemandedBits(
40099 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
40100 return TLO.CombineTo(
40101 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
40102
40103 Known = KnownVec.zext(BitWidth);
40104 return false;
40105 }
40106 break;
40107 }
40108 case X86ISD::PINSRB:
40109 case X86ISD::PINSRW: {
40110 SDValue Vec = Op.getOperand(0);
40111 SDValue Scl = Op.getOperand(1);
40112 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
40113 MVT VecVT = Vec.getSimpleValueType();
40114
40115 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
40116 unsigned Idx = CIdx->getZExtValue();
40117 if (!OriginalDemandedElts[Idx])
40118 return TLO.CombineTo(Op, Vec);
40119
40120 KnownBits KnownVec;
40121 APInt DemandedVecElts(OriginalDemandedElts);
40122 DemandedVecElts.clearBit(Idx);
40123 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
40124 KnownVec, TLO, Depth + 1))
40125 return true;
40126
40127 KnownBits KnownScl;
40128 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
40129 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
40130 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
40131 return true;
40132
40133 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
40134 Known = KnownBits::commonBits(KnownVec, KnownScl);
40135 return false;
40136 }
40137 break;
40138 }
40139 case X86ISD::PACKSS:
40140 // PACKSS saturates to MIN/MAX integer values. So if we just want the
40141     // sign bit then we can just ask for the source operands' sign bits.
40142 // TODO - add known bits handling.
40143 if (OriginalDemandedBits.isSignMask()) {
40144 APInt DemandedLHS, DemandedRHS;
40145 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
40146
40147 KnownBits KnownLHS, KnownRHS;
40148 APInt SignMask = APInt::getSignMask(BitWidth * 2);
40149 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
40150 KnownLHS, TLO, Depth + 1))
40151 return true;
40152 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
40153 KnownRHS, TLO, Depth + 1))
40154 return true;
40155
40156 // Attempt to avoid multi-use ops if we don't need anything from them.
40157 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
40158 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
40159 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
40160 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
40161 if (DemandedOp0 || DemandedOp1) {
40162 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
40163 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
40164 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
40165 }
40166 }
40167 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
40168 break;
40169 case X86ISD::VBROADCAST: {
40170 SDValue Src = Op.getOperand(0);
40171 MVT SrcVT = Src.getSimpleValueType();
40172 APInt DemandedElts = APInt::getOneBitSet(
40173 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
40174 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
40175 TLO, Depth + 1))
40176 return true;
40177 // If we don't need the upper bits, attempt to narrow the broadcast source.
40178 // Don't attempt this on AVX512 as it might affect broadcast folding.
40179 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
40180 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
40181 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
40182 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
40183 SDValue NewSrc =
40184 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
40185 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
40186 SDValue NewBcst =
40187 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
40188 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
40189 }
40190 break;
40191 }
40192 case X86ISD::PCMPGT:
40193 // icmp sgt(0, R) == ashr(R, BitWidth-1).
40194 // iff we only need the sign bit then we can use R directly.
40195 if (OriginalDemandedBits.isSignMask() &&
40196 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
40197 return TLO.CombineTo(Op, Op.getOperand(1));
40198 break;
40199 case X86ISD::MOVMSK: {
40200 SDValue Src = Op.getOperand(0);
40201 MVT SrcVT = Src.getSimpleValueType();
40202 unsigned SrcBits = SrcVT.getScalarSizeInBits();
40203 unsigned NumElts = SrcVT.getVectorNumElements();
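          // MOVMSK packs the sign bit of source element i into result bit i
          // and zeroes the remaining high bits, so demanded result bits map
          // directly to demanded vector elements.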
40204
40205 // If we don't need the sign bits at all just return zero.
40206 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
40207 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
40208
40209 // Only demand the vector elements of the sign bits we need.
40210 APInt KnownUndef, KnownZero;
40211 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
40212 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
40213 TLO, Depth + 1))
40214 return true;
40215
40216 Known.Zero = KnownZero.zextOrSelf(BitWidth);
40217 Known.Zero.setHighBits(BitWidth - NumElts);
40218
40219 // MOVMSK only uses the MSB from each vector element.
40220 KnownBits KnownSrc;
40221 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
40222 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
40223 Depth + 1))
40224 return true;
40225
40226 if (KnownSrc.One[SrcBits - 1])
40227 Known.One.setLowBits(NumElts);
40228 else if (KnownSrc.Zero[SrcBits - 1])
40229 Known.Zero.setLowBits(NumElts);
40230
40231     // Attempt to avoid a multi-use op if we don't need anything from it.
40232 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
40233 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
40234 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
40235 return false;
40236 }
40237 case X86ISD::BEXTR:
40238 case X86ISD::BEXTRI: {
40239 SDValue Op0 = Op.getOperand(0);
40240 SDValue Op1 = Op.getOperand(1);
40241
40242 // Only bottom 16-bits of the control bits are required.
40243 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
40244 // NOTE: SimplifyDemandedBits won't do this for constants.
40245 uint64_t Val1 = Cst1->getZExtValue();
40246 uint64_t MaskedVal1 = Val1 & 0xFFFF;
40247 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
40248 SDLoc DL(Op);
40249 return TLO.CombineTo(
40250 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
40251 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
40252 }
40253
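            // The BEXTR control operand encodes the start bit in its low byte
            // and the extraction length in the next byte.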
40254 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
40255 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
40256
40257 // If the length is 0, the result is 0.
40258 if (Length == 0) {
40259 Known.setAllZero();
40260 return false;
40261 }
40262
40263 if ((Shift + Length) <= BitWidth) {
40264 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
40265 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
40266 return true;
40267
40268 Known = Known.extractBits(Length, Shift);
40269 Known = Known.zextOrTrunc(BitWidth);
40270 return false;
40271 }
40272 } else {
40273       assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
40274 KnownBits Known1;
40275 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
40276 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
40277 return true;
40278
40279 // If the length is 0, replace with 0.
40280 KnownBits LengthBits = Known1.extractBits(8, 8);
40281 if (LengthBits.isZero())
40282 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
40283 }
40284
40285 break;
40286 }
40287 case X86ISD::PDEP: {
40288 SDValue Op0 = Op.getOperand(0);
40289 SDValue Op1 = Op.getOperand(1);
40290
40291 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
40292 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
40293
40294     // If the demanded bits have leading zeroes, we don't demand those from the
40295 // mask.
40296 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
40297 return true;
40298
40299 // The number of possible 1s in the mask determines the number of LSBs of
40300 // operand 0 used. Undemanded bits from the mask don't matter so filter
40301 // them before counting.
40302 KnownBits Known2;
40303 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
40304 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
40305 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
40306 return true;
40307
40308 // Zeroes are retained from the mask, but not ones.
40309 Known.One.clearAllBits();
40310 // The result will have at least as many trailing zeros as the non-mask
40311 // operand since bits can only map to the same or higher bit position.
40312 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
40313 return false;
40314 }
40315 }
40316
40317 return TargetLowering::SimplifyDemandedBitsForTargetNode(
40318 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
40319}
40320
40321SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
40322 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
40323 SelectionDAG &DAG, unsigned Depth) const {
40324 int NumElts = DemandedElts.getBitWidth();
40325 unsigned Opc = Op.getOpcode();
40326 EVT VT = Op.getValueType();
40327
40328 switch (Opc) {
40329 case X86ISD::PINSRB:
40330 case X86ISD::PINSRW: {
40331 // If we don't demand the inserted element, return the base vector.
40332 SDValue Vec = Op.getOperand(0);
40333 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
40334 MVT VecVT = Vec.getSimpleValueType();
40335 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
40336 !DemandedElts[CIdx->getZExtValue()])
40337 return Vec;
40338 break;
40339 }
40340 case X86ISD::VSHLI: {
40341 // If we are only demanding sign bits then we can use the shift source
40342 // directly.
40343 SDValue Op0 = Op.getOperand(0);
40344 unsigned ShAmt = Op.getConstantOperandVal(1);
40345 unsigned BitWidth = DemandedBits.getBitWidth();
40346 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
40347 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
40348 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
40349 return Op0;
40350 break;
40351 }
40352 case X86ISD::VSRAI:
40353 // iff we only need the sign bit then we can use the source directly.
40354 // TODO: generalize where we only demand extended signbits.
40355 if (DemandedBits.isSignMask())
40356 return Op.getOperand(0);
40357 break;
40358 case X86ISD::PCMPGT:
40359 // icmp sgt(0, R) == ashr(R, BitWidth-1).
40360 // iff we only need the sign bit then we can use R directly.
40361 if (DemandedBits.isSignMask() &&
40362 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
40363 return Op.getOperand(1);
40364 break;
40365 }
40366
40367 APInt ShuffleUndef, ShuffleZero;
40368 SmallVector<int, 16> ShuffleMask;
40369 SmallVector<SDValue, 2> ShuffleOps;
40370 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
40371 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
40372 // If all the demanded elts are from one operand and are inline,
40373 // then we can use the operand directly.
40374 int NumOps = ShuffleOps.size();
40375 if (ShuffleMask.size() == (unsigned)NumElts &&
40376 llvm::all_of(ShuffleOps, [VT](SDValue V) {
40377 return VT.getSizeInBits() == V.getValueSizeInBits();
40378 })) {
40379
40380 if (DemandedElts.isSubsetOf(ShuffleUndef))
40381 return DAG.getUNDEF(VT);
40382 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
40383 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
40384
40385 // Bitmask that indicates which ops have only been accessed 'inline'.
40386 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
40387 for (int i = 0; i != NumElts; ++i) {
40388 int M = ShuffleMask[i];
40389 if (!DemandedElts[i] || ShuffleUndef[i])
40390 continue;
40391 int OpIdx = M / NumElts;
40392 int EltIdx = M % NumElts;
40393 if (M < 0 || EltIdx != i) {
40394 IdentityOp.clearAllBits();
40395 break;
40396 }
40397 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
40398 if (IdentityOp == 0)
40399 break;
40400 }
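            // After the loop IdentityOp has at most one bit set: the single
            // source whose demanded elements are all read from their original
            // positions, i.e. the shuffle acts as an identity of that source.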
40401       assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
40402              "Multiple identity shuffles detected");
40403
40404 if (IdentityOp != 0)
40405 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
40406 }
40407 }
40408
40409 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
40410 Op, DemandedBits, DemandedElts, DAG, Depth);
40411}
40412
40413// Helper to peek through bitops/trunc/setcc to determine size of source vector.
40414// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
40415static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
40416 bool AllowTruncate) {
40417 switch (Src.getOpcode()) {
40418 case ISD::TRUNCATE:
40419 if (!AllowTruncate)
40420 return false;
40421     LLVM_FALLTHROUGH;
40422 case ISD::SETCC:
40423 return Src.getOperand(0).getValueSizeInBits() == Size;
40424 case ISD::AND:
40425 case ISD::XOR:
40426 case ISD::OR:
40427 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
40428 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
40429 }
40430 return false;
40431}
40432
40433// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
40434static unsigned getAltBitOpcode(unsigned Opcode) {
40435 switch(Opcode) {
40436 case ISD::AND: return X86ISD::FAND;
40437 case ISD::OR: return X86ISD::FOR;
40438 case ISD::XOR: return X86ISD::FXOR;
40439 case X86ISD::ANDNP: return X86ISD::FANDN;
40440 }
40441 llvm_unreachable("Unknown bitwise opcode")__builtin_unreachable();
40442}
40443
40444// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
40445static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
40446 const SDLoc &DL) {
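        // SSE1 only provides MOVMSKPS on v4f32, so re-express the v4i1 source
        // as v4f32 values combined with the FP logic ops that MOVMSKPS can
        // consume.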
40447 EVT SrcVT = Src.getValueType();
40448 if (SrcVT != MVT::v4i1)
40449 return SDValue();
40450
40451 switch (Src.getOpcode()) {
40452 case ISD::SETCC:
40453 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
40454 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
40455 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
40456 SDValue Op0 = Src.getOperand(0);
40457 if (ISD::isNormalLoad(Op0.getNode()))
40458 return DAG.getBitcast(MVT::v4f32, Op0);
40459 if (Op0.getOpcode() == ISD::BITCAST &&
40460 Op0.getOperand(0).getValueType() == MVT::v4f32)
40461 return Op0.getOperand(0);
40462 }
40463 break;
40464 case ISD::AND:
40465 case ISD::XOR:
40466 case ISD::OR: {
40467 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
40468 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
40469 if (Op0 && Op1)
40470 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
40471 Op1);
40472 break;
40473 }
40474 }
40475 return SDValue();
40476}
40477
40478// Helper to push sign extension of vXi1 SETCC result through bitops.
40479static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
40480 SDValue Src, const SDLoc &DL) {
40481 switch (Src.getOpcode()) {
40482 case ISD::SETCC:
40483 case ISD::TRUNCATE:
40484 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
40485 case ISD::AND:
40486 case ISD::XOR:
40487 case ISD::OR:
40488 return DAG.getNode(
40489 Src.getOpcode(), DL, SExtVT,
40490 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
40491 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
40492 }
40493 llvm_unreachable("Unexpected node type for vXi1 sign extension")__builtin_unreachable();
40494}
40495
40496// Try to match patterns such as
40497// (i16 bitcast (v16i1 x))
40498// ->
40499 // (i16 movmsk (v16i8 sext (v16i1 x)))
40500// before the illegal vector is scalarized on subtargets that don't have legal
40501// vxi1 types.
40502static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
40503 const SDLoc &DL,
40504 const X86Subtarget &Subtarget) {
40505 EVT SrcVT = Src.getValueType();
40506 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
40507 return SDValue();
40508
40509 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
40510 // legalization destroys the v4i32 type.
40511 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
40512 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
40513 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
40514 DAG.getBitcast(MVT::v4f32, V));
40515 return DAG.getZExtOrTrunc(V, DL, VT);
40516 }
40517 }
40518
40519   // If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
40520 // movmskb even with avx512. This will be better than truncating to vXi1 and
40521 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
40522 // vpcmpeqb/vpcmpgtb.
40523 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
40524 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
40525 Src.getOperand(0).getValueType() == MVT::v32i8 ||
40526 Src.getOperand(0).getValueType() == MVT::v64i8);
40527
40528 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
40529 // directly with vpmovmskb/vmovmskps/vmovmskpd.
40530 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
40531 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
40532 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
40533 EVT CmpVT = Src.getOperand(0).getValueType();
40534 EVT EltVT = CmpVT.getVectorElementType();
40535 if (CmpVT.getSizeInBits() <= 256 &&
40536 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
40537 PreferMovMsk = true;
40538 }
40539
40540 // With AVX512 vxi1 types are legal and we prefer using k-regs.
40541 // MOVMSK is supported in SSE2 or later.
40542 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
40543 return SDValue();
40544
40545 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
40546 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
40547 // v8i16 and v16i16.
40548 // For these two cases, we can shuffle the upper element bytes to a
40549 // consecutive sequence at the start of the vector and treat the results as
40550 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
40551 // for v16i16 this is not the case, because the shuffle is expensive, so we
40552 // avoid sign-extending to this type entirely.
40553 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
40554 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
40555 MVT SExtVT;
40556 bool PropagateSExt = false;
40557 switch (SrcVT.getSimpleVT().SimpleTy) {
40558 default:
40559 return SDValue();
40560 case MVT::v2i1:
40561 SExtVT = MVT::v2i64;
40562 break;
40563 case MVT::v4i1:
40564 SExtVT = MVT::v4i32;
40565 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
40566 // sign-extend to a 256-bit operation to avoid truncation.
40567 if (Subtarget.hasAVX() &&
40568 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
40569 SExtVT = MVT::v4i64;
40570 PropagateSExt = true;
40571 }
40572 break;
40573 case MVT::v8i1:
40574 SExtVT = MVT::v8i16;
40575 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
40576 // sign-extend to a 256-bit operation to match the compare.
40577 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
40578 // 256-bit because the shuffle is cheaper than sign extending the result of
40579 // the compare.
40580 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
40581 checkBitcastSrcVectorSize(Src, 512, true))) {
40582 SExtVT = MVT::v8i32;
40583 PropagateSExt = true;
40584 }
40585 break;
40586 case MVT::v16i1:
40587 SExtVT = MVT::v16i8;
40588 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
40589 // it is not profitable to sign-extend to 256-bit because this will
40590 // require an extra cross-lane shuffle which is more expensive than
40591 // truncating the result of the compare to 128-bits.
40592 break;
40593 case MVT::v32i1:
40594 SExtVT = MVT::v32i8;
40595 break;
40596 case MVT::v64i1:
40597     // If we have AVX512F but not AVX512BW, and the input is truncated from
40598     // v64i8 (checked earlier), then split the input and make two pmovmskbs.
40599 if (Subtarget.hasAVX512()) {
40600 if (Subtarget.hasBWI())
40601 return SDValue();
40602 SExtVT = MVT::v64i8;
40603 break;
40604 }
40605 // Split if this is a <64 x i8> comparison result.
40606 if (checkBitcastSrcVectorSize(Src, 512, false)) {
40607 SExtVT = MVT::v64i8;
40608 break;
40609 }
40610 return SDValue();
40611   }
40612
40613 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
40614 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
40615
40616 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
40617 V = getPMOVMSKB(DL, V, DAG, Subtarget);
40618 } else {
40619 if (SExtVT == MVT::v8i16)
40620 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
40621 DAG.getUNDEF(MVT::v8i16));
40622 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
40623 }
40624
40625 EVT IntVT =
40626 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
40627 V = DAG.getZExtOrTrunc(V, DL, IntVT);
40628 return DAG.getBitcast(VT, V);
40629}
40630
40631// Convert a vXi1 constant build vector to the same width scalar integer.
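      // e.g. a (v4i1 build_vector 1,0,1,1) becomes the 4-bit constant 0b1101,
      // with bit Idx of the integer taken from build-vector operand Idx.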
40632static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
40633 EVT SrcVT = Op.getValueType();
40634   assert(SrcVT.getVectorElementType() == MVT::i1 &&
40635          "Expected a vXi1 vector");
40636   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
40637          "Expected a constant build vector");
40638
40639 APInt Imm(SrcVT.getVectorNumElements(), 0);
40640 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
40641 SDValue In = Op.getOperand(Idx);
40642 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
40643 Imm.setBit(Idx);
40644 }
40645 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
40646 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
40647}
40648
40649static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
40650 TargetLowering::DAGCombinerInfo &DCI,
40651 const X86Subtarget &Subtarget) {
40652   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
40653
40654 if (!DCI.isBeforeLegalizeOps())
40655 return SDValue();
40656
40657 // Only do this if we have k-registers.
40658 if (!Subtarget.hasAVX512())
40659 return SDValue();
40660
40661 EVT DstVT = N->getValueType(0);
40662 SDValue Op = N->getOperand(0);
40663 EVT SrcVT = Op.getValueType();
40664
40665 if (!Op.hasOneUse())
40666 return SDValue();
40667
40668 // Look for logic ops.
40669 if (Op.getOpcode() != ISD::AND &&
40670 Op.getOpcode() != ISD::OR &&
40671 Op.getOpcode() != ISD::XOR)
40672 return SDValue();
40673
40674 // Make sure we have a bitcast between mask registers and a scalar type.
40675 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40676 DstVT.isScalarInteger()) &&
40677 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
40678 SrcVT.isScalarInteger()))
40679 return SDValue();
40680
40681 SDValue LHS = Op.getOperand(0);
40682 SDValue RHS = Op.getOperand(1);
40683
40684 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
40685 LHS.getOperand(0).getValueType() == DstVT)
40686 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
40687 DAG.getBitcast(DstVT, RHS));
40688
40689 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
40690 RHS.getOperand(0).getValueType() == DstVT)
40691 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40692 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
40693
40694 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
40695 // Most of these have to move a constant from the scalar domain anyway.
40696 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
40697 RHS = combinevXi1ConstantToInteger(RHS, DAG);
40698 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40699 DAG.getBitcast(DstVT, LHS), RHS);
40700 }
40701
40702 return SDValue();
40703}
40704
40705static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
40706 const X86Subtarget &Subtarget) {
40707 SDLoc DL(BV);
40708 unsigned NumElts = BV->getNumOperands();
40709 SDValue Splat = BV->getSplatValue();
40710
40711 // Build MMX element from integer GPR or SSE float values.
40712 auto CreateMMXElement = [&](SDValue V) {
40713 if (V.isUndef())
40714 return DAG.getUNDEF(MVT::x86mmx);
40715 if (V.getValueType().isFloatingPoint()) {
40716 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
40717 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
40718 V = DAG.getBitcast(MVT::v2i64, V);
40719 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
40720 }
40721 V = DAG.getBitcast(MVT::i32, V);
40722 } else {
40723 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
40724 }
40725 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
40726 };
40727
40728 // Convert build vector ops to MMX data in the bottom elements.
40729 SmallVector<SDValue, 8> Ops;
40730
40731 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40732
40733 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
40734 if (Splat) {
40735 if (Splat.isUndef())
40736 return DAG.getUNDEF(MVT::x86mmx);
40737
40738 Splat = CreateMMXElement(Splat);
40739
40740 if (Subtarget.hasSSE1()) {
40741 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
40742 if (NumElts == 8)
40743 Splat = DAG.getNode(
40744 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40745 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
40746 TLI.getPointerTy(DAG.getDataLayout())),
40747 Splat, Splat);
40748
40749 // Use PSHUFW to repeat 16-bit elements.
40750 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
40751 return DAG.getNode(
40752 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40753 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
40754 TLI.getPointerTy(DAG.getDataLayout())),
40755 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
40756 }
40757 Ops.append(NumElts, Splat);
40758 } else {
40759 for (unsigned i = 0; i != NumElts; ++i)
40760 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
40761 }
40762
40763 // Use tree of PUNPCKLs to build up general MMX vector.
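        // Each pass below combines adjacent pairs with the matching unpack-low
        // intrinsic (bytes, then words, then dwords), halving the number of
        // partial vectors until a single MMX value holds every element.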
40764 while (Ops.size() > 1) {
40765 unsigned NumOps = Ops.size();
40766 unsigned IntrinOp =
40767 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
40768 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
40769 : Intrinsic::x86_mmx_punpcklbw));
40770 SDValue Intrin = DAG.getTargetConstant(
40771 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
40772 for (unsigned i = 0; i != NumOps; i += 2)
40773 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
40774 Ops[i], Ops[i + 1]);
40775 Ops.resize(NumOps / 2);
40776 }
40777
40778 return Ops[0];
40779}
40780
40781// Recursive function that attempts to find if a bool vector node was originally
40782// a vector/float/double that got truncated/extended/bitcast to/from a scalar
40783// integer. If so, replace the scalar ops with bool vector equivalents back down
40784// the chain.
40785static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
40786 SelectionDAG &DAG,
40787 const X86Subtarget &Subtarget) {
40788 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40789 unsigned Opc = V.getOpcode();
40790 switch (Opc) {
40791 case ISD::BITCAST: {
40792 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
40793 SDValue Src = V.getOperand(0);
40794 EVT SrcVT = Src.getValueType();
40795 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
40796 return DAG.getBitcast(VT, Src);
40797 break;
40798 }
40799 case ISD::TRUNCATE: {
40800 // If we find a suitable source, a truncated scalar becomes a subvector.
40801 SDValue Src = V.getOperand(0);
40802 EVT NewSrcVT =
40803 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
40804 if (TLI.isTypeLegal(NewSrcVT))
40805 if (SDValue N0 =
40806 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40807 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
40808 DAG.getIntPtrConstant(0, DL));
40809 break;
40810 }
40811 case ISD::ANY_EXTEND:
40812 case ISD::ZERO_EXTEND: {
40813 // If we find a suitable source, an extended scalar becomes a subvector.
40814 SDValue Src = V.getOperand(0);
40815 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
40816 Src.getScalarValueSizeInBits());
40817 if (TLI.isTypeLegal(NewSrcVT))
40818 if (SDValue N0 =
40819 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40820 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40821 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
40822 : DAG.getConstant(0, DL, VT),
40823 N0, DAG.getIntPtrConstant(0, DL));
40824 break;
40825 }
40826 case ISD::OR: {
40827 // If we find suitable sources, we can just move an OR to the vector domain.
40828 SDValue Src0 = V.getOperand(0);
40829 SDValue Src1 = V.getOperand(1);
40830 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40831 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
40832 return DAG.getNode(Opc, DL, VT, N0, N1);
40833 break;
40834 }
40835 case ISD::SHL: {
40836 // If we find a suitable source, a SHL becomes a KSHIFTL.
40837 SDValue Src0 = V.getOperand(0);
40838 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
40839 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
40840 break;
40841
40842 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
40843 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40844 return DAG.getNode(
40845 X86ISD::KSHIFTL, DL, VT, N0,
40846 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
40847 break;
40848 }
40849 }
40850 return SDValue();
40851}
40852
40853static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
40854 TargetLowering::DAGCombinerInfo &DCI,
40855 const X86Subtarget &Subtarget) {
40856 SDValue N0 = N->getOperand(0);
40857 EVT VT = N->getValueType(0);
40858 EVT SrcVT = N0.getValueType();
40859 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40860
40861 // Try to match patterns such as
40862 // (i16 bitcast (v16i1 x))
40863 // ->
40864   // (i16 movmsk (v16i8 sext (v16i1 x)))
40865 // before the setcc result is scalarized on subtargets that don't have legal
40866 // vxi1 types.
40867 if (DCI.isBeforeLegalize()) {
40868 SDLoc dl(N);
40869 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
40870 return V;
40871
40872 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40873 // type, widen both sides to avoid a trip through memory.
40874 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
40875 Subtarget.hasAVX512()) {
40876 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
40877 N0 = DAG.getBitcast(MVT::v8i1, N0);
40878 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
40879 DAG.getIntPtrConstant(0, dl));
40880 }
40881
40882 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40883 // type, widen both sides to avoid a trip through memory.
40884 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
40885 Subtarget.hasAVX512()) {
40886 // Use zeros for the widening if we already have some zeroes. This can
40887       // allow SimplifyDemandedBits to remove scalar ANDs that may be
40888       // downstream of this.
40889 // FIXME: It might make sense to detect a concat_vectors with a mix of
40890 // zeroes and undef and turn it into insert_subvector for i1 vectors as
40891 // a separate combine. What we can't do is canonicalize the operands of
40892 // such a concat or we'll get into a loop with SimplifyDemandedBits.
40893 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
40894 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
40895 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
40896 SrcVT = LastOp.getValueType();
40897 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40898 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
40899 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
40900 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40901 N0 = DAG.getBitcast(MVT::i8, N0);
40902 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40903 }
40904 }
40905
40906 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40907 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
40908 Ops[0] = N0;
40909 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40910 N0 = DAG.getBitcast(MVT::i8, N0);
40911 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40912 }
40913 } else {
40914 // If we're bitcasting from iX to vXi1, see if the integer originally
40915 // began as a vXi1 and whether we can remove the bitcast entirely.
40916 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
40917 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
40918 if (SDValue V =
40919 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
40920 return V;
40921 }
40922 }
40923
40924 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
40925 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
40926 // due to insert_subvector legalization on KNL. By promoting the copy to i16
40927 // we can help with known bits propagation from the vXi1 domain to the
40928 // scalar domain.
40929 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
40930 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40931 N0.getOperand(0).getValueType() == MVT::v16i1 &&
40932 isNullConstant(N0.getOperand(1)))
40933 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
40934 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
40935
40936 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
40937 // and the vbroadcast_load are both integer or both fp. In some cases this
40938 // will remove the bitcast entirely.
40939 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40940 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40941 auto *BCast = cast<MemIntrinsicSDNode>(N0);
40942 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40943 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
40944     // Don't swap i8/i16 since we don't have fp types of that size.
40945 if (MemSize >= 32) {
40946 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40947 : MVT::getIntegerVT(MemSize);
40948 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40949 : MVT::getIntegerVT(SrcVTSize);
40950 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40951
40952 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40953 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40954 SDValue ResNode =
40955 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40956 MemVT, BCast->getMemOperand());
40957 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40958 return DAG.getBitcast(VT, ResNode);
40959 }
40960 }
40961
40962 // Since MMX types are special and don't usually play with other vector types,
40963 // it's better to handle them early to be sure we emit efficient code by
40964 // avoiding store-load conversions.
40965 if (VT == MVT::x86mmx) {
40966 // Detect MMX constant vectors.
40967 APInt UndefElts;
40968 SmallVector<APInt, 1> EltBits;
40969 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40970 SDLoc DL(N0);
40971 // Handle zero-extension of i32 with MOVD.
40972 if (EltBits[0].countLeadingZeros() >= 32)
40973 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40974 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40975 // Else, bitcast to a double.
40976 // TODO - investigate supporting sext 32-bit immediates on x86_64.
40977 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40978 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40979 }
40980
40981 // Detect bitcasts to x86mmx low word.
40982 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40983 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40984 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40985 bool LowUndef = true, AllUndefOrZero = true;
40986 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40987 SDValue Op = N0.getOperand(i);
40988 LowUndef &= Op.isUndef() || (i >= e/2);
40989 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40990 }
40991 if (AllUndefOrZero) {
40992 SDValue N00 = N0.getOperand(0);
40993 SDLoc dl(N00);
40994 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40995 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40996 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40997 }
40998 }
40999
41000 // Detect bitcasts of 64-bit build vectors and convert to a
41001 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
41002 // lowest element.
41003 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
41004 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
41005 SrcVT == MVT::v8i8))
41006 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
41007
41008 // Detect bitcasts from element or subvector extractions to x86mmx.
41009 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
41010 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
41011 isNullConstant(N0.getOperand(1))) {
41012 SDValue N00 = N0.getOperand(0);
41013 if (N00.getValueType().is128BitVector())
41014 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
41015 DAG.getBitcast(MVT::v2i64, N00));
41016 }
41017
41018 // Detect bitcasts from FP_TO_SINT to x86mmx.
41019 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
41020 SDLoc DL(N0);
41021 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
41022 DAG.getUNDEF(MVT::v2i32));
41023 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
41024 DAG.getBitcast(MVT::v2i64, Res));
41025 }
41026 }
41027
41028 // Try to remove a bitcast of a constant vXi1 vector. We have to legalize
41029 // most of these to scalars anyway.
41030 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
41031 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
41032 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
41033 return combinevXi1ConstantToInteger(N0, DAG);
41034 }
41035
41036 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
41037 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
41038 isa<ConstantSDNode>(N0)) {
41039 auto *C = cast<ConstantSDNode>(N0);
41040 if (C->isAllOnesValue())
41041 return DAG.getConstant(1, SDLoc(N0), VT);
41042 if (C->isNullValue())
41043 return DAG.getConstant(0, SDLoc(N0), VT);
41044 }
41045
41046 // Look for a MOVMSK that is possibly truncated and then bitcast to vXi1.
41047 // Turn it into a sign bit compare that produces a k-register. This avoids
41048 // a trip through a GPR.
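// Illustrative example: a truncated (MOVMSK (v4f32 X)) bitcast to v4i1 becomes
// (setcc (v4i32 (bitcast X)), 0, setlt), producing the mask directly in a
// k-register.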
41049 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
41050 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
41051 isPowerOf2_32(VT.getVectorNumElements())) {
41052 unsigned NumElts = VT.getVectorNumElements();
41053 SDValue Src = N0;
41054
41055 // Peek through truncate.
41056 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
41057 Src = N0.getOperand(0);
41058
41059 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
41060 SDValue MovmskIn = Src.getOperand(0);
41061 MVT MovmskVT = MovmskIn.getSimpleValueType();
41062 unsigned MovMskElts = MovmskVT.getVectorNumElements();
41063
41064 // We allow extra bits of the movmsk to be used since they are known zero.
41065 // We can't convert a VPMOVMSKB without avx512bw.
41066 if (MovMskElts <= NumElts &&
41067 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
41068 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
41069 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
41070 SDLoc dl(N);
41071 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
41072 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
41073 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
41074 if (EVT(CmpVT) == VT)
41075 return Cmp;
41076
41077 // Pad with zeroes up to original VT to replace the zeroes that were
41078 // being used from the MOVMSK.
41079 unsigned NumConcats = NumElts / MovMskElts;
41080 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
41081 Ops[0] = Cmp;
41082 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
41083 }
41084 }
41085 }
41086
41087 // Try to remove bitcasts from input and output of mask arithmetic to
41088 // remove GPR<->K-register crossings.
41089 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
41090 return V;
41091
41092 // Convert a bitcasted integer logic operation that has one bitcasted
41093 // floating-point operand into a floating-point logic operation. This may
41094 // create a load of a constant, but that is cheaper than materializing the
41095 // constant in an integer register and transferring it to an SSE register or
41096 // transferring the SSE operand to an integer register and back.
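// Illustrative example: (f64 (bitcast (xor (i64 (bitcast (f64 X))), C))) becomes
// (FXOR X, (f64 (bitcast C))), keeping the value in an SSE register.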
41097 unsigned FPOpcode;
41098 switch (N0.getOpcode()) {
41099 case ISD::AND: FPOpcode = X86ISD::FAND; break;
41100 case ISD::OR: FPOpcode = X86ISD::FOR; break;
41101 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
41102 default: return SDValue();
41103 }
41104
41105 // Check that the result type supports FP logic ops (or is a legal integer vector type).
41106 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
41107 (Subtarget.hasSSE2() && VT == MVT::f64) ||
41108 (Subtarget.hasFP16() && VT == MVT::f16) ||
41109 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
41110 TLI.isTypeLegal(VT))))
41111 return SDValue();
41112
41113 SDValue LogicOp0 = N0.getOperand(0);
41114 SDValue LogicOp1 = N0.getOperand(1);
41115 SDLoc DL0(N0);
41116
41117 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
41118 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
41119 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
41120 LogicOp0.getOperand(0).getValueType() == VT &&
41121 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
41122 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
41123 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
41124 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
41125 }
41126 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
41127 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
41128 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
41129 LogicOp1.getOperand(0).getValueType() == VT &&
41130 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
41131 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
41132 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
41133 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
41134 }
41135
41136 return SDValue();
41137}
41138
41139 // Given an ABS node, detect the following pattern:
41140// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
41141// This is useful as it is the input into a SAD pattern.
41142static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
41143 SDValue AbsOp1 = Abs->getOperand(0);
41144 if (AbsOp1.getOpcode() != ISD::SUB)
41145 return false;
41146
41147 Op0 = AbsOp1.getOperand(0);
41148 Op1 = AbsOp1.getOperand(1);
41149
41150 // Check if the operands of the sub are zero-extended from vectors of i8.
41151 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
41152 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
41153 Op1.getOpcode() != ISD::ZERO_EXTEND ||
41154 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
41155 return false;
41156
41157 return true;
41158}
41159
41160// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
41161// to these zexts.
41162static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
41163 const SDValue &Zext1, const SDLoc &DL,
41164 const X86Subtarget &Subtarget) {
41165 // Find the appropriate width for the PSADBW.
41166 EVT InVT = Zext0.getOperand(0).getValueType();
41167 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
41168
41169 // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
41170 // fill in the missing vector elements with 0.
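// Illustrative example: a v4i8 input is widened to v16i8 by concatenating it
// with three zero v4i8 vectors before forming the PSADBW.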
41171 unsigned NumConcat = RegSize / InVT.getSizeInBits();
41172 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
41173 Ops[0] = Zext0.getOperand(0);
41174 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
41175 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
41176 Ops[0] = Zext1.getOperand(0);
41177 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
41178
41179 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
41180 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
41181 ArrayRef<SDValue> Ops) {
41182 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
41183 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
41184 };
41185 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
41186 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
41187 PSADBWBuilder);
41188}
41189
41190 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
41191// PHMINPOSUW.
41192static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
41193 const X86Subtarget &Subtarget) {
41194 // Bail without SSE41.
41195 if (!Subtarget.hasSSE41())
41196 return SDValue();
41197
41198 EVT ExtractVT = Extract->getValueType(0);
41199 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
41200 return SDValue();
41201
41202 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
41203 ISD::NodeType BinOp;
41204 SDValue Src = DAG.matchBinOpReduction(
41205 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
41206 if (!Src)
41207 return SDValue();
41208
41209 EVT SrcVT = Src.getValueType();
41210 EVT SrcSVT = SrcVT.getScalarType();
41211 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
41212 return SDValue();
41213
41214 SDLoc DL(Extract);
41215 SDValue MinPos = Src;
41216
41217 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
41218 while (SrcVT.getSizeInBits() > 128) {
41219 SDValue Lo, Hi;
41220 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
41221 SrcVT = Lo.getValueType();
41222 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
41223 }
41224 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
41225 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
41226 "Unexpected value type");
41227
41228 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
41229 // to flip the value accordingly.
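// Illustrative example: for an SMAX v8i16 reduction, XOR with 0x7FFF maps the
// signed order onto the reversed unsigned order, so taking UMIN of (X ^ 0x7FFF)
// via PHMINPOSUW and XOR-ing the result with 0x7FFF again yields SMAX(X).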
41230 SDValue Mask;
41231 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
41232 if (BinOp == ISD::SMAX)
41233 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
41234 else if (BinOp == ISD::SMIN)
41235 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
41236 else if (BinOp == ISD::UMAX)
41237 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
41238
41239 if (Mask)
41240 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
41241
41242 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
41243 // shuffling each upper element down and inserting zeros. This means that the
41244 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
41245 // ready for the PHMINPOS.
41246 if (ExtractVT == MVT::i8) {
41247 SDValue Upper = DAG.getVectorShuffle(
41248 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
41249 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
41250 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
41251 }
41252
41253 // Perform the PHMINPOS on a v8i16 vector.
41254 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
41255 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
41256 MinPos = DAG.getBitcast(SrcVT, MinPos);
41257
41258 if (Mask)
41259 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
41260
41261 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
41262 DAG.getIntPtrConstant(0, DL));
41263}
41264
41265// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
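// Illustrative example: an all_of reduction over v4i1 compare results ends up
// as (movmsk of the compare mask) == 0xF, while an any_of reduction tests the
// movmsk result against zero.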
41266static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
41267 const X86Subtarget &Subtarget) {
41268 // Bail without SSE2.
41269 if (!Subtarget.hasSSE2())
41270 return SDValue();
41271
41272 EVT ExtractVT = Extract->getValueType(0);
41273 unsigned BitWidth = ExtractVT.getSizeInBits();
41274 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
41275 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
41276 return SDValue();
41277
41278 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
41279 ISD::NodeType BinOp;
41280 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
41281 if (!Match && ExtractVT == MVT::i1)
41282 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
41283 if (!Match)
41284 return SDValue();
41285
41286 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
41287 // which we can't support here for now.
41288 if (Match.getScalarValueSizeInBits() != BitWidth)
41289 return SDValue();
41290
41291 SDValue Movmsk;
41292 SDLoc DL(Extract);
41293 EVT MatchVT = Match.getValueType();
41294 unsigned NumElts = MatchVT.getVectorNumElements();
41295 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
41296 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41297
41298 if (ExtractVT == MVT::i1) {
41299 // Special case for (pre-legalization) vXi1 reductions.
41300 if (NumElts > 64 || !isPowerOf2_32(NumElts))
41301 return SDValue();
41302 if (TLI.isTypeLegal(MatchVT)) {
41303 // If this is a legal AVX512 predicate type then we can just bitcast.
41304 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41305 Movmsk = DAG.getBitcast(MovmskVT, Match);
41306 } else {
41307 // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
41308 // PCMPEQQ (SSE41+), use PCMPEQD instead.
41309 if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
41310 Match.getOpcode() == ISD::SETCC &&
41311 ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
41312 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
41313 ISD::CondCode::SETEQ) {
41314 SDValue Vec = Match.getOperand(0);
41315 if (Vec.getValueType().getScalarType() == MVT::i64 &&
41316 (2 * NumElts) <= MaxElts) {
41317 NumElts *= 2;
41318 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
41319 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
41320 Match = DAG.getSetCC(
41321 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
41322 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
41323 }
41324 }
41325
41326 // Use combineBitcastvxi1 to create the MOVMSK.
41327 while (NumElts > MaxElts) {
41328 SDValue Lo, Hi;
41329 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
41330 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
41331 NumElts /= 2;
41332 }
41333 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41334 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
41335 }
41336 if (!Movmsk)
41337 return SDValue();
41338 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
41339 } else {
41340 // FIXME: Better handling of k-registers or 512-bit vectors?
41341 unsigned MatchSizeInBits = Match.getValueSizeInBits();
41342 if (!(MatchSizeInBits == 128 ||
41343 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
41344 return SDValue();
41345
41346 // Make sure this isn't a vector of 1 element. The perf win from using
41347 // MOVMSK diminishes with fewer elements in the reduction, but it is
41348 // generally better to get the comparison over to the GPRs as soon as
41349 // possible to reduce the number of vector ops.
41350 if (Match.getValueType().getVectorNumElements() < 2)
41351 return SDValue();
41352
41353 // Check that we are extracting a reduction of all sign bits.
41354 if (DAG.ComputeNumSignBits(Match) != BitWidth)
41355 return SDValue();
41356
41357 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
41358 SDValue Lo, Hi;
41359 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
41360 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
41361 MatchSizeInBits = Match.getValueSizeInBits();
41362 }
41363
41364 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
41365 MVT MaskSrcVT;
41366 if (64 == BitWidth || 32 == BitWidth)
41367 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
41368 MatchSizeInBits / BitWidth);
41369 else
41370 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
41371
41372 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
41373 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
41374 NumElts = MaskSrcVT.getVectorNumElements();
41375 }
41376 assert((NumElts <= 32 || NumElts == 64) &&
41377 "Not expecting more than 64 elements");
41378
41379 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
41380 if (BinOp == ISD::XOR) {
41381 // parity -> (PARITY(MOVMSK X))
41382 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
41383 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
41384 }
41385
41386 SDValue CmpC;
41387 ISD::CondCode CondCode;
41388 if (BinOp == ISD::OR) {
41389 // any_of -> MOVMSK != 0
41390 CmpC = DAG.getConstant(0, DL, CmpVT);
41391 CondCode = ISD::CondCode::SETNE;
41392 } else {
41393 // all_of -> MOVMSK == ((1 << NumElts) - 1)
41394 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
41395 DL, CmpVT);
41396 CondCode = ISD::CondCode::SETEQ;
41397 }
41398
41399 // The setcc produces an i8 of 0/1, so extend that to the result width and
41400 // negate to get the final 0/-1 mask value.
41401 EVT SetccVT =
41402 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
41403 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
41404 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
41405 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
41406 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
41407}
41408
41409static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
41410 const X86Subtarget &Subtarget) {
41411 // PSADBW is only supported on SSE2 and up.
41412 if (!Subtarget.hasSSE2())
41413 return SDValue();
41414
41415 EVT ExtractVT = Extract->getValueType(0);
41416 // Verify the type we're extracting is either i32 or i64.
41417 // FIXME: Could support other types, but this is what we have coverage for.
41418 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
41419 return SDValue();
41420
41421 EVT VT = Extract->getOperand(0).getValueType();
41422 if (!isPowerOf2_32(VT.getVectorNumElements()))
41423 return SDValue();
41424
41425 // Match shuffle + add pyramid.
41426 ISD::NodeType BinOp;
41427 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
41428
41429 // The operand is expected to be zero extended from i8
41430 // (verified in detectZextAbsDiff).
41431 // In order to convert to i64 and above, an additional any/zero/sign
41432 // extend is expected.
41433 // The zero extend from 32 bits has no mathematical effect on the result.
41434 // Also, the sign extend is effectively a zero extend here
41435 // (it extends the sign bit, which is zero).
41436 // So it is correct to skip the sign/zero extend instruction.
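// Illustrative example: a reduction that accumulates in i64 may wrap the i32
// abs-diff sum in a zero_extend; that extend can be skipped because the upper
// bits are already known zero.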
41437 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
41438 Root.getOpcode() == ISD::ZERO_EXTEND ||
41439 Root.getOpcode() == ISD::ANY_EXTEND))
41440 Root = Root.getOperand(0);
41441
41442 // If there was a match, we want Root to be the ABS node at the root of an
41443 // abs-diff pattern.
41444 if (!Root || Root.getOpcode() != ISD::ABS)
41445 return SDValue();
41446
41447 // Check whether we have an abs-diff pattern feeding into the select.
41448 SDValue Zext0, Zext1;
41449 if (!detectZextAbsDiff(Root, Zext0, Zext1))
41450 return SDValue();
41451
41452 // Create the SAD instruction.
41453 SDLoc DL(Extract);
41454 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
41455
41456 // If the original vector was wider than 8 elements, sum over the results
41457 // in the SAD vector.
41458 unsigned Stages = Log2_32(VT.getVectorNumElements());
41459 EVT SadVT = SAD.getValueType();
41460 if (Stages > 3) {
41461 unsigned SadElems = SadVT.getVectorNumElements();
41462
41463 for (unsigned i = Stages - 3; i > 0; --i) {
41464 SmallVector<int, 16> Mask(SadElems, -1);
41465 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
41466 Mask[j] = MaskEnd + j;
41467
41468 SDValue Shuffle =
41469 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
41470 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
41471 }
41472 }
41473
41474 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
41475 // Return the lowest ExtractSizeInBits bits.
41476 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
41477 SadVT.getSizeInBits() / ExtractSizeInBits);
41478 SAD = DAG.getBitcast(ResVT, SAD);
41479 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
41480 Extract->getOperand(1));
41481}
41482
41483// Attempt to peek through a target shuffle and extract the scalar from the
41484// source.
41485static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
41486 TargetLowering::DAGCombinerInfo &DCI,
41487 const X86Subtarget &Subtarget) {
41488 if (DCI.isBeforeLegalizeOps())
41489 return SDValue();
41490
41491 SDLoc dl(N);
41492 SDValue Src = N->getOperand(0);
41493 SDValue Idx = N->getOperand(1);
41494
41495 EVT VT = N->getValueType(0);
41496 EVT SrcVT = Src.getValueType();
41497 EVT SrcSVT = SrcVT.getVectorElementType();
41498 unsigned SrcEltBits = SrcSVT.getSizeInBits();
41499 unsigned NumSrcElts = SrcVT.getVectorNumElements();
41500
41501 // Don't attempt this for boolean mask vectors or unknown extraction indices.
41502 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
41503 return SDValue();
41504
41505 const APInt &IdxC = N->getConstantOperandAPInt(1);
41506 if (IdxC.uge(NumSrcElts))
41507 return SDValue();
41508
41509 SDValue SrcBC = peekThroughBitcasts(Src);
41510
41511 // Handle extract(bitcast(broadcast(scalar_value))).
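// Illustrative example: (i32 (extract_vector_elt (v4i32 (bitcast
// (v2i64 (VBROADCAST (i64 S))))), 0)) reduces to (i32 (trunc S)).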
41512 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
41513 SDValue SrcOp = SrcBC.getOperand(0);
41514 EVT SrcOpVT = SrcOp.getValueType();
41515 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
41516 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
41517 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
41518 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
41519 // TODO support non-zero offsets.
41520 if (Offset == 0) {
41521 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
41522 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
41523 return SrcOp;
41524 }
41525 }
41526 }
41527
41528 // If we're extracting a single element from a broadcast load and there are
41529 // no other users, just create a single load.
41530 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
41531 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
41532 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
41533 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
41534 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
41535 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
41536 MemIntr->getBasePtr(),
41537 MemIntr->getPointerInfo(),
41538 MemIntr->getOriginalAlign(),
41539 MemIntr->getMemOperand()->getFlags());
41540 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41541 return Load;
41542 }
41543 }
41544
41545 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
41546 // TODO: Move to DAGCombine?
41547 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
41548 SrcBC.getValueType().isInteger() &&
41549 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
41550 SrcBC.getScalarValueSizeInBits() ==
41551 SrcBC.getOperand(0).getValueSizeInBits()) {
41552 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
41553 if (IdxC.ult(Scale)) {
41554 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
41555 SDValue Scl = SrcBC.getOperand(0);
41556 EVT SclVT = Scl.getValueType();
41557 if (Offset) {
41558 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
41559 DAG.getShiftAmountConstant(Offset, SclVT, dl));
41560 }
41561 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
41562 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
41563 return Scl;
41564 }
41565 }
41566
41567 // Handle extract(truncate(x)) for 0'th index.
41568 // TODO: Treat this as a faux shuffle?
41569 // TODO: When can we use this for general indices?
41570 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
41571 (SrcVT.getSizeInBits() % 128) == 0) {
41572 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
41573 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
41574 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
41575 Idx);
41576 }
41577
41578 // We can only legally extract other elements from 128-bit vectors and in
41579 // certain circumstances, depending on SSE-level.
41580 // TODO: Investigate float/double extraction if it will be just stored.
41581 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
41582 unsigned Idx) {
41583 EVT VecSVT = VecVT.getScalarType();
41584 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
41585 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
41586 VecSVT == MVT::i64)) {
41587 unsigned EltSizeInBits = VecSVT.getSizeInBits();
41588 unsigned NumEltsPerLane = 128 / EltSizeInBits;
41589 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
41590 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
41591 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
41592 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
41593 Idx &= (NumEltsPerLane - 1);
41594 }
41595 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
41596 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
41597 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
41598 DAG.getBitcast(VecVT, Vec),
41599 DAG.getIntPtrConstant(Idx, dl));
41600 }
41601 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
41602 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
41603 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
41604 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
41605 DAG.getTargetConstant(Idx, dl, MVT::i8));
41606 }
41607 return SDValue();
41608 };
41609
41610 // Resolve the target shuffle inputs and mask.
41611 SmallVector<int, 16> Mask;
41612 SmallVector<SDValue, 2> Ops;
41613 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
41614 return SDValue();
41615
41616 // Shuffle inputs must be the same size as the result.
41617 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
41618 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
41619 }))
41620 return SDValue();
41621
41622 // Attempt to narrow/widen the shuffle mask to the correct size.
41623 if (Mask.size() != NumSrcElts) {
41624 if ((NumSrcElts % Mask.size()) == 0) {
41625 SmallVector<int, 16> ScaledMask;
41626 int Scale = NumSrcElts / Mask.size();
41627 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
41628 Mask = std::move(ScaledMask);
41629 } else if ((Mask.size() % NumSrcElts) == 0) {
41630 // Simplify Mask based on demanded element.
41631 int ExtractIdx = (int)IdxC.getZExtValue();
41632 int Scale = Mask.size() / NumSrcElts;
41633 int Lo = Scale * ExtractIdx;
41634 int Hi = Scale * (ExtractIdx + 1);
41635 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
41636 if (i < Lo || Hi <= i)
41637 Mask[i] = SM_SentinelUndef;
41638
41639 SmallVector<int, 16> WidenedMask;
41640 while (Mask.size() > NumSrcElts &&
41641 canWidenShuffleElements(Mask, WidenedMask))
41642 Mask = std::move(WidenedMask);
41643 }
41644 }
41645
41646 // If narrowing/widening failed, see if we can extract+zero-extend.
41647 int ExtractIdx;
41648 EVT ExtractVT;
41649 if (Mask.size() == NumSrcElts) {
41650 ExtractIdx = Mask[IdxC.getZExtValue()];
41651 ExtractVT = SrcVT;
41652 } else {
41653 unsigned Scale = Mask.size() / NumSrcElts;
41654 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
41655 return SDValue();
41656 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
41657 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
41658 return SDValue();
41659 ExtractIdx = Mask[ScaledIdx];
41660 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
41661 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
41662 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
41663 "Failed to widen vector type");
41664 }
41665
41666 // If the shuffle source element is undef/zero then we can just accept it.
41667 if (ExtractIdx == SM_SentinelUndef)
41668 return DAG.getUNDEF(VT);
41669
41670 if (ExtractIdx == SM_SentinelZero)
41671 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
41672 : DAG.getConstant(0, dl, VT);
41673
41674 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
41675 ExtractIdx = ExtractIdx % Mask.size();
41676 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
41677 return DAG.getZExtOrTrunc(V, dl, VT);
41678
41679 return SDValue();
41680}
41681
41682/// Extracting a scalar FP value from vector element 0 is free, so extract each
41683/// operand first, then perform the math as a scalar op.
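/// Illustrative example: (extract_vector_elt (fadd X, Y), 0) becomes
/// (fadd (extract_vector_elt X, 0), (extract_vector_elt Y, 0)).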
41684static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
41685 const X86Subtarget &Subtarget) {
41686 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
41687 SDValue Vec = ExtElt->getOperand(0);
41688 SDValue Index = ExtElt->getOperand(1);
41689 EVT VT = ExtElt->getValueType(0);
41690 EVT VecVT = Vec.getValueType();
41691
41692 // TODO: If this is a unary/expensive/expand op, allow extraction from a
41693 // non-zero element because the shuffle+scalar op will be cheaper?
41694 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
41695 return SDValue();
41696
41697 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
41698 // extract, the condition code), so deal with those as a special-case.
41699 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
41700 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
41701 if (OpVT != MVT::f32 && OpVT != MVT::f64)
41702 return SDValue();
41703
41704 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
41705 SDLoc DL(ExtElt);
41706 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41707 Vec.getOperand(0), Index);
41708 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41709 Vec.getOperand(1), Index);
41710 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
41711 }
41712
41713 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
41714 VT != MVT::f64)
41715 return SDValue();
41716
41717 // Vector FP selects don't fit the pattern of FP math ops (because the
41718 // condition has a different type and we have to change the opcode), so deal
41719 // with those here.
41720 // FIXME: This is restricted to pre type legalization by ensuring the setcc
41721 // has i1 elements. If we loosen this we need to convert vector bool to a
41722 // scalar bool.
41723 if (Vec.getOpcode() == ISD::VSELECT &&
41724 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
41725 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
41726 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
41727 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
41728 SDLoc DL(ExtElt);
41729 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
41730 Vec.getOperand(0).getValueType().getScalarType(),
41731 Vec.getOperand(0), Index);
41732 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41733 Vec.getOperand(1), Index);
41734 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41735 Vec.getOperand(2), Index);
41736 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
41737 }
41738
41739 // TODO: This switch could include FNEG and the x86-specific FP logic ops
41740 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
41741 // missed load folding and fma+fneg combining.
41742 switch (Vec.getOpcode()) {
41743 case ISD::FMA: // Begin 3 operands
41744 case ISD::FMAD:
41745 case ISD::FADD: // Begin 2 operands
41746 case ISD::FSUB:
41747 case ISD::FMUL:
41748 case ISD::FDIV:
41749 case ISD::FREM:
41750 case ISD::FCOPYSIGN:
41751 case ISD::FMINNUM:
41752 case ISD::FMAXNUM:
41753 case ISD::FMINNUM_IEEE:
41754 case ISD::FMAXNUM_IEEE:
41755 case ISD::FMAXIMUM:
41756 case ISD::FMINIMUM:
41757 case X86ISD::FMAX:
41758 case X86ISD::FMIN:
41759 case ISD::FABS: // Begin 1 operand
41760 case ISD::FSQRT:
41761 case ISD::FRINT:
41762 case ISD::FCEIL:
41763 case ISD::FTRUNC:
41764 case ISD::FNEARBYINT:
41765 case ISD::FROUND:
41766 case ISD::FFLOOR:
41767 case X86ISD::FRCP:
41768 case X86ISD::FRSQRT: {
41769 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
41770 SDLoc DL(ExtElt);
41771 SmallVector<SDValue, 4> ExtOps;
41772 for (SDValue Op : Vec->ops())
41773 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
41774 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
41775 }
41776 default:
41777 return SDValue();
41778 }
41779 llvm_unreachable("All opcodes should return within switch");
41780}
41781
41782/// Try to convert a vector reduction sequence composed of binops and shuffles
41783/// into horizontal ops.
41784static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
41785 const X86Subtarget &Subtarget) {
41786 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
41787
41788 // We need at least SSE2 to do anything here.
41789 if (!Subtarget.hasSSE2())
41790 return SDValue();
41791
41792 ISD::NodeType Opc;
41793 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
41794 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
41795 if (!Rdx)
41796 return SDValue();
41797
41798 SDValue Index = ExtElt->getOperand(1);
41799 assert(isNullConstant(Index) &&
41800 "Reduction doesn't end in an extract from index 0");
41801
41802 EVT VT = ExtElt->getValueType(0);
41803 EVT VecVT = Rdx.getValueType();
41804 if (VecVT.getScalarType() != VT)
41805 return SDValue();
41806
41807 SDLoc DL(ExtElt);
41808
41809 // vXi8 mul reduction - promote to vXi16 mul reduction.
41810 if (Opc == ISD::MUL) {
41811 unsigned NumElts = VecVT.getVectorNumElements();
41812 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
41813 return SDValue();
41814 if (VecVT.getSizeInBits() >= 128) {
41815 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
41816 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41817 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41818 Lo = DAG.getBitcast(WideVT, Lo);
41819 Hi = DAG.getBitcast(WideVT, Hi);
41820 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
41821 while (Rdx.getValueSizeInBits() > 128) {
41822 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41823 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
41824 }
41825 } else {
41826 if (VecVT == MVT::v4i8)
41827 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41828 DAG.getUNDEF(MVT::v4i8));
41829 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41830 DAG.getUNDEF(MVT::v8i8));
41831 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
41832 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
41833 }
41834 if (NumElts >= 8)
41835 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41836 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41837 {4, 5, 6, 7, -1, -1, -1, -1}));
41838 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41839 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41840 {2, 3, -1, -1, -1, -1, -1, -1}));
41841 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41842 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41843 {1, -1, -1, -1, -1, -1, -1, -1}));
41844 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41845 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41846 }
41847
41848 // vXi8 add reduction - sub-128-bit vectors.
41849 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
41850 if (VecVT == MVT::v4i8) {
41851 // Pad with zero.
41852 if (Subtarget.hasSSE41()) {
41853 Rdx = DAG.getBitcast(MVT::i32, Rdx);
41854 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
41855 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
41856 DAG.getIntPtrConstant(0, DL));
41857 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41858 } else {
41859 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41860 DAG.getConstant(0, DL, VecVT));
41861 }
41862 }
41863 if (Rdx.getValueType() == MVT::v8i8) {
41864 // Pad with undef.
41865 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41866 DAG.getUNDEF(MVT::v8i8));
41867 }
41868 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41869 DAG.getConstant(0, DL, MVT::v16i8));
41870 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41871 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41872 }
41873
41874 // Must be a >=128-bit vector with pow2 elements.
41875 if ((VecVT.getSizeInBits() % 128) != 0 ||
41876 !isPowerOf2_32(VecVT.getVectorNumElements()))
41877 return SDValue();
41878
41879 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
41880 if (VT == MVT::i8) {
41881 while (Rdx.getValueSizeInBits() > 128) {
41882 SDValue Lo, Hi;
41883 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41884 VecVT = Lo.getValueType();
41885 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
41886 }
41887 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
41888
41889 SDValue Hi = DAG.getVectorShuffle(
41890 MVT::v16i8, DL, Rdx, Rdx,
41891 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
41892 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
41893 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41894 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
41895 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41896 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41897 }
41898
41899 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
41900 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
41901 return SDValue();
41902
41903 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
41904
41905 // 256-bit horizontal instructions operate on 128-bit chunks rather than
41906 // across the whole vector, so we need an extract + hop preliminary stage.
41907 // This is the only step where the operands of the hop are not the same value.
41908 // TODO: We could extend this to handle 512-bit or even longer vectors.
41909 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
41910 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
41911 unsigned NumElts = VecVT.getVectorNumElements();
41912 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
41913 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
41914 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
41915 VecVT = Rdx.getValueType();
41916 }
41917 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
41918 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
41919 return SDValue();
41920
41921 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
41922 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
41923 for (unsigned i = 0; i != ReductionSteps; ++i)
41924 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
41925
41926 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41927}
41928
41929/// Detect vector gather/scatter index generation and convert it from being a
41930/// bunch of shuffles and extracts into a somewhat faster sequence.
41931/// For i686, the best sequence is apparently storing the value and loading
41932/// scalars back, while for x64 we should use 64-bit extracts and shifts.
41933static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
41934 TargetLowering::DAGCombinerInfo &DCI,
41935 const X86Subtarget &Subtarget) {
41936 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
41937 return NewOp;
41938
41939 SDValue InputVector = N->getOperand(0);
41940 SDValue EltIdx = N->getOperand(1);
41941 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41942
41943 EVT SrcVT = InputVector.getValueType();
41944 EVT VT = N->getValueType(0);
41945 SDLoc dl(InputVector);
41946 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41947 unsigned NumSrcElts = SrcVT.getVectorNumElements();
41948
41949 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41950 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41951
41952 // Integer Constant Folding.
41953 if (CIdx && VT.isInteger()) {
41954 APInt UndefVecElts;
41955 SmallVector<APInt, 16> EltBits;
41956 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41957 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41958 EltBits, true, false)) {
41959 uint64_t Idx = CIdx->getZExtValue();
41960 if (UndefVecElts[Idx])
41961 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41962 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41963 dl, VT);
41964 }
41965 }
41966
41967 if (IsPextr) {
41968 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41969 if (TLI.SimplifyDemandedBits(
41970 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41971 return SDValue(N, 0);
41972
41973 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41974 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41975 InputVector.getOpcode() == X86ISD::PINSRW) &&
41976 InputVector.getOperand(2) == EltIdx) {
41977 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
41978 "Vector type mismatch");
41979 SDValue Scl = InputVector.getOperand(1);
41980 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41981 return DAG.getZExtOrTrunc(Scl, dl, VT);
41982 }
41983
41984 // TODO - Remove this once we can handle the implicit zero-extension of
41985 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41986 // combineBasicSADPattern.
41987 return SDValue();
41988 }
41989
41990 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
41991 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41992 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41993 SDValue MMXSrc = InputVector.getOperand(0);
41994
41995 // The bitcast source is a direct mmx result.
41996 if (MMXSrc.getValueType() == MVT::x86mmx)
41997 return DAG.getBitcast(VT, InputVector);
41998 }
41999
42000 // Detect mmx to i32 conversion through a v2i32 elt extract.
42001 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
42002 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
42003 SDValue MMXSrc = InputVector.getOperand(0);
42004
42005 // The bitcast source is a direct mmx result.
42006 if (MMXSrc.getValueType() == MVT::x86mmx)
42007 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
42008 }
42009
42010 // Check whether this extract is the root of a sum of absolute differences
42011 // pattern. This has to be done here because we really want it to happen
42012 // pre-legalization.
42013 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
42014 return SAD;
42015
42016 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
42017 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
42018 return Cmp;
42019
42020 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
42021 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
42022 return MinMax;
42023
42024 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion, etc.
42025 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
42026 return V;
42027
42028 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
42029 return V;
42030
42031 // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
42032 // and then testing the relevant element.
42033 //
42034 // Note that we only combine extracts on the *same* result number, i.e.
42035 // t0 = merge_values a0, a1, a2, a3
42036 // i1 = extract_vector_elt t0, Constant:i64<2>
42037 // i1 = extract_vector_elt t0, Constant:i64<3>
42038 // but not
42039 // i1 = extract_vector_elt t0:1, Constant:i64<2>
42040 // since the latter would need its own MOVMSK.
42041 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
42042 SmallVector<SDNode *, 16> BoolExtracts;
42043 unsigned ResNo = InputVector.getResNo();
42044 auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
42045 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42046 isa<ConstantSDNode>(Use->getOperand(1)) &&
42047 Use->getOperand(0).getResNo() == ResNo &&
42048 Use->getValueType(0) == MVT::i1) {
42049 BoolExtracts.push_back(Use);
42050 return true;
42051 }
42052 return false;
42053 };
42054 if (all_of(InputVector->uses(), IsBoolExtract) &&
42055 BoolExtracts.size() > 1) {
42056 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
42057 if (SDValue BC =
42058 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
42059 for (SDNode *Use : BoolExtracts) {
42060 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
42061 unsigned MaskIdx = Use->getConstantOperandVal(1);
42062 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
42063 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
42064 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
42065 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
42066 DCI.CombineTo(Use, Res);
42067 }
42068 return SDValue(N, 0);
42069 }
42070 }
42071 }
42072
42073 return SDValue();
42074}
42075
42076/// If a vector select has an operand that is -1 or 0, try to simplify the
42077/// select to a bitwise logic operation.
42078/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
42079static SDValue
42080combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
42081 TargetLowering::DAGCombinerInfo &DCI,
42082 const X86Subtarget &Subtarget) {
42083 SDValue Cond = N->getOperand(0);
42084 SDValue LHS = N->getOperand(1);
42085 SDValue RHS = N->getOperand(2);
42086 EVT VT = LHS.getValueType();
42087 EVT CondVT = Cond.getValueType();
42088 SDLoc DL(N);
42089 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42090
42091 if (N->getOpcode() != ISD::VSELECT)
42092 return SDValue();
42093
42094 assert(CondVT.isVector() && "Vector select expects a vector selector!");
42095
42096 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
42097 // TODO: Can we assert that both operands are not zeros (because that should
42098 // get simplified at node creation time)?
42099 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
42100 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
42101
42102 // If both inputs are 0/undef, create a complete zero vector.
42103 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
42104 if (TValIsAllZeros && FValIsAllZeros) {
42105 if (VT.isFloatingPoint())
42106 return DAG.getConstantFP(0.0, DL, VT);
42107 return DAG.getConstant(0, DL, VT);
42108 }
42109
42110 // To use the condition operand as a bitwise mask, it must have elements that
42111 // are the same size as the select elements. I.e., the condition operand must
42112 // have already been promoted from the IR select condition type <N x i1>.
42113 // Don't check if the types themselves are equal because that excludes
42114 // vector floating-point selects.
42115 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
42116 return SDValue();
42117
42118 // Try to invert the condition if the true value is not all 1s and the false
42119 // value is not all 0s. Only do this if the condition has one use.
42120 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
42121 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
42122 // Check if the selector will be produced by CMPP*/PCMP*.
42123 Cond.getOpcode() == ISD::SETCC &&
42124 // Check if SETCC has already been promoted.
42125 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
42126 CondVT) {
42127 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
42128
42129 if (TValIsAllZeros || FValIsAllOnes) {
42130 SDValue CC = Cond.getOperand(2);
42131 ISD::CondCode NewCC = ISD::getSetCCInverse(
42132 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
42133 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
42134 NewCC);
42135 std::swap(LHS, RHS);
42136 TValIsAllOnes = FValIsAllOnes;
42137 FValIsAllZeros = TValIsAllZeros;
42138 }
42139 }
42140
42141 // Cond value must be 'sign splat' to be converted to a logical op.
42142 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
42143 return SDValue();
42144
42145 // vselect Cond, 111..., 000... -> Cond
42146 if (TValIsAllOnes && FValIsAllZeros)
42147 return DAG.getBitcast(VT, Cond);
42148
42149 if (!TLI.isTypeLegal(CondVT))
42150 return SDValue();
42151
42152 // vselect Cond, 111..., X -> or Cond, X
42153 if (TValIsAllOnes) {
42154 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
42155 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
42156 return DAG.getBitcast(VT, Or);
42157 }
42158
42159 // vselect Cond, X, 000... -> and Cond, X
42160 if (FValIsAllZeros) {
42161 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
42162 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
42163 return DAG.getBitcast(VT, And);
42164 }
42165
42166 // vselect Cond, 000..., X -> andn Cond, X
42167 if (TValIsAllZeros) {
42168 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
42169 SDValue AndN;
42170 // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
42171 if (CondVT.getScalarType() == MVT::i1)
42172 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
42173 CastRHS);
42174 else
42175 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
42176 return DAG.getBitcast(VT, AndN);
42177 }
42178
42179 return SDValue();
42180}
42181
42182/// If both arms of a vector select are concatenated vectors, split the select,
42183/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
42184/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
42185/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
42186static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
42187 const X86Subtarget &Subtarget) {
42188 unsigned Opcode = N->getOpcode();
42189 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
42190 return SDValue();
42191
42192 // TODO: Split 512-bit vectors too?
42193 EVT VT = N->getValueType(0);
42194 if (!VT.is256BitVector())
42195 return SDValue();
42196
42197 // TODO: Split as long as any 2 of the 3 operands are concatenated?
42198 SDValue Cond = N->getOperand(0);
42199 SDValue TVal = N->getOperand(1);
42200 SDValue FVal = N->getOperand(2);
42201 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
42202 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
42203 !collectConcatOps(TVal.getNode(), CatOpsT) ||
42204 !collectConcatOps(FVal.getNode(), CatOpsF))
42205 return SDValue();
42206
42207 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
42208 ArrayRef<SDValue> Ops) {
42209 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
42210 };
42211 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
42212 makeBlend, /*CheckBWI*/ false);
42213}
42214
42215static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
42216 SDValue Cond = N->getOperand(0);
42217 SDValue LHS = N->getOperand(1);
42218 SDValue RHS = N->getOperand(2);
42219 SDLoc DL(N);
42220
42221 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
42222 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
42223 if (!TrueC || !FalseC)
42224 return SDValue();
42225
42226 // Don't do this for crazy integer types.
42227 EVT VT = N->getValueType(0);
42228 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
42229 return SDValue();
42230
42231 // We're going to use the condition bit in math or logic ops. We could allow
42232 // this with a wider condition value (post-legalization it becomes an i8),
42233 // but if nothing is creating selects that late, it doesn't matter.
42234 if (Cond.getValueType() != MVT::i1)
42235 return SDValue();
42236
42237 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
42238 // 3, 5, or 9 with i32/i64, so those get transformed too.
42239 // TODO: For constants that overflow or do not differ by power-of-2 or small
42240 // multiplier, convert to 'and' + 'add'.
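// Illustrative example: (select Cond, 42, 10) has a difference of 32, so it
// becomes ((zext Cond) * 32) + 10, i.e. a shift and an add rather than a CMOV.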
42241 const APInt &TrueVal = TrueC->getAPIntValue();
42242 const APInt &FalseVal = FalseC->getAPIntValue();
42243 bool OV;
42244 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
42245 if (OV)
42246 return SDValue();
42247
42248 APInt AbsDiff = Diff.abs();
42249 if (AbsDiff.isPowerOf2() ||
42250 ((VT == MVT::i32 || VT == MVT::i64) &&
42251 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
42252
42253 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
42254 // of the condition can usually be folded into a compare predicate, but even
42255 // without that, the sequence should be cheaper than a CMOV alternative.
42256 if (TrueVal.slt(FalseVal)) {
42257 Cond = DAG.getNOT(DL, Cond, MVT::i1);
42258 std::swap(TrueC, FalseC);
42259 }
42260
42261 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
42262 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
42263
42264 // Multiply condition by the difference if non-one.
42265 if (!AbsDiff.isOneValue())
42266 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
42267
42268 // Add the base if non-zero.
42269 if (!FalseC->isNullValue())
42270 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
42271
42272 return R;
42273 }
42274
42275 return SDValue();
42276}
42277
42278/// If this is a *dynamic* select (non-constant condition) and we can match
42279/// this node with one of the variable blend instructions, restructure the
42280/// condition so that blends can use the high (sign) bit of each element.
42281/// This function will also call SimplifyDemandedBits on already created
42282/// BLENDV to perform additional simplifications.
42283static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
42284 TargetLowering::DAGCombinerInfo &DCI,
42285 const X86Subtarget &Subtarget) {
42286 SDValue Cond = N->getOperand(0);
42287 if ((N->getOpcode() != ISD::VSELECT &&
42288 N->getOpcode() != X86ISD::BLENDV) ||
42289 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
42290 return SDValue();
42291
42292 // Don't optimize before the condition has been transformed to a legal type
42293 // and don't ever optimize vector selects that map to AVX512 mask-registers.
42294 unsigned BitWidth = Cond.getScalarValueSizeInBits();
42295 if (BitWidth < 8 || BitWidth > 64)
42296 return SDValue();
42297
42298 // We can only handle the cases where VSELECT is directly legal on the
42299 // subtarget. We custom lower VSELECT nodes with constant conditions and
42300 // this makes it hard to see whether a dynamic VSELECT will correctly
42301 // lower, so we both check the operation's status and explicitly handle the
42302 // cases where a *dynamic* blend will fail even though a constant-condition
42303 // blend could be custom lowered.
42304 // FIXME: We should find a better way to handle this class of problems.
42305 // Potentially, we should combine constant-condition vselect nodes
42306 // pre-legalization into shuffles and not mark as many types as custom
42307 // lowered.
42308 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42309 EVT VT = N->getValueType(0);
42310 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
42311 return SDValue();
42312 // FIXME: We don't support i16-element blends currently. We could and
42313 // should support them by making *all* the bits in the condition be set
42314 // rather than just the high bit and using an i8-element blend.
42315 if (VT.getVectorElementType() == MVT::i16)
42316 return SDValue();
42317 // Dynamic blending was only available from SSE4.1 onward.
42318 if (VT.is128BitVector() && !Subtarget.hasSSE41())
42319 return SDValue();
42320 // Byte blends are only available in AVX2.
42321 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
42322 return SDValue();
42323 // There are no 512-bit blend instructions that use sign bits.
42324 if (VT.is512BitVector())
42325 return SDValue();
42326
42327 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
42328 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
42329 UI != UE; ++UI)
42330 if ((UI->getOpcode() != ISD::VSELECT &&
42331 UI->getOpcode() != X86ISD::BLENDV) ||
42332 UI.getOperandNo() != 0)
42333 return false;
42334
42335 return true;
42336 };
42337
42338 APInt DemandedBits(APInt::getSignMask(BitWidth));
42339
42340 if (OnlyUsedAsSelectCond(Cond)) {
42341 KnownBits Known;
42342 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
42343 !DCI.isBeforeLegalizeOps());
42344 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
42345 return SDValue();
42346
42347 // If we changed the computation somewhere in the DAG, this change will
42348 // affect all users of Cond. Update all the nodes so that we do not use
42349 // the generic VSELECT anymore. Otherwise, we may perform wrong
42350 // optimizations as we messed with the actual expectation for the vector
42351 // boolean values.
42352 for (SDNode *U : Cond->uses()) {
42353 if (U->getOpcode() == X86ISD::BLENDV)
42354 continue;
42355
42356 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
42357 Cond, U->getOperand(1), U->getOperand(2));
42358 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
42359 DCI.AddToWorklist(U);
42360 }
42361 DCI.CommitTargetLoweringOpt(TLO);
42362 return SDValue(N, 0);
42363 }
42364
42365 // Otherwise we can still at least try to simplify multiple use bits.
42366 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
42367 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
42368 N->getOperand(1), N->getOperand(2));
42369
42370 return SDValue();
42371}
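
For reference, a scalar model of the sign-bit-driven blend semantics this combine targets; this is an illustrative sketch, not the lowering itself, and the operand order mirrors the ISD::VSELECT node (condition, true value, false value):

#include <cassert>
#include <cstdint>

// Scalar model of a variable blend that only inspects the sign (high) bit of
// each mask element, which is why the combine above only needs to keep the
// sign bit of the condition alive (APInt::getSignMask(BitWidth)).
static int8_t blendvLane(int8_t Mask, int8_t OnTrue, int8_t OnFalse) {
  return (Mask < 0) ? OnTrue : OnFalse; // only bit 7 of the mask matters
}

int main() {
  assert(blendvLane(static_cast<int8_t>(0x80), 1, 2) == 1); // high bit set
  assert(blendvLane(0x7F, 1, 2) == 2);                      // high bit clear
  assert(blendvLane(static_cast<int8_t>(0xFF), 1, 2) == 1); // all-ones mask
}
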
42372
42373// Try to match:
42374// (or (and (M, (sub 0, X)), (pandn M, X)))
42375// which is a special case of:
42376// (select M, (sub 0, X), X)
42377// Per:
42378// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
42379// We know that, if fNegate is 0 or 1:
42380// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
42381//
42382// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
42383// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
42384// ( M ? -X : X) == ((X ^ M ) + (M & 1))
42385// This lets us transform our vselect to:
42386// (add (xor X, M), (and M, 1))
42387// And further to:
42388// (sub (xor X, M), M)
42389static SDValue combineLogicBlendIntoConditionalNegate(
42390 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
42391 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
42392 EVT MaskVT = Mask.getValueType();
42393 assert(MaskVT.isInteger() &&
42394        DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
42395        "Mask must be zero/all-bits");
42396
42397 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
42398 return SDValue();
42399 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
42400 return SDValue();
42401
42402 auto IsNegV = [](SDNode *N, SDValue V) {
42403 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
42404 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
42405 };
42406
42407 SDValue V;
42408 if (IsNegV(Y.getNode(), X))
42409 V = X;
42410 else if (IsNegV(X.getNode(), Y))
42411 V = Y;
42412 else
42413 return SDValue();
42414
42415 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
42416 SDValue SubOp2 = Mask;
42417
42418 // If the negate was on the false side of the select, then
42419 // the operands of the SUB need to be swapped. PR 27251.
42420 // This is because the pattern being matched above is
42421 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
42422 // but if the pattern matched was
42423 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
42424 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
42425 // pattern also needs to be a negation of the replacement pattern above.
42426 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
42427 // sub accomplishes the negation of the replacement pattern.
42428 if (V == Y)
42429 std::swap(SubOp1, SubOp2);
42430
42431 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
42432 return DAG.getBitcast(VT, Res);
42433}
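
The conditional-negate identity quoted in the comment above can be verified directly; a minimal standalone sketch (illustrative only, assuming two's-complement arithmetic):

#include <cassert>
#include <cstdint>

// Scalar check of the identity used above: with an all-ones/all-zeros mask
// M, (M ? -X : X) == (X ^ M) - M. M == 0 leaves X untouched; M == -1 gives
// ~X - (-1) == ~X + 1 == -X in two's complement.
int main() {
  const int32_t Xs[] = {0, 1, -1, 7, -7, 123456, -123456};
  const int32_t Ms[] = {0, -1};
  for (int32_t X : Xs)
    for (int32_t M : Ms) {
      int32_t Expected = (M != 0) ? -X : X;
      int32_t ViaXorSub = (X ^ M) - M;
      assert(ViaXorSub == Expected);
    }
}
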
42434
42435/// Do target-specific dag combines on SELECT and VSELECT nodes.
42436static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
42437 TargetLowering::DAGCombinerInfo &DCI,
42438 const X86Subtarget &Subtarget) {
42439 SDLoc DL(N);
42440 SDValue Cond = N->getOperand(0);
42441 SDValue LHS = N->getOperand(1);
42442 SDValue RHS = N->getOperand(2);
42443
42444 // Try simplification again because we use this function to optimize
42445 // BLENDV nodes that are not handled by the generic combiner.
42446 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
42447 return V;
42448
42449 EVT VT = LHS.getValueType();
42450 EVT CondVT = Cond.getValueType();
42451 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42452 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
42453
42454 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
42455 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
42456 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
42457 if (CondVT.isVector() && CondVT.isInteger() &&
42458 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
42459 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
42460 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
42461 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
42462 DL, DAG, Subtarget))
42463 return V;
42464
42465 // Convert vselects with constant condition into shuffles.
42466 if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
42467 SmallVector<int, 64> Mask;
42468 if (createShuffleMaskFromVSELECT(Mask, Cond))
42469 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
42470 }
42471
42472 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
42473 // by forcing the unselected elements to zero.
42474 // TODO: Can we handle more shuffles with this?
42475 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
42476 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
42477 LHS.hasOneUse() && RHS.hasOneUse()) {
42478 MVT SimpleVT = VT.getSimpleVT();
42479 SmallVector<SDValue, 1> LHSOps, RHSOps;
42480 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
42481 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
42482 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
42483 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
42484 int NumElts = VT.getVectorNumElements();
42485 for (int i = 0; i != NumElts; ++i) {
42486 if (CondMask[i] < NumElts)
42487 RHSMask[i] = 0x80;
42488 else
42489 LHSMask[i] = 0x80;
42490 }
42491 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
42492 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
42493 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
42494 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
42495 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
42496 }
42497 }
42498
42499 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
42500 // instructions match the semantics of the common C idiom x<y?x:y but not
42501 // x<=y?x:y, because of how they handle negative zero (which can be
42502 // ignored in unsafe-math mode).
42503 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
42504 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
42505 VT != MVT::f80 && VT != MVT::f128 &&
42506 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
42507 (Subtarget.hasSSE2() ||
42508 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
42509 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42510
42511 unsigned Opcode = 0;
42512 // Check for x CC y ? x : y.
42513 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
42514 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
42515 switch (CC) {
42516 default: break;
42517 case ISD::SETULT:
42518 // Converting this to a min would handle NaNs incorrectly, and swapping
42519 // the operands would cause it to handle comparisons between positive
42520 // and negative zero incorrectly.
42521 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
42522 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42523 !(DAG.isKnownNeverZeroFloat(LHS) ||
42524 DAG.isKnownNeverZeroFloat(RHS)))
42525 break;
42526 std::swap(LHS, RHS);
42527 }
42528 Opcode = X86ISD::FMIN;
42529 break;
42530 case ISD::SETOLE:
42531 // Converting this to a min would handle comparisons between positive
42532 // and negative zero incorrectly.
42533 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42534 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
42535 break;
42536 Opcode = X86ISD::FMIN;
42537 break;
42538 case ISD::SETULE:
42539 // Converting this to a min would handle both negative zeros and NaNs
42540 // incorrectly, but we can swap the operands to fix both.
42541 std::swap(LHS, RHS);
42542 LLVM_FALLTHROUGH;
42543 case ISD::SETOLT:
42544 case ISD::SETLT:
42545 case ISD::SETLE:
42546 Opcode = X86ISD::FMIN;
42547 break;
42548
42549 case ISD::SETOGE:
42550 // Converting this to a max would handle comparisons between positive
42551 // and negative zero incorrectly.
42552 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42553 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
42554 break;
42555 Opcode = X86ISD::FMAX;
42556 break;
42557 case ISD::SETUGT:
42558 // Converting this to a max would handle NaNs incorrectly, and swapping
42559 // the operands would cause it to handle comparisons between positive
42560 // and negative zero incorrectly.
42561 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
42562 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42563 !(DAG.isKnownNeverZeroFloat(LHS) ||
42564 DAG.isKnownNeverZeroFloat(RHS)))
42565 break;
42566 std::swap(LHS, RHS);
42567 }
42568 Opcode = X86ISD::FMAX;
42569 break;
42570 case ISD::SETUGE:
42571 // Converting this to a max would handle both negative zeros and NaNs
42572 // incorrectly, but we can swap the operands to fix both.
42573 std::swap(LHS, RHS);
42574 LLVM_FALLTHROUGH;
42575 case ISD::SETOGT:
42576 case ISD::SETGT:
42577 case ISD::SETGE:
42578 Opcode = X86ISD::FMAX;
42579 break;
42580 }
42581 // Check for x CC y ? y : x -- a min/max with reversed arms.
42582 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
42583 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
42584 switch (CC) {
42585 default: break;
42586 case ISD::SETOGE:
42587 // Converting this to a min would handle comparisons between positive
42588 // and negative zero incorrectly, and swapping the operands would
42589 // cause it to handle NaNs incorrectly.
42590 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42591 !(DAG.isKnownNeverZeroFloat(LHS) ||
42592 DAG.isKnownNeverZeroFloat(RHS))) {
42593 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42594 break;
42595 std::swap(LHS, RHS);
42596 }
42597 Opcode = X86ISD::FMIN;
42598 break;
42599 case ISD::SETUGT:
42600 // Converting this to a min would handle NaNs incorrectly.
42601 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42602 break;
42603 Opcode = X86ISD::FMIN;
42604 break;
42605 case ISD::SETUGE:
42606 // Converting this to a min would handle both negative zeros and NaNs
42607 // incorrectly, but we can swap the operands to fix both.
42608 std::swap(LHS, RHS);
42609 LLVM_FALLTHROUGH;
42610 case ISD::SETOGT:
42611 case ISD::SETGT:
42612 case ISD::SETGE:
42613 Opcode = X86ISD::FMIN;
42614 break;
42615
42616 case ISD::SETULT:
42617 // Converting this to a max would handle NaNs incorrectly.
42618 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42619 break;
42620 Opcode = X86ISD::FMAX;
42621 break;
42622 case ISD::SETOLE:
42623 // Converting this to a max would handle comparisons between positive
42624 // and negative zero incorrectly, and swapping the operands would
42625 // cause it to handle NaNs incorrectly.
42626 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42627 !DAG.isKnownNeverZeroFloat(LHS) &&
42628 !DAG.isKnownNeverZeroFloat(RHS)) {
42629 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42630 break;
42631 std::swap(LHS, RHS);
42632 }
42633 Opcode = X86ISD::FMAX;
42634 break;
42635 case ISD::SETULE:
42636 // Converting this to a max would handle both negative zeros and NaNs
42637 // incorrectly, but we can swap the operands to fix both.
42638 std::swap(LHS, RHS);
42639 LLVM_FALLTHROUGH;
42640 case ISD::SETOLT:
42641 case ISD::SETLT:
42642 case ISD::SETLE:
42643 Opcode = X86ISD::FMAX;
42644 break;
42645 }
42646 }
42647
42648 if (Opcode)
42649 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
42650 }
42651
42652 // Some mask scalar intrinsics rely on checking if only one bit is set
42653 // and implement it in C code like this:
42654 // A[0] = (U & 1) ? A[0] : W[0];
42655 // This creates some redundant instructions that break pattern matching.
42656 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
42657 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
42658 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
42659 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42660 SDValue AndNode = Cond.getOperand(0);
42661 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
42662 isNullConstant(Cond.getOperand(1)) &&
42663 isOneConstant(AndNode.getOperand(1))) {
42664 // LHS and RHS swapped due to
42665 // setcc outputting 1 when AND resulted in 0 and vice versa.
42666 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
42667 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
42668 }
42669 }
42670
42671 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
42672 // lowering on KNL. In this case we convert it to
42673 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
42674 // The same applies to all vectors of i8 and i16 without BWI.
42675 // Make sure we extend these even before type legalization gets a chance to
42676 // split wide vectors.
42677 // Since SKX, these selects have a proper lowering.
42678 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
42679 CondVT.getVectorElementType() == MVT::i1 &&
42680 (VT.getVectorElementType() == MVT::i8 ||
42681 VT.getVectorElementType() == MVT::i16)) {
42682 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
42683 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
42684 }
42685
42686 // AVX512 - Extend select with zero to merge with target shuffle.
42687 // select(mask, extract_subvector(shuffle(x)), zero) -->
42688 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
42689 // TODO - support non target shuffles as well.
42690 if (Subtarget.hasAVX512() && CondVT.isVector() &&
42691 CondVT.getVectorElementType() == MVT::i1) {
42692 auto SelectableOp = [&TLI](SDValue Op) {
42693 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42694 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
42695 isNullConstant(Op.getOperand(1)) &&
42696 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
42697 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
42698 };
42699
42700 bool SelectableLHS = SelectableOp(LHS);
42701 bool SelectableRHS = SelectableOp(RHS);
42702 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
42703 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
42704
42705 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
42706 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
42707 : RHS.getOperand(0).getValueType();
42708 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
42709 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
42710 VT.getSizeInBits());
42711 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
42712 VT.getSizeInBits());
42713 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
42714 DAG.getUNDEF(SrcCondVT), Cond,
42715 DAG.getIntPtrConstant(0, DL));
42716 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
42717 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
42718 }
42719 }
42720
42721 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
42722 return V;
42723
42724 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
42725 Cond.hasOneUse()) {
42726 EVT CondVT = Cond.getValueType();
42727 SDValue Cond0 = Cond.getOperand(0);
42728 SDValue Cond1 = Cond.getOperand(1);
42729 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42730
42731 // Canonicalize min/max:
42732 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
42733 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
42734 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
42735 // the need for an extra compare against zero. e.g.
42736 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
42737 // subl %esi, %edi
42738 // testl %edi, %edi
42739 // movl $0, %eax
42740 // cmovgl %edi, %eax
42741 // =>
42742 // xorl %eax, %eax
42743 // subl %esi, %edi
42744 // cmovsl %eax, %edi
42745 //
42746 // We can also canonicalize
42747 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
42748 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
42749 // This allows the use of a test instruction for the compare.
42750 if (LHS == Cond0 && RHS == Cond1) {
42751 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
42752 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
42753 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
42754 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42755 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42756 }
42757 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
42758 ISD::CondCode NewCC = ISD::SETUGE;
42759 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42760 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42761 }
42762 }
42763
42764 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
42765 // fold eq + gt/lt nested selects into ge/le selects
42766 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
42767 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
42768 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
42769 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
42770 // .. etc ..
42771 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
42772 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
42773 SDValue InnerSetCC = RHS.getOperand(0);
42774 ISD::CondCode InnerCC =
42775 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
42776 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
42777 Cond0 == InnerSetCC.getOperand(0) &&
42778 Cond1 == InnerSetCC.getOperand(1)) {
42779 ISD::CondCode NewCC;
42780 switch (CC == ISD::SETEQ ? InnerCC : CC) {
42781 case ISD::SETGT: NewCC = ISD::SETGE; break;
42782 case ISD::SETLT: NewCC = ISD::SETLE; break;
42783 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
42784 case ISD::SETULT: NewCC = ISD::SETULE; break;
42785 default: NewCC = ISD::SETCC_INVALID; break;
42786 }
42787 if (NewCC != ISD::SETCC_INVALID) {
42788 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
42789 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
42790 }
42791 }
42792 }
42793 }
42794
42795 // Check if the first operand is all zeros and Cond type is vXi1.
42796 // If this is an AVX512 target, we can improve the use of zero masking by
42797 // swapping the operands and inverting the condition.
42798 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
42799 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
42800 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
42801 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
42802 // Invert the cond to not(cond) : xor(op,allones)=not(op)
42803 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
42804 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
42805 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
42806 }
42807
42808 // Early exit check
42809 if (!TLI.isTypeLegal(VT))
42810 return SDValue();
42811
42812 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
42813 return V;
42814
42815 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
42816 return V;
42817
42818 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
42819 return V;
42820
42821 // select(~Cond, X, Y) -> select(Cond, Y, X)
42822 if (CondVT.getScalarType() != MVT::i1) {
42823 if (SDValue CondNot = IsNOT(Cond, DAG))
42824 return DAG.getNode(N->getOpcode(), DL, VT,
42825 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
42826 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
42827 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
42828 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
42829 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
42830 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
42831 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
42832 }
42833 }
42834
42835 // Try to optimize vXi1 selects if both operands are either all constants or
42836 // bitcasts from scalar integer type. In that case we can convert the operands
42837 // to integer and use an integer select which will be converted to a CMOV.
42838 // We need to take a little bit of care to avoid creating an i64 type after
42839 // type legalization.
42840 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
42841 VT.getVectorElementType() == MVT::i1 &&
42842 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
42843 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42844 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
42845 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
42846
42847 if ((LHSIsConst ||
42848 (LHS.getOpcode() == ISD::BITCAST &&
42849 LHS.getOperand(0).getValueType() == IntVT)) &&
42850 (RHSIsConst ||
42851 (RHS.getOpcode() == ISD::BITCAST &&
42852 RHS.getOperand(0).getValueType() == IntVT))) {
42853 if (LHSIsConst)
42854 LHS = combinevXi1ConstantToInteger(LHS, DAG);
42855 else
42856 LHS = LHS.getOperand(0);
42857
42858 if (RHSIsConst)
42859 RHS = combinevXi1ConstantToInteger(RHS, DAG);
42860 else
42861 RHS = RHS.getOperand(0);
42862
42863 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
42864 return DAG.getBitcast(VT, Select);
42865 }
42866 }
42867
42868 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
42869 // single bits, then invert the predicate and swap the select operands.
42870 // This can lower using a vector shift bit-hack rather than mask and compare.
42871 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
42872 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
42873 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
42874 Cond.getOperand(0).getOpcode() == ISD::AND &&
42875 isNullOrNullSplat(Cond.getOperand(1)) &&
42876 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
42877 Cond.getOperand(0).getValueType() == VT) {
42878 // The 'and' mask must be composed of power-of-2 constants.
42879 SDValue And = Cond.getOperand(0);
42880 auto *C = isConstOrConstSplat(And.getOperand(1));
42881 if (C && C->getAPIntValue().isPowerOf2()) {
42882 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
42883 SDValue NotCond =
42884 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
42885 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
42886 }
42887
42888 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
42889 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
42890 // 16-bit lacks a proper blendv.
42891 unsigned EltBitWidth = VT.getScalarSizeInBits();
42892 bool CanShiftBlend =
42893 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
42894 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
42895 (Subtarget.hasXOP()));
42896 if (CanShiftBlend &&
42897 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
42898 return C->getAPIntValue().isPowerOf2();
42899 })) {
42900 // Create a left-shift constant to get the mask bits over to the sign-bit.
42901 SDValue Mask = And.getOperand(1);
42902 SmallVector<int, 32> ShlVals;
42903 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
42904 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
42905 ShlVals.push_back(EltBitWidth - 1 -
42906 MaskVal->getAPIntValue().exactLogBase2());
42907 }
42908 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
42909 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
42910 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
42911 SDValue NewCond =
42912 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
42913 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
42914 }
42915 }
42916
42917 return SDValue();
42918}
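
The vector shift bit-hack used at the end of this function rests on a per-lane identity that can be checked with scalars; a minimal sketch (helper names are illustrative, 32-bit lanes and two's-complement wrap assumed, as on x86):

#include <cassert>
#include <cstdint>

// Scalar model of the vector shift bit-hack at the end of combineSelect:
// for a single power-of-2 mask bit C = 1 << K,
//   ((X & C) == 0) ? Y : Z  ==  ((int32_t)(X << (31 - K)) < 0) ? Z : Y
// because the left shift moves the tested bit into the sign position.
static int32_t selectViaShift(uint32_t X, unsigned K, int32_t Y, int32_t Z) {
  int32_t Shifted = static_cast<int32_t>(X << (31 - K));
  return (Shifted < 0) ? Z : Y; // operands swapped, predicate inverted
}

int main() {
  const unsigned Ks[] = {0, 3, 17, 31};
  const uint32_t Xs[] = {0u, 1u, 0x80000000u, 0xDEADBEEFu};
  for (unsigned K : Ks)
    for (uint32_t X : Xs) {
      int32_t Ref = ((X & (1u << K)) == 0) ? 10 : 20;
      assert(selectViaShift(X, K, 10, 20) == Ref);
    }
}
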
42919
42920/// Combine:
42921/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
42922/// to:
42923/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
42924/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
42925/// Note that this is only legal for some op/cc combinations.
42926static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
42927 SelectionDAG &DAG,
42928 const X86Subtarget &Subtarget) {
42929 // This combine only operates on CMP-like nodes.
42930 if (!(Cmp.getOpcode() == X86ISD::CMP ||
42931 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42932 return SDValue();
42933
42934 // Can't replace the cmp if it has more uses than the one we're looking at.
42935 // FIXME: We would like to be able to handle this, but would need to make sure
42936 // all uses were updated.
42937 if (!Cmp.hasOneUse())
42938 return SDValue();
42939
42940 // This only applies to variations of the common case:
42941 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
42942 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
42943 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
42944 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
42945 // Using the proper condcodes (see below), overflow is checked for.
42946
42947 // FIXME: We can generalize both constraints:
42948 // - XOR/OR/AND (if they were made to survive AtomicExpand)
42949 // - LHS != 1
42950 // if the result is compared.
42951
42952 SDValue CmpLHS = Cmp.getOperand(0);
42953 SDValue CmpRHS = Cmp.getOperand(1);
42954 EVT CmpVT = CmpLHS.getValueType();
42955
42956 if (!CmpLHS.hasOneUse())
42957 return SDValue();
42958
42959 unsigned Opc = CmpLHS.getOpcode();
42960 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
42961 return SDValue();
42962
42963 SDValue OpRHS = CmpLHS.getOperand(2);
42964 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
42965 if (!OpRHSC)
42966 return SDValue();
42967
42968 APInt Addend = OpRHSC->getAPIntValue();
42969 if (Opc == ISD::ATOMIC_LOAD_SUB)
42970 Addend = -Addend;
42971
42972 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
42973 if (!CmpRHSC)
42974 return SDValue();
42975
42976 APInt Comparison = CmpRHSC->getAPIntValue();
42977 APInt NegAddend = -Addend;
42978
42979 // See if we can adjust the CC to make the comparison match the negated
42980 // addend.
42981 if (Comparison != NegAddend) {
42982 APInt IncComparison = Comparison + 1;
42983 if (IncComparison == NegAddend) {
42984 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
42985 Comparison = IncComparison;
42986 CC = X86::COND_AE;
42987 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
42988 Comparison = IncComparison;
42989 CC = X86::COND_L;
42990 }
42991 }
42992 APInt DecComparison = Comparison - 1;
42993 if (DecComparison == NegAddend) {
42994 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
42995 Comparison = DecComparison;
42996 CC = X86::COND_A;
42997 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
42998 Comparison = DecComparison;
42999 CC = X86::COND_LE;
43000 }
43001 }
43002 }
43003
43004 // If the addend is the negation of the comparison value, then we can do
43005 // a full comparison by emitting the atomic arithmetic as a locked sub.
43006 if (Comparison == NegAddend) {
43007 // The CC is fine, but we need to rewrite the LHS of the comparison as an
43008 // atomic sub.
43009 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
43010 auto AtomicSub = DAG.getAtomic(
43011 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
43012 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
43013 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
43014 AN->getMemOperand());
43015 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
43016 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
43017 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
43018 return LockOp;
43019 }
43020
43021 // We can handle comparisons with zero in a number of cases by manipulating
43022 // the CC used.
43023 if (!Comparison.isNullValue())
43024 return SDValue();
43025
43026 if (CC == X86::COND_S && Addend == 1)
43027 CC = X86::COND_LE;
43028 else if (CC == X86::COND_NS && Addend == 1)
43029 CC = X86::COND_G;
43030 else if (CC == X86::COND_G && Addend == -1)
43031 CC = X86::COND_GE;
43032 else if (CC == X86::COND_LE && Addend == -1)
43033 CC = X86::COND_L;
43034 else
43035 return SDValue();
43036
43037 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
43038 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
43039 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
43040 return LockOp;
43041}
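
The flag identity exploited here (COND_S on x equals COND_LE on the flags of x + 1, with overflow accounted for) can be checked exhaustively at a small width; a minimal sketch modelling ZF/SF/OF by hand (illustrative only):

#include <cassert>
#include <cstdint>

// Exhaustive i8 check of the flag identity used above: the sign test "x < 0"
// (COND_S) can be answered from the EFLAGS of the locked increment "x + 1"
// by using COND_LE, i.e. ZF | (SF != OF), which stays correct on overflow.
int main() {
  for (int v = -128; v <= 127; ++v) {
    int8_t x = static_cast<int8_t>(v);
    int8_t r = static_cast<int8_t>(x + 1); // result of LOCK ADD x, 1 (wraps)
    bool ZF = (r == 0);
    bool SF = (r < 0);
    bool OF = (x == INT8_MAX);             // signed overflow of x + 1
    bool CondLE = ZF || (SF != OF);        // X86::COND_LE
    assert((x < 0) == CondLE);             // COND_S on x == COND_LE on x + 1
  }
}
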
43042
43043// Check whether a boolean test is testing a boolean value generated by
43044// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
43045// code.
43046//
43047// Simplify the following patterns:
43048// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
43049// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
43050// to (Op EFLAGS Cond)
43051//
43052// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
43053// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
43054// to (Op EFLAGS !Cond)
43055//
43056// where Op could be BRCOND or CMOV.
43057//
43058static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
43059 // This combine only operates on CMP-like nodes.
43060 if (!(Cmp.getOpcode() == X86ISD::CMP ||
43061 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
43062 return SDValue();
43063
43064 // Quit if not used as a boolean value.
43065 if (CC != X86::COND_E && CC != X86::COND_NE)
43066 return SDValue();
43067
43068 // Check CMP operands. One of them should be 0 or 1 and the other should be
43069 // a SetCC or extended from it.
43070 SDValue Op1 = Cmp.getOperand(0);
43071 SDValue Op2 = Cmp.getOperand(1);
43072
43073 SDValue SetCC;
43074 const ConstantSDNode* C = nullptr;
43075 bool needOppositeCond = (CC == X86::COND_E);
43076 bool checkAgainstTrue = false; // Is it a comparison against 1?
43077
43078 if ((C = dyn_cast<ConstantSDNode>(Op1)))
43079 SetCC = Op2;
43080 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
43081 SetCC = Op1;
43082 else // Quit if neither operand is a constant.
43083 return SDValue();
43084
43085 if (C->getZExtValue() == 1) {
43086 needOppositeCond = !needOppositeCond;
43087 checkAgainstTrue = true;
43088 } else if (C->getZExtValue() != 0)
43089 // Quit if the constant is neither 0 nor 1.
43090 return SDValue();
43091
43092 bool truncatedToBoolWithAnd = false;
43093 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
43094 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
43095 SetCC.getOpcode() == ISD::TRUNCATE ||
43096 SetCC.getOpcode() == ISD::AND) {
43097 if (SetCC.getOpcode() == ISD::AND) {
43098 int OpIdx = -1;
43099 if (isOneConstant(SetCC.getOperand(0)))
43100 OpIdx = 1;
43101 if (isOneConstant(SetCC.getOperand(1)))
43102 OpIdx = 0;
43103 if (OpIdx < 0)
43104 break;
43105 SetCC = SetCC.getOperand(OpIdx);
43106 truncatedToBoolWithAnd = true;
43107 } else
43108 SetCC = SetCC.getOperand(0);
43109 }
43110
43111 switch (SetCC.getOpcode()) {
43112 case X86ISD::SETCC_CARRY:
43113 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
43114 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
43115 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
43116 // truncated to i1 using 'and'.
43117 if (checkAgainstTrue && !truncatedToBoolWithAnd)
43118 break;
43119 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
43120        "Invalid use of SETCC_CARRY!");
43121 LLVM_FALLTHROUGH;
43122 case X86ISD::SETCC:
43123 // Set the condition code or opposite one if necessary.
43124 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
43125 if (needOppositeCond)
43126 CC = X86::GetOppositeBranchCondition(CC);
43127 return SetCC.getOperand(1);
43128 case X86ISD::CMOV: {
43129 // Check whether false/true value has canonical one, i.e. 0 or 1.
43130 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
43131 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
43132 // Quit if true value is not a constant.
43133 if (!TVal)
43134 return SDValue();
43135 // Quit if false value is not a constant.
43136 if (!FVal) {
43137 SDValue Op = SetCC.getOperand(0);
43138 // Skip 'zext' or 'trunc' node.
43139 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
43140 Op.getOpcode() == ISD::TRUNCATE)
43141 Op = Op.getOperand(0);
43142 // A special case for rdrand/rdseed, where 0 is set if false cond is
43143 // found.
43144 if ((Op.getOpcode() != X86ISD::RDRAND &&
43145 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
43146 return SDValue();
43147 }
43148 // Quit if false value is not the constant 0 or 1.
43149 bool FValIsFalse = true;
43150 if (FVal && FVal->getZExtValue() != 0) {
43151 if (FVal->getZExtValue() != 1)
43152 return SDValue();
43153 // If FVal is 1, opposite cond is needed.
43154 needOppositeCond = !needOppositeCond;
43155 FValIsFalse = false;
43156 }
43157 // Quit if TVal is not the constant opposite of FVal.
43158 if (FValIsFalse && TVal->getZExtValue() != 1)
43159 return SDValue();
43160 if (!FValIsFalse && TVal->getZExtValue() != 0)
43161 return SDValue();
43162 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
43163 if (needOppositeCond)
43164 CC = X86::GetOppositeBranchCondition(CC);
43165 return SetCC.getOperand(3);
43166 }
43167 }
43168
43169 return SDValue();
43170}
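
The simplification above reduces to a small boolean identity; a minimal sketch (illustrative only):

#include <cassert>

// Boolean identity behind this combine: if a flag has already been
// materialized as b = setcc(cond) with b in {0, 1}, then re-testing b
// against 0 or 1 is just the original condition, possibly inverted.
int main() {
  const bool Conds[] = {false, true};
  for (bool Cond : Conds) {
    int B = Cond ? 1 : 0;      // X86ISD::SETCC result
    assert((B == 1) == Cond);  // (CMP b, 1), EQ  -> cond
    assert((B == 0) == !Cond); // (CMP b, 0), EQ  -> !cond
    assert((B != 0) == Cond);  // (CMP b, 0), NEQ -> cond
  }
}
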
43171
43172/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
43173/// Match:
43174/// (X86or (X86setcc) (X86setcc))
43175/// (X86cmp (and (X86setcc) (X86setcc)), 0)
43176static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
43177 X86::CondCode &CC1, SDValue &Flags,
43178 bool &isAnd) {
43179 if (Cond->getOpcode() == X86ISD::CMP) {
43180 if (!isNullConstant(Cond->getOperand(1)))
43181 return false;
43182
43183 Cond = Cond->getOperand(0);
43184 }
43185
43186 isAnd = false;
43187
43188 SDValue SetCC0, SetCC1;
43189 switch (Cond->getOpcode()) {
43190 default: return false;
43191 case ISD::AND:
43192 case X86ISD::AND:
43193 isAnd = true;
43194 LLVM_FALLTHROUGH;
43195 case ISD::OR:
43196 case X86ISD::OR:
43197 SetCC0 = Cond->getOperand(0);
43198 SetCC1 = Cond->getOperand(1);
43199 break;
43200 };
43201
43202 // Make sure we have SETCC nodes, using the same flags value.
43203 if (SetCC0.getOpcode() != X86ISD::SETCC ||
43204 SetCC1.getOpcode() != X86ISD::SETCC ||
43205 SetCC0->getOperand(1) != SetCC1->getOperand(1))
43206 return false;
43207
43208 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
43209 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
43210 Flags = SetCC0->getOperand(1);
43211 return true;
43212}
43213
43214// When legalizing carry, we create carries via add X, -1
43215// If that comes from an actual carry, via setcc, we use the
43216// carry directly.
43217static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
43218 if (EFLAGS.getOpcode() == X86ISD::ADD) {
43219 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
43220 SDValue Carry = EFLAGS.getOperand(0);
43221 while (Carry.getOpcode() == ISD::TRUNCATE ||
43222 Carry.getOpcode() == ISD::ZERO_EXTEND ||
43223 Carry.getOpcode() == ISD::SIGN_EXTEND ||
43224 Carry.getOpcode() == ISD::ANY_EXTEND ||
43225 (Carry.getOpcode() == ISD::AND &&
43226 isOneConstant(Carry.getOperand(1))))
43227 Carry = Carry.getOperand(0);
43228 if (Carry.getOpcode() == X86ISD::SETCC ||
43229 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
43230 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
43231 uint64_t CarryCC = Carry.getConstantOperandVal(0);
43232 SDValue CarryOp1 = Carry.getOperand(1);
43233 if (CarryCC == X86::COND_B)
43234 return CarryOp1;
43235 if (CarryCC == X86::COND_A) {
43236 // Try to convert COND_A into COND_B in an attempt to facilitate
43237 // materializing "setb reg".
43238 //
43239 // Do not flip "e > c", where "c" is a constant, because Cmp
43240 // instruction cannot take an immediate as its first operand.
43241 //
43242 if (CarryOp1.getOpcode() == X86ISD::SUB &&
43243 CarryOp1.getNode()->hasOneUse() &&
43244 CarryOp1.getValueType().isInteger() &&
43245 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
43246 SDValue SubCommute =
43247 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
43248 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
43249 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
43250 }
43251 }
43252 // If this is a check of the z flag of an add with 1, switch to the
43253 // C flag.
43254 if (CarryCC == X86::COND_E &&
43255 CarryOp1.getOpcode() == X86ISD::ADD &&
43256 isOneConstant(CarryOp1.getOperand(1)))
43257 return CarryOp1;
43258 }
43259 }
43260 }
43261
43262 return SDValue();
43263}
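
The carry identity this relies on, for a boolean b legalized as "add b, -1", can be checked directly; a minimal sketch (illustrative only):

#include <cassert>
#include <cstdint>

// Identity behind combineCarryThroughADD: when a boolean b in {0, 1} is
// legalized as "add b, -1", the carry flag of that add equals b, so the
// original setcc carry can be used directly.
int main() {
  const uint32_t Bools[] = {0u, 1u};
  for (uint32_t B : Bools) {
    uint64_t Wide = static_cast<uint64_t>(B) + 0xFFFFFFFFull; // b + (-1)
    bool CarryOut = (Wide >> 32) != 0;                        // CF of the add
    assert(CarryOut == (B != 0));
  }
}
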
43264
43265 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
43266/// to avoid the inversion.
43267static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
43268 SelectionDAG &DAG,
43269 const X86Subtarget &Subtarget) {
43270 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
43271 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
43272 EFLAGS.getOpcode() != X86ISD::TESTP)
43273 return SDValue();
43274
43275 // PTEST/TESTP sets EFLAGS as:
43276 // TESTZ: ZF = (Op0 & Op1) == 0
43277 // TESTC: CF = (~Op0 & Op1) == 0
43278 // TESTNZC: ZF == 0 && CF == 0
43279 EVT VT = EFLAGS.getValueType();
43280 SDValue Op0 = EFLAGS.getOperand(0);
43281 SDValue Op1 = EFLAGS.getOperand(1);
43282 EVT OpVT = Op0.getValueType();
43283
43284 // TEST*(~X,Y) == TEST*(X,Y)
43285 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
43286 X86::CondCode InvCC;
43287 switch (CC) {
43288 case X86::COND_B:
43289 // testc -> testz.
43290 InvCC = X86::COND_E;
43291 break;
43292 case X86::COND_AE:
43293 // !testc -> !testz.
43294 InvCC = X86::COND_NE;
43295 break;
43296 case X86::COND_E:
43297 // testz -> testc.
43298 InvCC = X86::COND_B;
43299 break;
43300 case X86::COND_NE:
43301 // !testz -> !testc.
43302 InvCC = X86::COND_AE;
43303 break;
43304 case X86::COND_A:
43305 case X86::COND_BE:
43306 // testnzc -> testnzc (no change).
43307 InvCC = CC;
43308 break;
43309 default:
43310 InvCC = X86::COND_INVALID;
43311 break;
43312 }
43313
43314 if (InvCC != X86::COND_INVALID) {
43315 CC = InvCC;
43316 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43317 DAG.getBitcast(OpVT, NotOp0), Op1);
43318 }
43319 }
43320
43321 if (CC == X86::COND_E || CC == X86::COND_NE) {
43322 // TESTZ(X,~Y) == TESTC(Y,X)
43323 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
43324 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
43325 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43326 DAG.getBitcast(OpVT, NotOp1), Op0);
43327 }
43328
43329 if (Op0 == Op1) {
43330 SDValue BC = peekThroughBitcasts(Op0);
43331 EVT BCVT = BC.getValueType();
43332 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
43333        "Unexpected vector type");
43334
43335 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
43336 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
43337 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43338 DAG.getBitcast(OpVT, BC.getOperand(0)),
43339 DAG.getBitcast(OpVT, BC.getOperand(1)));
43340 }
43341
43342 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
43343 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
43344 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
43345 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43346 DAG.getBitcast(OpVT, BC.getOperand(0)),
43347 DAG.getBitcast(OpVT, BC.getOperand(1)));
43348 }
43349
43350 // If every element is an all-sign value, see if we can use MOVMSK to
43351 // more efficiently extract the sign bits and compare that.
43352 // TODO: Handle TESTC with comparison inversion.
43353 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
43354 // MOVMSK combines to make sure it's never worse than PTEST?
43355 unsigned EltBits = BCVT.getScalarSizeInBits();
43356 if (DAG.ComputeNumSignBits(BC) == EltBits) {
43357 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
43358 APInt SignMask = APInt::getSignMask(EltBits);
43359 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43360 if (SDValue Res =
43361 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
43362 // For vXi16 cases we need to use pmovmskb and extract every other
43363 // sign bit.
43364 SDLoc DL(EFLAGS);
43365 if (EltBits == 16) {
43366 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
43367 Res = DAG.getBitcast(MovmskVT, Res);
43368 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
43369 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
43370 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
43371 } else {
43372 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
43373 }
43374 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
43375 DAG.getConstant(0, DL, MVT::i32));
43376 }
43377 }
43378 }
43379
43380 // TESTZ(-1,X) == TESTZ(X,X)
43381 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
43382 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
43383
43384 // TESTZ(X,-1) == TESTZ(X,X)
43385 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
43386 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
43387 }
43388
43389 return SDValue();
43390}
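
The PTEST/TESTP flag definitions and the NOT-folding identities used above can be modelled with scalar bit operations; a minimal sketch (illustrative only, one 64-bit lane standing in for the whole vector):

#include <cassert>
#include <cstdint>

// Scalar model of the PTEST/TESTP flags listed above (one 64-bit "lane"):
//   ZF ("testz") = (Op0 & Op1) == 0
//   CF ("testc") = (~Op0 & Op1) == 0
// plus checks of the rewrites used in the combine.
static bool testz(uint64_t A, uint64_t B) { return (A & B) == 0; }
static bool testc(uint64_t A, uint64_t B) { return (~A & B) == 0; }

int main() {
  const uint64_t Vals[] = {0x0ull, ~0x0ull, 0x00FF00FF00FF00FFull,
                           0x123456789ABCDEF0ull};
  for (uint64_t X : Vals)
    for (uint64_t Y : Vals) {
      assert(testz(~X, Y) == testc(X, Y)); // TEST*(~X,Y): swap testz <-> testc
      assert(testz(X, ~Y) == testc(Y, X)); // TESTZ(X,~Y) == TESTC(Y,X)
    }
}
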
43391
43392// Attempt to simplify the MOVMSK input based on the comparison type.
43393static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
43394 SelectionDAG &DAG,
43395 const X86Subtarget &Subtarget) {
43396 // Handle eq/ne against zero (any_of).
43397 // Handle eq/ne against -1 (all_of).
43398 if (!(CC == X86::COND_E || CC == X86::COND_NE))
43399 return SDValue();
43400 if (EFLAGS.getValueType() != MVT::i32)
43401 return SDValue();
43402 unsigned CmpOpcode = EFLAGS.getOpcode();
43403 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
43404 return SDValue();
43405 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
43406 if (!CmpConstant)
43407 return SDValue();
43408 const APInt &CmpVal = CmpConstant->getAPIntValue();
43409
43410 SDValue CmpOp = EFLAGS.getOperand(0);
43411 unsigned CmpBits = CmpOp.getValueSizeInBits();
43412 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
43413
43414 // Peek through any truncate.
43415 if (CmpOp.getOpcode() == ISD::TRUNCATE)
43416 CmpOp = CmpOp.getOperand(0);
43417
43418 // Bail if we don't find a MOVMSK.
43419 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
43420 return SDValue();
43421
43422 SDValue Vec = CmpOp.getOperand(0);
43423 MVT VecVT = Vec.getSimpleValueType();
43424 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
43425        "Unexpected MOVMSK operand");
43426 unsigned NumElts = VecVT.getVectorNumElements();
43427 unsigned NumEltBits = VecVT.getScalarSizeInBits();
43428
43429 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
43430 bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
43431 CmpVal.isMask(NumElts);
43432 if (!IsAnyOf && !IsAllOf)
43433 return SDValue();
43434
43435 // See if we can peek through to a vector with a wider element type, if the
43436 // signbits extend down to all the sub-elements as well.
43437 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
43438 // potential SimplifyDemandedBits/Elts cases.
43439 if (Vec.getOpcode() == ISD::BITCAST) {
43440 SDValue BC = peekThroughBitcasts(Vec);
43441 MVT BCVT = BC.getSimpleValueType();
43442 unsigned BCNumElts = BCVT.getVectorNumElements();
43443 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
43444 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
43445 BCNumEltBits > NumEltBits &&
43446 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
43447 SDLoc DL(EFLAGS);
43448 unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
43449 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
43450 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
43451 DAG.getConstant(CmpMask, DL, MVT::i32));
43452 }
43453 }
43454
43455 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
43456 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
43457 if (IsAllOf && Subtarget.hasSSE41()) {
43458 SDValue BC = peekThroughBitcasts(Vec);
43459 if (BC.getOpcode() == X86ISD::PCMPEQ &&
43460 ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
43461 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
43462 SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
43463 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
43464 }
43465 }
43466
43467 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
43468 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
43469 // sign bits prior to the comparison with zero unless we know that
43470 // the vXi16 splats the sign bit down to the lower i8 half.
43471 // TODO: Handle all_of patterns.
43472 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
43473 SDValue VecOp0 = Vec.getOperand(0);
43474 SDValue VecOp1 = Vec.getOperand(1);
43475 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
43476 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
43477 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
43478 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
43479 SDLoc DL(EFLAGS);
43480 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
43481 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
43482 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
43483 if (!SignExt0) {
43484 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
43485 DAG.getConstant(0xAAAA, DL, MVT::i16));
43486 }
43487 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
43488 DAG.getConstant(0, DL, MVT::i16));
43489 }
43490 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
43491 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
43492 if (CmpBits >= 16 && Subtarget.hasInt256() &&
43493 VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43494 VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43495 VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
43496 VecOp0.getConstantOperandAPInt(1) == 0 &&
43497 VecOp1.getConstantOperandAPInt(1) == 8 &&
43498 (IsAnyOf || (SignExt0 && SignExt1))) {
43499 SDLoc DL(EFLAGS);
43500 SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
43501 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
43502 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
43503 if (!SignExt0 || !SignExt1) {
43504 assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
43505 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
43506 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
43507 }
43508 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
43509 DAG.getConstant(CmpMask, DL, MVT::i32));
43510 }
43511 }
43512
43513 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
43514 SmallVector<int, 32> ShuffleMask;
43515 SmallVector<SDValue, 2> ShuffleInputs;
43516 if (NumElts <= CmpBits &&
43517 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
43518 ShuffleMask, DAG) &&
43519 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
43520 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
43521 unsigned NumShuffleElts = ShuffleMask.size();
43522 APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
43523 for (int M : ShuffleMask) {
43524 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
43525 DemandedElts.setBit(M);
43526 }
43527 if (DemandedElts.isAllOnesValue()) {
43528 SDLoc DL(EFLAGS);
43529 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
43530 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
43531 Result =
43532 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
43533 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
43534 EFLAGS.getOperand(1));
43535 }
43536 }
43537
43538 return SDValue();
43539}
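
The any_of/all_of reasoning assumes the usual MOVMSK semantics of packing one sign bit per lane; a minimal scalar sketch (illustrative only, four i8 lanes):

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of the any_of/all_of checks handled above: MOVMSK packs one
// sign bit per lane, so comparing the mask against 0 answers "is any lane
// set" and comparing it against (1 << NumElts) - 1 answers "are all lanes
// set".
static unsigned movmsk(const std::array<int8_t, 4> &V) {
  unsigned M = 0;
  for (unsigned I = 0; I != V.size(); ++I)
    M |= (V[I] < 0 ? 1u : 0u) << I; // take the sign bit of each lane
  return M;
}

int main() {
  const std::array<int8_t, 4> None = {1, 2, 3, 4};
  const std::array<int8_t, 4> Some = {1, -2, 3, 4};
  const std::array<int8_t, 4> All = {-1, -2, -3, -4};
  assert(movmsk(None) == 0);  // any_of: movmsk == 0 means "no lane set"
  assert(movmsk(Some) != 0);  // any_of: movmsk != 0 means "some lane set"
  assert(movmsk(All) == 0xF); // all_of: movmsk == (1 << 4) - 1
}
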
43540
43541/// Optimize an EFLAGS definition used according to the condition code \p CC
43542/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
43543/// uses of chain values.
43544static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
43545 SelectionDAG &DAG,
43546 const X86Subtarget &Subtarget) {
43547 if (CC == X86::COND_B)
43548 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
43549 return Flags;
43550
43551 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
43552 return R;
43553
43554 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
43555 return R;
43556
43557 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
43558 return R;
43559
43560 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
43561}
43562
43563/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
43564static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
43565 TargetLowering::DAGCombinerInfo &DCI,
43566 const X86Subtarget &Subtarget) {
43567 SDLoc DL(N);
43568
43569 SDValue FalseOp = N->getOperand(0);
43570 SDValue TrueOp = N->getOperand(1);
43571 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
43572 SDValue Cond = N->getOperand(3);
43573
43574 // cmov X, X, ?, ? --> X
43575 if (TrueOp == FalseOp)
43576 return TrueOp;
43577
43578 // Try to simplify the EFLAGS and condition code operands.
43579 // We can't always do this as FCMOV only supports a subset of X86 cond.
43580 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
43581 if (!(FalseOp.getValueType() == MVT::f80 ||
43582 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
43583 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
43584 !Subtarget.hasCMov() || hasFPCMov(CC)) {
43585 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
43586 Flags};
43587 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43588 }
43589 }
43590
43591 // If this is a select between two integer constants, try to do some
43592 // optimizations. Note that the operands are ordered the opposite of SELECT
43593 // operands.
43594 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
43595 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
43596 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
43597 // larger than FalseC (the false value).
43598 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
43599 CC = X86::GetOppositeBranchCondition(CC);
43600 std::swap(TrueC, FalseC);
43601 std::swap(TrueOp, FalseOp);
43602 }
43603
43604 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
43605 // This is efficient for any integer data type (including i8/i16) and
43606 // shift amount.
43607 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
43608 Cond = getSETCC(CC, Cond, DL, DAG);
43609
43610 // Zero extend the condition if needed.
43611 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
43612
43613 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
43614 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
43615 DAG.getConstant(ShAmt, DL, MVT::i8));
43616 return Cond;
43617 }
43618
43619 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
43620 // for any integer data type, including i8/i16.
43621 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
43622 Cond = getSETCC(CC, Cond, DL, DAG);
43623
43624 // Zero extend the condition if needed.
43625 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
43626 FalseC->getValueType(0), Cond);
43627 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
43628 SDValue(FalseC, 0));
43629 return Cond;
43630 }
43631
43632 // Optimize cases that will turn into an LEA instruction. This requires
43633 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
43634 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
43635 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
43636 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
43637        "Implicit constant truncation");
43638
43639 bool isFastMultiplier = false;
43640 if (Diff.ult(10)) {
43641 switch (Diff.getZExtValue()) {
43642 default: break;
43643 case 1: // result = add base, cond
43644 case 2: // result = lea base( , cond*2)
43645 case 3: // result = lea base(cond, cond*2)
43646 case 4: // result = lea base( , cond*4)
43647 case 5: // result = lea base(cond, cond*4)
43648 case 8: // result = lea base( , cond*8)
43649 case 9: // result = lea base(cond, cond*8)
43650 isFastMultiplier = true;
43651 break;
43652 }
43653 }
43654
43655 if (isFastMultiplier) {
43656 Cond = getSETCC(CC, Cond, DL, DAG);
43657 // Zero extend the condition if needed.
43658 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
43659 Cond);
43660 // Scale the condition by the difference.
43661 if (Diff != 1)
43662 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
43663 DAG.getConstant(Diff, DL, Cond.getValueType()));
43664
43665 // Add the base if non-zero.
43666 if (FalseC->getAPIntValue() != 0)
43667 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
43668 SDValue(FalseC, 0));
43669 return Cond;
43670 }
43671 }
43672 }
43673 }
43674
43675 // Handle these cases:
43676 // (select (x != c), e, c) -> (select (x != c), e, x),
43677 // (select (x == c), c, e) -> (select (x == c), x, e)
43678 // where the c is an integer constant, and the "select" is the combination
43679 // of CMOV and CMP.
43680 //
43681 // The rationale for this change is that the conditional-move from a constant
43682 // needs two instructions, however, conditional-move from a register needs
43683 // only one instruction.
43684 //
43685 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
43686 // some instruction-combining opportunities. This opt needs to be
43687 // postponed as late as possible.
43688 //
43689 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
43690 // the DCI.xxxx conditions are provided to postpone the optimization as
43691 // late as possible.
43692
43693 ConstantSDNode *CmpAgainst = nullptr;
43694 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
43695 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
43696 !isa<ConstantSDNode>(Cond.getOperand(0))) {
43697
43698 if (CC == X86::COND_NE &&
43699 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
43700 CC = X86::GetOppositeBranchCondition(CC);
43701 std::swap(TrueOp, FalseOp);
43702 }
43703
43704 if (CC == X86::COND_E &&
43705 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
43706 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
43707 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
43708 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43709 }
43710 }
43711 }
43712
43713 // Fold and/or of setcc's to double CMOV:
43714 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
43715 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
43716 //
43717 // This combine lets us generate:
43718 // cmovcc1 (jcc1 if we don't have CMOV)
43719 // cmovcc2 (same)
43720 // instead of:
43721 // setcc1
43722 // setcc2
43723 // and/or
43724 // cmovne (jne if we don't have CMOV)
43725 // When we can't use the CMOV instruction, it might increase branch
43726 // mispredicts.
43727 // When we can use CMOV, or when there is no mispredict, this improves
43728 // throughput and reduces register pressure.
43729 //
43730 if (CC == X86::COND_NE) {
43731 SDValue Flags;
43732 X86::CondCode CC0, CC1;
43733 bool isAndSetCC;
43734 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
43735 if (isAndSetCC) {
43736 std::swap(FalseOp, TrueOp);
43737 CC0 = X86::GetOppositeBranchCondition(CC0);
43738 CC1 = X86::GetOppositeBranchCondition(CC1);
43739 }
43740
43741 SDValue LOps[] = {FalseOp, TrueOp,
43742 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
43743 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
43744 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
43745 Flags};
43746 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43747 return CMOV;
43748 }
43749 }
43750
43751 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
43752 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
43753 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
43754 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
43755 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
43756 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
43757 SDValue Add = TrueOp;
43758 SDValue Const = FalseOp;
43759 // Canonicalize the condition code for easier matching and output.
43760 if (CC == X86::COND_E)
43761 std::swap(Add, Const);
43762
43763 // We might have replaced the constant in the cmov with the LHS of the
43764 // compare. If so change it to the RHS of the compare.
43765 if (Const == Cond.getOperand(0))
43766 Const = Cond.getOperand(1);
43767
43768 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
43769 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
43770 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
43771 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
43772 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
43773 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
43774 EVT VT = N->getValueType(0);
43775 // This should constant fold.
43776 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
43777 SDValue CMov =
43778 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
43779 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
43780 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
43781 }
43782 }
43783
43784 return SDValue();
43785}
43786
43787/// Different mul shrinking modes.
43788enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
43789
43790static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
43791 EVT VT = N->getOperand(0).getValueType();
43792 if (VT.getScalarSizeInBits() != 32)
43793 return false;
43794
43795 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
43796 unsigned SignBits[2] = {1, 1};
43797 bool IsPositive[2] = {false, false};
43798 for (unsigned i = 0; i < 2; i++) {
43799 SDValue Opd = N->getOperand(i);
43800
43801 SignBits[i] = DAG.ComputeNumSignBits(Opd);
43802 IsPositive[i] = DAG.SignBitIsZero(Opd);
43803 }
43804
43805 bool AllPositive = IsPositive[0] && IsPositive[1];
43806 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
43807 // When ranges are from -128 ~ 127, use MULS8 mode.
43808 if (MinSignBits >= 25)
43809 Mode = ShrinkMode::MULS8;
43810 // When ranges are from 0 ~ 255, use MULU8 mode.
43811 else if (AllPositive && MinSignBits >= 24)
43812 Mode = ShrinkMode::MULU8;
43813 // When ranges are from -32768 ~ 32767, use MULS16 mode.
43814 else if (MinSignBits >= 17)
43815 Mode = ShrinkMode::MULS16;
43816 // When ranges are from 0 ~ 65535, use MULU16 mode.
43817 else if (AllPositive && MinSignBits >= 16)
43818 Mode = ShrinkMode::MULU16;
43819 else
43820 return false;
43821 return true;
43822}
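// Informal check of the sign-bit thresholds above (illustrative): a 32-bit
// value known to lie in [-128, 127] repeats its sign bit through bits 31..7,
// so ComputeNumSignBits reports >= 25; a value in [0, 255] has bits 31..8
// known zero, giving 24 sign bits with SignBitIsZero. The 16-bit cases follow
// the same pattern with thresholds 17 and 16.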
43823
43824/// When the operands of vector mul are extended from smaller size values,
43825/// like i8 and i16, the type of mul may be shrunk to generate more
43826/// efficient code. Two typical patterns are handled:
43827/// Pattern1:
43828/// %2 = sext/zext <N x i8> %1 to <N x i32>
43829/// %4 = sext/zext <N x i8> %3 to <N x i32>
43830/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43831/// %5 = mul <N x i32> %2, %4
43832///
43833/// Pattern2:
43834/// %2 = zext/sext <N x i16> %1 to <N x i32>
43835/// %4 = zext/sext <N x i16> %3 to <N x i32>
43836/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43837/// %5 = mul <N x i32> %2, %4
43838///
43839/// There are four mul shrinking modes:
43840/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
43841/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
43842/// generate pmullw+sext32 for it (MULS8 mode).
43843/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
43844/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
43845/// generate pmullw+zext32 for it (MULU8 mode).
43846/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
43847/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
43848/// generate pmullw+pmulhw for it (MULS16 mode).
43849/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
43850/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
43851/// generate pmullw+pmulhuw for it (MULU16 mode).
43852static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
43853 const X86Subtarget &Subtarget) {
43854 // Check for legality
43855 // pmullw/pmulhw require SSE2; they are not available with plain SSE.
43856 if (!Subtarget.hasSSE2())
43857 return SDValue();
43858
43859 // Check for profitability
43860 // pmulld is supported since SSE41. It is better to use pmulld
43861 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
43862 // the expansion.
43863 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
43864 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
43865 return SDValue();
43866
43867 ShrinkMode Mode;
43868 if (!canReduceVMulWidth(N, DAG, Mode))
43869 return SDValue();
43870
43871 SDLoc DL(N);
43872 SDValue N0 = N->getOperand(0);
43873 SDValue N1 = N->getOperand(1);
43874 EVT VT = N->getOperand(0).getValueType();
43875 unsigned NumElts = VT.getVectorNumElements();
43876 if ((NumElts % 2) != 0)
43877 return SDValue();
43878
43879 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
43880
43881 // Shrink the operands of mul.
43882 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
43883 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
43884
43885 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
43886 // lower part is needed.
43887 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
43888 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
43889 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
43890 : ISD::SIGN_EXTEND,
43891 DL, VT, MulLo);
43892
43893 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
43894 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
43895 // the higher part is also needed.
43896 SDValue MulHi =
43897 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
43898 ReducedVT, NewN0, NewN1);
43899
43900 // Repack the lower part and higher part result of mul into a wider
43901 // result.
43902 // Generate shuffle functioning as punpcklwd.
43903 SmallVector<int, 16> ShuffleMask(NumElts);
43904 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43905 ShuffleMask[2 * i] = i;
43906 ShuffleMask[2 * i + 1] = i + NumElts;
43907 }
43908 SDValue ResLo =
43909 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43910 ResLo = DAG.getBitcast(ResVT, ResLo);
43911 // Generate shuffle functioning as punpckhwd.
43912 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43913 ShuffleMask[2 * i] = i + NumElts / 2;
43914 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
43915 }
43916 SDValue ResHi =
43917 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43918 ResHi = DAG.getBitcast(ResVT, ResHi);
43919 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
43920}
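// Rough sketch of the repack above (illustrative): for 16-bit lanes a and b,
// the full 32-bit product is MulLo | (MulHi << 16), and the punpcklwd /
// punpckhwd style shuffles interleave {lo0,hi0,lo1,hi1,...} so that each
// i32 lane of the bitcast holds exactly that lo/hi combination.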
43921
43922static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
43923 EVT VT, const SDLoc &DL) {
43924
43925 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
43926 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43927 DAG.getConstant(Mult, DL, VT));
43928 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
43929 DAG.getConstant(Shift, DL, MVT::i8));
43930 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43931 N->getOperand(0));
43932 return Result;
43933 };
43934
43935 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
43936 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43937 DAG.getConstant(Mul1, DL, VT));
43938 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
43939 DAG.getConstant(Mul2, DL, VT));
43940 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43941 N->getOperand(0));
43942 return Result;
43943 };
43944
43945 switch (MulAmt) {
43946 default:
43947 break;
43948 case 11:
43949 // mul x, 11 => add ((shl (mul x, 5), 1), x)
43950 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
43951 case 21:
43952 // mul x, 21 => add ((shl (mul x, 5), 2), x)
43953 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
43954 case 41:
43955 // mul x, 41 => add ((shl (mul x, 5), 3), x)
43956 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
43957 case 22:
43958 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
43959 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43960 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
43961 case 19:
43962 // mul x, 19 => add ((shl (mul x, 9), 1), x)
43963 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
43964 case 37:
43965 // mul x, 37 => add ((shl (mul x, 9), 2), x)
43966 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
43967 case 73:
43968 // mul x, 73 => add ((shl (mul x, 9), 3), x)
43969 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
43970 case 13:
43971 // mul x, 13 => add ((shl (mul x, 3), 2), x)
43972 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
43973 case 23:
43974 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
43975 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
43976 case 26:
43977 // mul x, 26 => add ((mul (mul x, 5), 5), x)
43978 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
43979 case 28:
43980 // mul x, 28 => add ((mul (mul x, 9), 3), x)
43981 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
43982 case 29:
43983 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
43984 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43985 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
43986 }
43987
43988 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
43989 // by a single LEA.
43990 // First check if this is a sum of two powers of 2 because that's easy. Then
43991 // count how many trailing zeros there are up to the first set bit.
43992 // TODO: We can do this even without LEA at a cost of two shifts and an add.
43993 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
43994 unsigned ScaleShift = countTrailingZeros(MulAmt);
43995 if (ScaleShift >= 1 && ScaleShift < 4) {
43996 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
43997 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43998 DAG.getConstant(ShiftAmt, DL, MVT::i8));
43999 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44000 DAG.getConstant(ScaleShift, DL, MVT::i8));
44001 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
44002 }
44003 }
44004
44005 return SDValue();
44006}
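// Quick arithmetic check of the special cases above (illustrative only):
//   mul x, 11: ((x*5) << 1) + x == 10*x + x == 11*x
//   mul x, 23: ((x*3) << 3) - x == 24*x - x == 23*x
//   MulAmt == 2^K + 2^S with S in [1,3]: (x << K) + (x << S), where the
//   second shift corresponds to an LEA scale of 2/4/8.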
44007
44008// If the upper 17 bits of one operand are zero and the upper 17 bits of the
44009// other operand are all zero/sign bits, then we can use PMADDWD, which is
44010// always at least as quick as PMULLD, except on KNL.
44011static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
44012 const X86Subtarget &Subtarget) {
44013 if (!Subtarget.hasSSE2())
44014 return SDValue();
44015
44016 if (Subtarget.isPMADDWDSlow())
44017 return SDValue();
44018
44019 EVT VT = N->getValueType(0);
44020
44021 // Only support vXi32 vectors.
44022 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
44023 return SDValue();
44024
44025 // Make sure the type is legal or will be widened to a legal type.
44026 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
44027 return SDValue();
44028
44029 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
44030
44031 // Without BWI, we would need to split v32i16.
44032 if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
44033 return SDValue();
44034
44035 SDValue N0 = N->getOperand(0);
44036 SDValue N1 = N->getOperand(1);
44037
44038 // If we are zero extending in two steps without SSE4.1, it's better to reduce
44039 // the vmul width instead.
44040 if (!Subtarget.hasSSE41() &&
44041 (N0.getOpcode() == ISD::ZERO_EXTEND &&
44042 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
44043 (N1.getOpcode() == ISD::ZERO_EXTEND &&
44044 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
44045 return SDValue();
44046
44047 // Sign bits must extend through the upper 17 bits.
44048 if (DAG.ComputeNumSignBits(N1) < 17 || DAG.ComputeNumSignBits(N0) < 17)
44049 return SDValue();
44050
44051 // At least one of the elements must be zero in the upper 17 bits.
44052 APInt Mask17 = APInt::getHighBitsSet(32, 17);
44053 if (!DAG.MaskedValueIsZero(N1, Mask17) && !DAG.MaskedValueIsZero(N0, Mask17))
44054 return SDValue();
44055
44056 // Use SplitOpsAndApply to handle AVX splitting.
44057 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44058 ArrayRef<SDValue> Ops) {
44059 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44060 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
44061 };
44062 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
44063 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
44064 PMADDWDBuilder);
44065}
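// Informal justification for the checks above (illustrative): viewing each
// i32 lane as i16 halves (lo, hi), VPMADDWD computes lo0*lo1 + hi0*hi1 per
// lane. With >= 17 sign bits each input equals sext(lo), and if one operand
// additionally has its upper 17 bits zero then hi0*hi1 == 0, so the VPMADDWD
// lane matches the low 32 bits of the original i32 multiply.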
44066
44067static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
44068 const X86Subtarget &Subtarget) {
44069 if (!Subtarget.hasSSE2())
44070 return SDValue();
44071
44072 EVT VT = N->getValueType(0);
44073
44074 // Only support vXi64 vectors.
44075 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
44076 VT.getVectorNumElements() < 2 ||
44077 !isPowerOf2_32(VT.getVectorNumElements()))
44078 return SDValue();
44079
44080 SDValue N0 = N->getOperand(0);
44081 SDValue N1 = N->getOperand(1);
44082
44083 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
44084 // 32 bits. We can lower with this if the sign bits stretch that far.
44085 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
44086 DAG.ComputeNumSignBits(N1) > 32) {
44087 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44088 ArrayRef<SDValue> Ops) {
44089 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
44090 };
44091 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
44092 PMULDQBuilder, /*CheckBWI*/false);
44093 }
44094
44095 // If the upper bits are zero we can use a single pmuludq.
44096 APInt Mask = APInt::getHighBitsSet(64, 32);
44097 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
44098 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44099 ArrayRef<SDValue> Ops) {
44100 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
44101 };
44102 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
44103 PMULUDQBuilder, /*CheckBWI*/false);
44104 }
44105
44106 return SDValue();
44107}
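// Informal note on the folds above (illustrative): PMULDQ/PMULUDQ multiply
// only the low 32 bits of each i64 lane, so they are exact precisely when
// the upper half is a pure sign extension (> 32 sign bits) or known zero,
// which is what the two checks above establish.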
44108
44109static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
44110 TargetLowering::DAGCombinerInfo &DCI,
44111 const X86Subtarget &Subtarget) {
44112 EVT VT = N->getValueType(0);
44113
44114 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
44115 return V;
44116
44117 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
44118 return V;
44119
44120 if (DCI.isBeforeLegalize() && VT.isVector())
44121 return reduceVMULWidth(N, DAG, Subtarget);
44122
44123 // Optimize a single multiply with constant into two operations in order to
44124 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
44125 if (!MulConstantOptimization)
44126 return SDValue();
44127
44128 // An imul is usually smaller than the alternative sequence.
44129 if (DAG.getMachineFunction().getFunction().hasMinSize())
44130 return SDValue();
44131
44132 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
44133 return SDValue();
44134
44135 if (VT != MVT::i64 && VT != MVT::i32)
44136 return SDValue();
44137
44138 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
44139 if (!C)
44140 return SDValue();
44141 if (isPowerOf2_64(C->getZExtValue()))
44142 return SDValue();
44143
44144 int64_t SignMulAmt = C->getSExtValue();
44145 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
44146 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
44147
44148 SDLoc DL(N);
44149 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
44150 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
44151 DAG.getConstant(AbsMulAmt, DL, VT));
44152 if (SignMulAmt < 0)
44153 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
44154 NewMul);
44155
44156 return NewMul;
44157 }
44158
44159 uint64_t MulAmt1 = 0;
44160 uint64_t MulAmt2 = 0;
44161 if ((AbsMulAmt % 9) == 0) {
44162 MulAmt1 = 9;
44163 MulAmt2 = AbsMulAmt / 9;
44164 } else if ((AbsMulAmt % 5) == 0) {
44165 MulAmt1 = 5;
44166 MulAmt2 = AbsMulAmt / 5;
44167 } else if ((AbsMulAmt % 3) == 0) {
44168 MulAmt1 = 3;
44169 MulAmt2 = AbsMulAmt / 3;
44170 }
44171
44172 SDValue NewMul;
44173 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
44174 if (MulAmt2 &&
44175 (isPowerOf2_64(MulAmt2) ||
44176 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
44177
44178 if (isPowerOf2_64(MulAmt2) &&
44179 !(SignMulAmt >= 0 && N->hasOneUse() &&
44180 N->use_begin()->getOpcode() == ISD::ADD))
44181 // If the second multiplier is pow2, issue it first. We want the multiply by
44182 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
44183 // is an add. Only do this for positive multiply amounts since the
44184 // negate would prevent it from being used as an address mode anyway.
44185 std::swap(MulAmt1, MulAmt2);
44186
44187 if (isPowerOf2_64(MulAmt1))
44188 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44189 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
44190 else
44191 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
44192 DAG.getConstant(MulAmt1, DL, VT));
44193
44194 if (isPowerOf2_64(MulAmt2))
44195 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
44196 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
44197 else
44198 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
44199 DAG.getConstant(MulAmt2, DL, VT));
44200
44201 // Negate the result.
44202 if (SignMulAmt < 0)
44203 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
44204 NewMul);
44205 } else if (!Subtarget.slowLEA())
44206 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
44207
44208 if (!NewMul) {
44209 assert(C->getZExtValue() != 0 &&
44210 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
44211 "Both cases that could cause potential overflows should have "
44212 "already been handled.");
44213 if (isPowerOf2_64(AbsMulAmt - 1)) {
44214 // (mul x, 2^N + 1) => (add (shl x, N), x)
44215 NewMul = DAG.getNode(
44216 ISD::ADD, DL, VT, N->getOperand(0),
44217 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44218 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
44219 MVT::i8)));
44220 // To negate, subtract the number from zero
44221 if (SignMulAmt < 0)
44222 NewMul = DAG.getNode(ISD::SUB, DL, VT,
44223 DAG.getConstant(0, DL, VT), NewMul);
44224 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
44225 // (mul x, 2^N - 1) => (sub (shl x, N), x)
44226 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44227 DAG.getConstant(Log2_64(AbsMulAmt + 1),
44228 DL, MVT::i8));
44229 // To negate, reverse the operands of the subtract.
44230 if (SignMulAmt < 0)
44231 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
44232 else
44233 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
44234 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
44235 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
44236 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44237 DAG.getConstant(Log2_64(AbsMulAmt - 2),
44238 DL, MVT::i8));
44239 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
44240 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
44241 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
44242 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
44243 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44244 DAG.getConstant(Log2_64(AbsMulAmt + 2),
44245 DL, MVT::i8));
44246 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
44247 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
44248 }
44249 }
44250
44251 return NewMul;
44252}
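// Worked example of the decomposition above (illustrative): AbsMulAmt = 45
// factors as 9 * 5, giving two MUL_IMM (LEA-style) ops; AbsMulAmt = 40
// factors as 5 * 8, where the pow2 factor becomes a shift that is normally
// issued first so the remaining multiply by 5 can fold into addressing.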
44253
44254// Try to form a MULHU or MULHS node by looking for
44255// (srl (mul ext, ext), 16)
44256// TODO: This is X86 specific because we want to be able to handle wide types
44257// before type legalization. But we can only do it if the vector will be
44258// legalized via widening/splitting. Type legalization can't handle promotion
44259// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
44260// combiner.
44261static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
44262 const X86Subtarget &Subtarget) {
44263 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
44264 "SRL or SRA node is required here!");
44265 SDLoc DL(N);
44266
44267 // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
44268 // the multiply.
44269 if (!Subtarget.hasSSE41())
44270 return SDValue();
44271
44272 // The operation feeding into the shift must be a multiply.
44273 SDValue ShiftOperand = N->getOperand(0);
44274 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
44275 return SDValue();
44276
44277 // Input type should be at least vXi32.
44278 EVT VT = N->getValueType(0);
44279 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
44280 return SDValue();
44281
44282 // Need a shift by 16.
44283 APInt ShiftAmt;
44284 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
44285 ShiftAmt != 16)
44286 return SDValue();
44287
44288 SDValue LHS = ShiftOperand.getOperand(0);
44289 SDValue RHS = ShiftOperand.getOperand(1);
44290
44291 unsigned ExtOpc = LHS.getOpcode();
44292 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
44293 RHS.getOpcode() != ExtOpc)
44294 return SDValue();
44295
44296 // Peek through the extends.
44297 LHS = LHS.getOperand(0);
44298 RHS = RHS.getOperand(0);
44299
44300 // Ensure the input types match.
44301 EVT MulVT = LHS.getValueType();
44302 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
44303 return SDValue();
44304
44305 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
44306 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
44307
44308 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44309 return DAG.getNode(ExtOpc, DL, VT, Mulh);
44310}
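// Illustrative shape of the fold above, for vXi16 inputs x and y:
//   (srl (mul (zext x), (zext y)), 16) -> (zext (mulhu x, y))
// and the sext/sra variant maps to mulhs in the same way, since the widened
// product's high 16 bits are exactly the MULHU/MULHS result.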
44311
44312static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
44313 SDValue N0 = N->getOperand(0);
44314 SDValue N1 = N->getOperand(1);
44315 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
44316 EVT VT = N0.getValueType();
44317
44318 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
44319 // since the result of setcc_c is all zero's or all ones.
44320 if (VT.isInteger() && !VT.isVector() &&
44321 N1C && N0.getOpcode() == ISD::AND &&
44322 N0.getOperand(1).getOpcode() == ISD::Constant) {
44323 SDValue N00 = N0.getOperand(0);
44324 APInt Mask = N0.getConstantOperandAPInt(1);
44325 Mask <<= N1C->getAPIntValue();
44326 bool MaskOK = false;
44327 // We can handle cases concerning bit-widening nodes containing setcc_c if
44328 // we carefully interrogate the mask to make sure we are semantics
44329 // preserving.
44330 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
44331 // of the underlying setcc_c operation if the setcc_c was zero extended.
44332 // Consider the following example:
44333 // zext(setcc_c) -> i32 0x0000FFFF
44334 // c1 -> i32 0x0000FFFF
44335 // c2 -> i32 0x00000001
44336 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
44337 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
44338 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
44339 MaskOK = true;
44340 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
44341 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
44342 MaskOK = true;
44343 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
44344 N00.getOpcode() == ISD::ANY_EXTEND) &&
44345 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
44346 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
44347 }
44348 if (MaskOK && Mask != 0) {
44349 SDLoc DL(N);
44350 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
44351 }
44352 }
44353
44354 // Hardware support for vector shifts is sparse, which makes us scalarize the
44355 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
44356 // shl.
44357 // (shl V, 1) -> add V,V
44358 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
44359 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
44360 assert(N0.getValueType().isVector() && "Invalid vector shift type");
44361 // We shift all of the values by one. In many cases we do not have
44362 // hardware support for this operation. This is better expressed as an ADD
44363 // of two values.
44364 if (N1SplatC->isOne())
44365 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
44366 }
44367
44368 return SDValue();
44369}
44370
44371static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
44372 const X86Subtarget &Subtarget) {
44373 SDValue N0 = N->getOperand(0);
44374 SDValue N1 = N->getOperand(1);
44375 EVT VT = N0.getValueType();
44376 unsigned Size = VT.getSizeInBits();
44377
44378 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
44379 return V;
44380
44381 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
44382 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
44383 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
44384 // depending on sign of (SarConst - [56,48,32,24,16])
44385
44386 // sexts in X86 are MOVs. The MOVs have the same code size
44387 // as the above SHIFTs (only a SHIFT by 1 has lower code size).
44388 // However, the MOVs have 2 advantages over a SHIFT:
44389 // 1. MOVs can write to a register that differs from the source.
44390 // 2. MOVs accept memory operands.
44391
44392 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
44393 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
44394 N0.getOperand(1).getOpcode() != ISD::Constant)
44395 return SDValue();
44396
44397 SDValue N00 = N0.getOperand(0);
44398 SDValue N01 = N0.getOperand(1);
44399 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
44400 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
44401 EVT CVT = N1.getValueType();
44402
44403 if (SarConst.isNegative())
44404 return SDValue();
44405
44406 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
44407 unsigned ShiftSize = SVT.getSizeInBits();
44408 // Skip types without a corresponding sext/zext and
44409 // ShlConst values that are not one of [56,48,32,24,16].
44410 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
44411 continue;
44412 SDLoc DL(N);
44413 SDValue NN =
44414 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
44415 SarConst = SarConst - (Size - ShiftSize);
44416 if (SarConst == 0)
44417 return NN;
44418 else if (SarConst.isNegative())
44419 return DAG.getNode(ISD::SHL, DL, VT, NN,
44420 DAG.getConstant(-SarConst, DL, CVT));
44421 else
44422 return DAG.getNode(ISD::SRA, DL, VT, NN,
44423 DAG.getConstant(SarConst, DL, CVT));
44424 }
44425 return SDValue();
44426}
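// Worked instance of the fold above (illustrative, i32): with ShlConst = 24
// and SarConst = 27,
//   (sra (shl x, 24), 27) -> (sra (sext_inreg x, i8), 3)
// because 24 == 32 - 8 and 27 - 24 == 3 is non-negative.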
44427
44428static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
44429 TargetLowering::DAGCombinerInfo &DCI,
44430 const X86Subtarget &Subtarget) {
44431 SDValue N0 = N->getOperand(0);
44432 SDValue N1 = N->getOperand(1);
44433 EVT VT = N0.getValueType();
44434
44435 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
44436 return V;
44437
44438 // Only do this on the last DAG combine as it can interfere with other
44439 // combines.
44440 if (!DCI.isAfterLegalizeDAG())
44441 return SDValue();
44442
44443 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
44444 // TODO: This is a generic DAG combine that became an x86-only combine to
44445 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
44446 // and-not ('andn').
44447 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
44448 return SDValue();
44449
44450 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
44451 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
44452 if (!ShiftC || !AndC)
44453 return SDValue();
44454
44455 // If we can shrink the constant mask below 8-bits or 32-bits, then this
44456 // transform should reduce code size. It may also enable secondary transforms
44457 // from improved known-bits analysis or instruction selection.
44458 APInt MaskVal = AndC->getAPIntValue();
44459
44460 // If this can be matched by a zero extend, don't optimize.
44461 if (MaskVal.isMask()) {
44462 unsigned TO = MaskVal.countTrailingOnes();
44463 if (TO >= 8 && isPowerOf2_32(TO))
44464 return SDValue();
44465 }
44466
44467 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
44468 unsigned OldMaskSize = MaskVal.getMinSignedBits();
44469 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
44470 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
44471 (OldMaskSize > 32 && NewMaskSize <= 32)) {
44472 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
44473 SDLoc DL(N);
44474 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
44475 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
44476 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
44477 }
44478 return SDValue();
44479}
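// Illustrative instance of the mask-shrinking fold above (i32 X):
//   srl (and X, 0x7F0), 4 -> and (srl X, 4), 0x7F
// so the AND immediate now fits in 8 bits.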
44480
44481static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
44482 const X86Subtarget &Subtarget) {
44483 unsigned Opcode = N->getOpcode();
44484 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
44485
44486 SDLoc DL(N);
44487 EVT VT = N->getValueType(0);
44488 SDValue N0 = N->getOperand(0);
44489 SDValue N1 = N->getOperand(1);
44490 EVT SrcVT = N0.getValueType();
44491
44492 SDValue BC0 =
44493 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
44494 SDValue BC1 =
44495 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
44496
44497 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
44498 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
44499 // truncation trees that help us avoid lane-crossing shuffles.
44500 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
44501 // TODO: We don't handle vXf64 shuffles yet.
44502 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
44503 BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44504 BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44505 BC0.getOperand(0) == BC1.getOperand(0) &&
44506 BC0.getOperand(0).getValueType().is256BitVector() &&
44507 BC0.getConstantOperandAPInt(1) == 0 &&
44508 BC1.getConstantOperandAPInt(1) ==
44509 BC0.getValueType().getVectorNumElements()) {
44510 SmallVector<SDValue> ShuffleOps;
44511 SmallVector<int> ShuffleMask, ScaledMask;
44512 SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
44513 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
44514 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
44515 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
44516 // shuffle to a v4X64 width - we can probably relax this in the future.
44517 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
44518 ShuffleOps[0].getValueType().is256BitVector() &&
44519 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
44520 SDValue Lo, Hi;
44521 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44522 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
44523 Lo = DAG.getBitcast(SrcVT, Lo);
44524 Hi = DAG.getBitcast(SrcVT, Hi);
44525 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
44526 Res = DAG.getBitcast(ShufVT, Res);
44527 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
44528 return DAG.getBitcast(VT, Res);
44529 }
44530 }
44531 }
44532
44533 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
44534 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
44535 // If either/both ops are a shuffle that can scale to v2x64,
44536 // then see if we can perform this as a v4x32 post shuffle.
44537 SmallVector<SDValue> Ops0, Ops1;
44538 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
44539 bool IsShuf0 =
44540 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
44541 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
44542 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
44543 bool IsShuf1 =
44544 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
44545 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
44546 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
44547 if (IsShuf0 || IsShuf1) {
44548 if (!IsShuf0) {
44549 Ops0.assign({BC0});
44550 ScaledMask0.assign({0, 1});
44551 }
44552 if (!IsShuf1) {
44553 Ops1.assign({BC1});
44554 ScaledMask1.assign({0, 1});
44555 }
44556
44557 SDValue LHS, RHS;
44558 int PostShuffle[4] = {-1, -1, -1, -1};
44559 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
44560 if (M < 0)
44561 return true;
44562 Idx = M % 2;
44563 SDValue Src = Ops[M / 2];
44564 if (!LHS || LHS == Src) {
44565 LHS = Src;
44566 return true;
44567 }
44568 if (!RHS || RHS == Src) {
44569 Idx += 2;
44570 RHS = Src;
44571 return true;
44572 }
44573 return false;
44574 };
44575 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
44576 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
44577 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
44578 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
44579 LHS = DAG.getBitcast(SrcVT, LHS);
44580 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
44581 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44582 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
44583 Res = DAG.getBitcast(ShufVT, Res);
44584 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
44585 return DAG.getBitcast(VT, Res);
44586 }
44587 }
44588 }
44589
44590 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
44591 if (VT.is256BitVector() && Subtarget.hasInt256()) {
44592 SmallVector<int> Mask0, Mask1;
44593 SmallVector<SDValue> Ops0, Ops1;
44594 SmallVector<int, 2> ScaledMask0, ScaledMask1;
44595 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
44596 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
44597 !Ops0.empty() && !Ops1.empty() &&
44598 all_of(Ops0,
44599 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
44600 all_of(Ops1,
44601 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
44602 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
44603 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
44604 SDValue Op00 = peekThroughBitcasts(Ops0.front());
44605 SDValue Op10 = peekThroughBitcasts(Ops1.front());
44606 SDValue Op01 = peekThroughBitcasts(Ops0.back());
44607 SDValue Op11 = peekThroughBitcasts(Ops1.back());
44608 if ((Op00 == Op11) && (Op01 == Op10)) {
44609 std::swap(Op10, Op11);
44610 ShuffleVectorSDNode::commuteMask(ScaledMask1);
44611 }
44612 if ((Op00 == Op10) && (Op01 == Op11)) {
44613 const int Map[4] = {0, 2, 1, 3};
44614 SmallVector<int, 4> ShuffleMask(
44615 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
44616 Map[ScaledMask1[1]]});
44617 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
44618 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
44619 DAG.getBitcast(SrcVT, Op01));
44620 Res = DAG.getBitcast(ShufVT, Res);
44621 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
44622 return DAG.getBitcast(VT, Res);
44623 }
44624 }
44625 }
44626
44627 return SDValue();
44628}
44629
44630static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
44631 TargetLowering::DAGCombinerInfo &DCI,
44632 const X86Subtarget &Subtarget) {
44633 unsigned Opcode = N->getOpcode();
44634 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
44635 "Unexpected pack opcode");
44636
44637 EVT VT = N->getValueType(0);
44638 SDValue N0 = N->getOperand(0);
44639 SDValue N1 = N->getOperand(1);
44640 unsigned NumDstElts = VT.getVectorNumElements();
44641 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
44642 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
44643 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
44644 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
44645 "Unexpected PACKSS/PACKUS input type");
44646
44647 bool IsSigned = (X86ISD::PACKSS == Opcode);
44648
44649 // Constant Folding.
44650 APInt UndefElts0, UndefElts1;
44651 SmallVector<APInt, 32> EltBits0, EltBits1;
44652 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
44653 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
44654 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
44655 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
44656 unsigned NumLanes = VT.getSizeInBits() / 128;
44657 unsigned NumSrcElts = NumDstElts / 2;
44658 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
44659 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
44660
44661 APInt Undefs(NumDstElts, 0);
44662 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
44663 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
44664 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
44665 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
44666 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
44667 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
44668
44669 if (UndefElts[SrcIdx]) {
44670 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
44671 continue;
44672 }
44673
44674 APInt &Val = EltBits[SrcIdx];
44675 if (IsSigned) {
44676 // PACKSS: Truncate signed value with signed saturation.
44677 // Source values less than dst minint are saturated to minint.
44678 // Source values greater than dst maxint are saturated to maxint.
44679 if (Val.isSignedIntN(DstBitsPerElt))
44680 Val = Val.trunc(DstBitsPerElt);
44681 else if (Val.isNegative())
44682 Val = APInt::getSignedMinValue(DstBitsPerElt);
44683 else
44684 Val = APInt::getSignedMaxValue(DstBitsPerElt);
44685 } else {
44686 // PACKUS: Truncate signed value with unsigned saturation.
44687 // Source values less than zero are saturated to zero.
44688 // Source values greater than dst maxuint are saturated to maxuint.
44689 if (Val.isIntN(DstBitsPerElt))
44690 Val = Val.trunc(DstBitsPerElt);
44691 else if (Val.isNegative())
44692 Val = APInt::getNullValue(DstBitsPerElt);
44693 else
44694 Val = APInt::getAllOnesValue(DstBitsPerElt);
44695 }
44696 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
44697 }
44698 }
44699
44700 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
44701 }
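// Example of the saturation rules above (illustrative, i16 -> i8):
// PACKSS maps 300 -> 127 and -200 -> -128, while PACKUS maps 300 -> 255
// and -200 -> 0.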
44702
44703 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
44704 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44705 return V;
44706
44707 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
44708 // truncate to create a larger truncate.
44709 if (Subtarget.hasAVX512() &&
44710 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
44711 N0.getOperand(0).getValueType() == MVT::v8i32) {
44712 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
44713 (!IsSigned &&
44714 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
44715 if (Subtarget.hasVLX())
44716 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
44717
44718 // Widen input to v16i32 so we can truncate that.
44719 SDLoc dl(N);
44720 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
44721 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
44722 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
44723 }
44724 }
44725
44726 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
44727 if (VT.is128BitVector()) {
44728 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44729 SDValue Src0, Src1;
44730 if (N0.getOpcode() == ExtOpc &&
44731 N0.getOperand(0).getValueType().is64BitVector() &&
44732 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44733 Src0 = N0.getOperand(0);
44734 }
44735 if (N1.getOpcode() == ExtOpc &&
44736 N1.getOperand(0).getValueType().is64BitVector() &&
44737 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44738 Src1 = N1.getOperand(0);
44739 }
44740 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
44741 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
44742 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
44743 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
44744 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
44745 }
44746 }
44747
44748 // Attempt to combine as shuffle.
44749 SDValue Op(N, 0);
44750 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44751 return Res;
44752
44753 return SDValue();
44754}
44755
44756static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
44757 TargetLowering::DAGCombinerInfo &DCI,
44758 const X86Subtarget &Subtarget) {
44759 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
44760 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
44761 "Unexpected horizontal add/sub opcode");
44762
44763 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
44764 // For slow-hop targets, if we have a hop with a single op, see if there is
44765 // already another user whose result we can reuse and shuffle.
44766 MVT VT = N->getSimpleValueType(0);
44767 SDValue LHS = N->getOperand(0);
44768 SDValue RHS = N->getOperand(1);
44769 if (VT.is128BitVector() && LHS == RHS) {
44770 for (SDNode *User : LHS->uses()) {
44771 if (User != N && User->getOpcode() == N->getOpcode()) {
44772 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44773 if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
44774 return DAG.getBitcast(
44775 VT,
44776 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44777 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44778 DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
44779 }
44780 if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
44781 return DAG.getBitcast(
44782 VT,
44783 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44784 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44785 DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
44786 }
44787 }
44788 }
44789 }
44790
44791 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
44792 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
44793 LHS.getOpcode() == RHS.getOpcode() &&
44794 LHS.getValueType() == RHS.getValueType()) {
44795 SDValue LHS0 = LHS.getOperand(0);
44796 SDValue RHS0 = LHS.getOperand(1);
44797 SDValue LHS1 = RHS.getOperand(0);
44798 SDValue RHS1 = RHS.getOperand(1);
44799 if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
44800 (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
44801 SDLoc DL(N);
44802 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
44803 LHS0.isUndef() ? RHS0 : LHS0,
44804 LHS1.isUndef() ? RHS1 : LHS1);
44805 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
44806 Res = DAG.getBitcast(ShufVT, Res);
44807 SDValue NewLHS =
44808 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44809 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
44810 SDValue NewRHS =
44811 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44812 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
44813 DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
44814 DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
44815 return SDValue(N, 0);
44816 }
44817 }
44818 }
44819
44820 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
44821 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44822 return V;
44823
44824 return SDValue();
44825}
44826
44827static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
44828 TargetLowering::DAGCombinerInfo &DCI,
44829 const X86Subtarget &Subtarget) {
44830 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
44831 X86ISD::VSRL == N->getOpcode()) &&
44832 "Unexpected shift opcode");
44833 EVT VT = N->getValueType(0);
44834 SDValue N0 = N->getOperand(0);
44835 SDValue N1 = N->getOperand(1);
44836
44837 // Shift zero -> zero.
44838 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44839 return DAG.getConstant(0, SDLoc(N), VT);
44840
44841 // Detect constant shift amounts.
44842 APInt UndefElts;
44843 SmallVector<APInt, 32> EltBits;
44844 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
44845 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
44846 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
44847 EltBits[0].getZExtValue(), DAG);
44848 }
44849
44850 APInt KnownUndef, KnownZero;
44851 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44852 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
44853 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
44854 KnownZero, DCI))
44855 return SDValue(N, 0);
44856
44857 return SDValue();
44858}
44859
44860static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
44861 TargetLowering::DAGCombinerInfo &DCI,
44862 const X86Subtarget &Subtarget) {
44863 unsigned Opcode = N->getOpcode();
44864 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
44865 X86ISD::VSRLI == Opcode) &&
44866 "Unexpected shift opcode");
44867 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
44868 EVT VT = N->getValueType(0);
44869 SDValue N0 = N->getOperand(0);
44870 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44871 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
44872 "Unexpected value type");
44873 assert(N->getOperand(1).getValueType() == MVT::i8 &&
44874 "Unexpected shift amount type");
44875
44876 // (shift undef, X) -> 0
44877 if (N0.isUndef())
44878 return DAG.getConstant(0, SDLoc(N), VT);
44879
44880 // Out of range logical bit shifts are guaranteed to be zero.
44881 // Out of range arithmetic bit shifts splat the sign bit.
44882 unsigned ShiftVal = N->getConstantOperandVal(1);
44883 if (ShiftVal >= NumBitsPerElt) {
44884 if (LogicalShift)
44885 return DAG.getConstant(0, SDLoc(N), VT);
44886 ShiftVal = NumBitsPerElt - 1;
44887 }
44888
44889 // (shift X, 0) -> X
44890 if (!ShiftVal)
44891 return N0;
44892
44893 // (shift 0, C) -> 0
44894 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44895 // N0 is all zeros or undef. We guarantee that the bits shifted into the
44896 // result are all zeros, not undef.
44897 return DAG.getConstant(0, SDLoc(N), VT);
44898
44899 // (VSRAI -1, C) -> -1
44900 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
44901 // N0 is all ones or undef. We guarantee that the bits shifted into the
44902 // result are all ones, not undef.
44903 return DAG.getConstant(-1, SDLoc(N), VT);
44904
44905 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
44906 if (Opcode == N0.getOpcode()) {
44907 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
44908 unsigned NewShiftVal = ShiftVal + ShiftVal2;
44909 if (NewShiftVal >= NumBitsPerElt) {
44910 // Out of range logical bit shifts are guaranteed to be zero.
44911 // Out of range arithmetic bit shifts splat the sign bit.
44912 if (LogicalShift)
44913 return DAG.getConstant(0, SDLoc(N), VT);
44914 NewShiftVal = NumBitsPerElt - 1;
44915 }
44916 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
44917 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
44918 }
44919
44920 // We can decode 'whole byte' logical bit shifts as shuffles.
44921 if (LogicalShift && (ShiftVal % 8) == 0) {
44922 SDValue Op(N, 0);
44923 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44924 return Res;
44925 }
44926
44927 // Constant Folding.
44928 APInt UndefElts;
44929 SmallVector<APInt, 32> EltBits;
44930 if (N->isOnlyUserOf(N0.getNode()) &&
44931 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
44932 assert(EltBits.size() == VT.getVectorNumElements() &&
44933 "Unexpected shift value type");
44934 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
44935 // created an undef input due to no input bits being demanded, but user
44936 // still expects 0 in other bits.
44937 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
44938 APInt &Elt = EltBits[i];
44939 if (UndefElts[i])
44940 Elt = 0;
44941 else if (X86ISD::VSHLI == Opcode)
44942 Elt <<= ShiftVal;
44943 else if (X86ISD::VSRAI == Opcode)
44944 Elt.ashrInPlace(ShiftVal);
44945 else
44946 Elt.lshrInPlace(ShiftVal);
44947 }
44948 // Reset undef elements since they were zeroed above.
44949 UndefElts = 0;
44950 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
44951 }
44952
44953 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44954 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44955 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44956 return SDValue(N, 0);
44957
44958 return SDValue();
44959}
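// Illustrative instance of the shift-merging rule above (v8i16):
//   (VSRLI (VSRLI X, 3), 2) -> (VSRLI X, 5)
// and if the combined amount reaches 16 the logical forms fold to zero while
// the arithmetic form clamps the amount to 15.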
44960
44961static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
44962 TargetLowering::DAGCombinerInfo &DCI,
44963 const X86Subtarget &Subtarget) {
44964 EVT VT = N->getValueType(0);
44965 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
44966 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
44967 N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
44968 "Unexpected vector insertion");
44969
44970 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
44971 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44973 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44974 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44975 return SDValue(N, 0);
44976 }
44977
44978 // Attempt to combine insertion patterns to a shuffle.
44979 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
44980 SDValue Op(N, 0);
44981 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44982 return Res;
44983 }
44984
44985 return SDValue();
44986}
44987
44988/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
44989/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
44990/// OR -> CMPNEQSS.
44991static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
44992 TargetLowering::DAGCombinerInfo &DCI,
44993 const X86Subtarget &Subtarget) {
44994 unsigned opcode;
44995
44996 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
44997 // we're requiring SSE2 for both.
44998 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
44999 SDValue N0 = N->getOperand(0);
45000 SDValue N1 = N->getOperand(1);
45001 SDValue CMP0 = N0.getOperand(1);
45002 SDValue CMP1 = N1.getOperand(1);
45003 SDLoc DL(N);
45004
45005 // The SETCCs should both refer to the same CMP.
45006 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
45007 return SDValue();
45008
45009 SDValue CMP00 = CMP0->getOperand(0);
45010 SDValue CMP01 = CMP0->getOperand(1);
45011 EVT VT = CMP00.getValueType();
45012
45013 if (VT == MVT::f32 || VT == MVT::f64 ||
45014 (VT == MVT::f16 && Subtarget.hasFP16())) {
45015 bool ExpectingFlags = false;
45016 // Check for any users that want flags:
45017 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
45018 !ExpectingFlags && UI != UE; ++UI)
45019 switch (UI->getOpcode()) {
45020 default:
45021 case ISD::BR_CC:
45022 case ISD::BRCOND:
45023 case ISD::SELECT:
45024 ExpectingFlags = true;
45025 break;
45026 case ISD::CopyToReg:
45027 case ISD::SIGN_EXTEND:
45028 case ISD::ZERO_EXTEND:
45029 case ISD::ANY_EXTEND:
45030 break;
45031 }
45032
45033 if (!ExpectingFlags) {
45034 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
45035 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
45036
45037 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
45038 X86::CondCode tmp = cc0;
45039 cc0 = cc1;
45040 cc1 = tmp;
45041 }
45042
45043 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
45044 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
45045 // FIXME: need symbolic constants for these magic numbers.
45046 // See X86ATTInstPrinter.cpp:printSSECC().
45047 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
45048 if (Subtarget.hasAVX512()) {
45049 SDValue FSetCC =
45050 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
45051 DAG.getTargetConstant(x86cc, DL, MVT::i8));
45052 // Need to fill with zeros to ensure the bitcast will produce zeroes
45053 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
45054 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
45055 DAG.getConstant(0, DL, MVT::v16i1),
45056 FSetCC, DAG.getIntPtrConstant(0, DL));
45057 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
45058 N->getSimpleValueType(0));
45059 }
45060 SDValue OnesOrZeroesF =
45061 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
45062 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
45063
45064 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
45065 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
45066
45067 if (is64BitFP && !Subtarget.is64Bit()) {
45068 // On a 32-bit target, we cannot bitcast the 64-bit float to a
45069 // 64-bit integer, since that's not a legal type. Since
45070 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
45071 // bits, but can do this little dance to extract the lowest 32 bits
45072 // and work with those going forward.
45073 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
45074 OnesOrZeroesF);
45075 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
45076 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
45077 Vector32, DAG.getIntPtrConstant(0, DL));
45078 IntVT = MVT::i32;
45079 }
45080
45081 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
45082 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
45083 DAG.getConstant(1, DL, IntVT));
45084 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
45085 ANDed);
45086 return OneBitOfTruth;
45087 }
45088 }
45089 }
45090 }
45091 return SDValue();
45092}
45093
45094/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
45095static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
45096 assert(N->getOpcode() == ISD::AND);
45097
45098 MVT VT = N->getSimpleValueType(0);
45099 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
45100 return SDValue();
45101
45102 SDValue X, Y;
45103 SDValue N0 = N->getOperand(0);
45104 SDValue N1 = N->getOperand(1);
45105
45106 auto GetNot = [&VT, &DAG](SDValue V) {
45107 // Basic X = NOT(Y) detection.
45108 if (SDValue Not = IsNOT(V, DAG))
45109 return Not;
45110 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
45111 if (V.getOpcode() == X86ISD::VBROADCAST) {
45112 SDValue Src = V.getOperand(0);
45113 EVT SrcVT = Src.getValueType();
45114 if (!SrcVT.isVector())
45115 return SDValue();
45116 if (SDValue Not = IsNOT(Src, DAG))
45117 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
45118 DAG.getBitcast(SrcVT, Not));
45119 }
45120 return SDValue();
45121 };
45122
45123 if (SDValue Not = GetNot(N0)) {
45124 X = Not;
45125 Y = N1;
45126 } else if (SDValue Not = GetNot(N1)) {
45127 X = Not;
45128 Y = N0;
45129 } else
45130 return SDValue();
45131
45132 X = DAG.getBitcast(VT, X);
45133 Y = DAG.getBitcast(VT, Y);
45134 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
45135}
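// Illustrative scalar model of the identity behind the combine above (the
// helper name is ours, not part of this lowering code): ANDNP computes
// (~X) & Y, so an AND whose first operand is a NOT collapses to a single
// ANDNP node.
static inline unsigned long long AndNotModel(unsigned long long X,
                                             unsigned long long Y) {
  // Same value as and(xor(X, -1), Y), which is the pattern being folded.
  return ~X & Y;
}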
45136
45137// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
45138// logical operations, like in the example below.
45139// or (and (truncate x, truncate y)),
45140// (xor (truncate z, build_vector (constants)))
45141// Given a target type \p VT, we generate
45142// or (and x, y), (xor z, zext(build_vector (constants)))
45143 // given that x, y and z are of type \p VT. We can do so if each operand is
45144 // either a truncate from VT, a vector of constants (second operand only), or
45145 // can be recursively promoted.
45146static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
45147 unsigned Depth) {
45148 // Limit recursion to avoid excessive compile times.
45149 if (Depth >= SelectionDAG::MaxRecursionDepth)
45150 return SDValue();
45151
45152 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
45153 N->getOpcode() != ISD::OR)
45154 return SDValue();
45155
45156 SDValue N0 = N->getOperand(0);
45157 SDValue N1 = N->getOperand(1);
45158 SDLoc DL(N);
45159
45160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45161 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
45162 return SDValue();
45163
45164 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
45165 N0 = NN0;
45166 else {
45167 // The Left side has to be a trunc.
45168 if (N0.getOpcode() != ISD::TRUNCATE)
45169 return SDValue();
45170
45171 // The type of the truncated inputs.
45172 if (N0.getOperand(0).getValueType() != VT)
45173 return SDValue();
45174
45175 N0 = N0.getOperand(0);
45176 }
45177
45178 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
45179 N1 = NN1;
45180 else {
45181 // The right side has to be a 'trunc' or a constant vector.
45182 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
45183 N1.getOperand(0).getValueType() == VT;
45184 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
45185 return SDValue();
45186
45187 if (RHSTrunc)
45188 N1 = N1.getOperand(0);
45189 else
45190 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
45191 }
45192
45193 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
45194}
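// Why the promotion above is sound, shown on scalars (helper name is ours):
// bitwise AND/OR/XOR commute with truncation, so performing the logic in the
// wide type \p VT and narrowing afterwards yields the same low bits as
// narrowing each operand first.
static inline unsigned short WideThenTruncModel(unsigned X, unsigned Y) {
  // Equals (unsigned short)X & (unsigned short)Y for every X and Y.
  return (unsigned short)(X & Y);
}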
45195
45196// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
45197// register. In most cases we actually compare or select YMM-sized registers
45198// and mixing the two types creates horrible code. This method optimizes
45199// some of the transition sequences.
45200// Even with AVX-512 this is still useful for removing casts around logical
45201// operations on vXi1 mask types.
45202static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45203 const X86Subtarget &Subtarget) {
45204 EVT VT = N->getValueType(0);
45205 assert(VT.isVector() && "Expected vector type");
45206
45207 SDLoc DL(N);
45208 assert((N->getOpcode() == ISD::ANY_EXTEND ||
45209 N->getOpcode() == ISD::ZERO_EXTEND ||
45210 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
45211
45212 SDValue Narrow = N->getOperand(0);
45213 EVT NarrowVT = Narrow.getValueType();
45214
45215 // Generate the wide operation.
45216 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
45217 if (!Op)
45218 return SDValue();
45219 switch (N->getOpcode()) {
45220 default: llvm_unreachable("Unexpected opcode");
45221 case ISD::ANY_EXTEND:
45222 return Op;
45223 case ISD::ZERO_EXTEND:
45224 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
45225 case ISD::SIGN_EXTEND:
45226 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
45227 Op, DAG.getValueType(NarrowVT));
45228 }
45229}
45230
45231static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
45232 unsigned FPOpcode;
45233 switch (Opcode) {
45234 default: llvm_unreachable("Unexpected input node for FP logic conversion");
45235 case ISD::AND: FPOpcode = X86ISD::FAND; break;
45236 case ISD::OR: FPOpcode = X86ISD::FOR; break;
45237 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45238 }
45239 return FPOpcode;
45240}
45241
45242/// If both input operands of a logic op are being cast from floating point
45243/// types, try to convert this into a floating point logic node to avoid
45244/// unnecessary moves from SSE to integer registers.
45245static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
45246 const X86Subtarget &Subtarget) {
45247 EVT VT = N->getValueType(0);
45248 SDValue N0 = N->getOperand(0);
45249 SDValue N1 = N->getOperand(1);
45250 SDLoc DL(N);
45251
45252 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
45253 return SDValue();
45254
45255 SDValue N00 = N0.getOperand(0);
45256 SDValue N10 = N1.getOperand(0);
45257 EVT N00Type = N00.getValueType();
45258 EVT N10Type = N10.getValueType();
45259
45260 // Ensure that both types are the same and are legal scalar fp types.
45261 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
45262 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
45263 (Subtarget.hasFP16() && N00Type == MVT::f16)))
45264 return SDValue();
45265
45266 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
45267 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
45268 return DAG.getBitcast(VT, FPLogic);
45269}
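// Scalar sketch of the pattern this combine removes (helper name is ours):
// both logic-op inputs are bitcasts from the same scalar FP type, so without
// the combine the float bits are moved to an integer register, combined
// there, and moved back. The combine keeps the operation in the FP/SSE
// domain (e.g. X86ISD::FAND) instead of doing this round trip.
static inline float AndFloatBitsViaIntModel(float A, float B) {
  unsigned AI, BI, RI;
  __builtin_memcpy(&AI, &A, sizeof(float)); // bitcast f32 -> i32
  __builtin_memcpy(&BI, &B, sizeof(float)); // bitcast f32 -> i32
  RI = AI & BI;                             // integer-domain AND
  float R;
  __builtin_memcpy(&R, &RI, sizeof(float)); // bitcast i32 -> f32
  return R;
}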
45270
45271// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
45272// to reduce XMM->GPR traffic.
45273static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
45274 unsigned Opc = N->getOpcode();
45275 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
45276 "Unexpected bit opcode");
45277
45278 SDValue N0 = N->getOperand(0);
45279 SDValue N1 = N->getOperand(1);
45280
45281 // Both operands must be single use MOVMSK.
45282 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
45283 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
45284 return SDValue();
45285
45286 SDValue Vec0 = N0.getOperand(0);
45287 SDValue Vec1 = N1.getOperand(0);
45288 EVT VecVT0 = Vec0.getValueType();
45289 EVT VecVT1 = Vec1.getValueType();
45290
45291 // Both MOVMSK operands must be from vectors of the same size and same element
45292 // size, but it's OK for an fp/int diff.
45293 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
45294 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
45295 return SDValue();
45296
45297 SDLoc DL(N);
45298 unsigned VecOpc =
45299 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
45300 SDValue Result =
45301 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
45302 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45303}
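// Scalar model of the MOVMSK identity used above (helper name is ours):
// MOVMSK packs one sign bit per element, so combining two packed masks with
// a bit op equals packing the mask of the element-wise combined vectors.
static inline unsigned MoveMaskModel(const int *Elts, unsigned NumElts) {
  unsigned Mask = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    Mask |= ((unsigned)Elts[I] >> 31) << I; // take the sign bit of lane I
  return Mask;
}
// For element-wise combined inputs: MoveMaskModel of (A & B) equals
// MoveMaskModel(A) & MoveMaskModel(B), and likewise for OR and XOR.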
45304
45305 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
45306 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
45307 /// with a shift-right to eliminate loading the vector constant mask value.
45308static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
45309 const X86Subtarget &Subtarget) {
45310 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
45311 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
45312 EVT VT0 = Op0.getValueType();
45313 EVT VT1 = Op1.getValueType();
45314
45315 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
45316 return SDValue();
45317
45318 APInt SplatVal;
45319 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
45320 !SplatVal.isMask())
45321 return SDValue();
45322
45323 // Don't prevent creation of ANDN.
45324 if (isBitwiseNot(Op0))
45325 return SDValue();
45326
45327 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
45328 return SDValue();
45329
45330 unsigned EltBitWidth = VT0.getScalarSizeInBits();
45331 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
45332 return SDValue();
45333
45334 SDLoc DL(N);
45335 unsigned ShiftVal = SplatVal.countTrailingOnes();
45336 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
45337 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
45338 return DAG.getBitcast(N->getValueType(0), Shift);
45339}
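// Scalar model of the mask-to-shift rewrite above (helper name is ours):
// when every bit of X is a copy of its sign bit (all-zeros or all-ones),
// masking the K low bits equals a logical shift right by BitWidth - K, so
// the constant mask vector never needs to be materialized. Assumes
// 0 < K < 32 so the shift stays in range.
static inline unsigned MaskLowBitsModel(unsigned X, unsigned K) {
  // Precondition: X == 0 or X == ~0u.
  return X >> (32 - K); // equals X & ((1u << K) - 1) under the precondition
}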
45340
45341// Get the index node from the lowered DAG of a GEP IR instruction with one
45342// indexing dimension.
45343static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
45344 if (Ld->isIndexed())
45345 return SDValue();
45346
45347 SDValue Base = Ld->getBasePtr();
45348
45349 if (Base.getOpcode() != ISD::ADD)
45350 return SDValue();
45351
45352 SDValue ShiftedIndex = Base.getOperand(0);
45353
45354 if (ShiftedIndex.getOpcode() != ISD::SHL)
45355 return SDValue();
45356
45357 return ShiftedIndex.getOperand(0);
45358
45359}
45360
45361static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
45362 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
45363 switch (VT.getSizeInBits()) {
45364 default: return false;
45365 case 64: return Subtarget.is64Bit();
45366 case 32: return true;
45367 }
45368 }
45369 return false;
45370}
45371
45372 // This function recognizes cases where the X86 bzhi instruction can replace an
45373 // 'and-load' sequence.
45374 // When an integer value is loaded from an array of constants defined
45375 // as follows:
45376 //
45377 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
45378 //
45379 // and a bitwise AND is then applied to the loaded value and another input,
45380 // the sequence is equivalent to performing bzhi (zero high bits) on that
45381 // input, with the same index as the load.
45382static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
45383 const X86Subtarget &Subtarget) {
45384 MVT VT = Node->getSimpleValueType(0);
45385 SDLoc dl(Node);
45386
45387 // Check if subtarget has BZHI instruction for the node's type
45388 if (!hasBZHI(Subtarget, VT))
45389 return SDValue();
45390
45391 // Try matching the pattern for both operands.
45392 for (unsigned i = 0; i < 2; i++) {
45393 SDValue N = Node->getOperand(i);
45394 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
45395
45396 // continue if the operand is not a load instruction
45397 if (!Ld)
45398 return SDValue();
45399
45400 const Value *MemOp = Ld->getMemOperand()->getValue();
45401
45402 if (!MemOp)
45403 return SDValue();
45404
45405 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
45406 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
45407 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
45408
45409 Constant *Init = GV->getInitializer();
45410 Type *Ty = Init->getType();
45411 if (!isa<ConstantDataArray>(Init) ||
45412 !Ty->getArrayElementType()->isIntegerTy() ||
45413 Ty->getArrayElementType()->getScalarSizeInBits() !=
45414 VT.getSizeInBits() ||
45415 Ty->getArrayNumElements() >
45416 Ty->getArrayElementType()->getScalarSizeInBits())
45417 continue;
45418
45419 // Check if the array's constant elements are suitable to our case.
45420 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
45421 bool ConstantsMatch = true;
45422 for (uint64_t j = 0; j < ArrayElementCount; j++) {
45423 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
45424 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
45425 ConstantsMatch = false;
45426 break;
45427 }
45428 }
45429 if (!ConstantsMatch)
45430 continue;
45431
45432 // Do the transformation (For 32-bit type):
45433 // -> (and (load arr[idx]), inp)
45434 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
45435 // that will be replaced with one bzhi instruction.
45436 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
45437 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
45438
45439 // Get the Node which indexes into the array.
45440 SDValue Index = getIndexFromUnindexedLoad(Ld);
45441 if (!Index)
45442 return SDValue();
45443 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
45444
45445 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
45446 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
45447
45448 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
45449 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
45450
45451 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
45452 }
45453 }
45454 }
45455 }
45456 return SDValue();
45457}
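// Scalar sketch of the BZHI equivalence used above (helper name is ours):
// for a table whose i-th entry is the mask of the i low bits, AND-ing with
// table[Idx] zeroes all bits of Input from bit Idx upwards, which is exactly
// what BZHI computes. Assumes 0 < Idx < 32 so the shift stays in range.
static inline unsigned AndWithLowMaskTableModel(unsigned Input, unsigned Idx) {
  // table[i] == (1u << i) - 1, i.e. 0x0, 0x1, 0x3, 0x7, 0xF, ...
  unsigned Mask = 0xFFFFFFFFu >> (32 - Idx); // same value as table[Idx]
  return Input & Mask;                       // same result as bzhi(Input, Idx)
}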
45458
45459// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
45460// Where C is a mask containing the same number of bits as the setcc and
45461// where the setcc will freely 0 upper bits of k-register. We can replace the
45462// undef in the concat with 0s and remove the AND. This mainly helps with
45463 // v2i1/v4i1 setcc being cast to scalar.
45464static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
45465 const X86Subtarget &Subtarget) {
45466 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
45467
45468 EVT VT = N->getValueType(0);
45469
45470 // Make sure this is an AND with constant. We will check the value of the
45471 // constant later.
45472 if (!isa<ConstantSDNode>(N->getOperand(1)))
45473 return SDValue();
45474
45475 // This is implied by the ConstantSDNode.
45476 assert(!VT.isVector() && "Expected scalar VT!");
45477
45478 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
45479 !N->getOperand(0).hasOneUse() ||
45480 !N->getOperand(0).getOperand(0).hasOneUse())
45481 return SDValue();
45482
45483 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45484 SDValue Src = N->getOperand(0).getOperand(0);
45485 EVT SrcVT = Src.getValueType();
45486 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
45487 !TLI.isTypeLegal(SrcVT))
45488 return SDValue();
45489
45490 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
45491 return SDValue();
45492
45493 // We only care about the first subvector of the concat, we expect the
45494 // other subvectors to be ignored due to the AND if we make the change.
45495 SDValue SubVec = Src.getOperand(0);
45496 EVT SubVecVT = SubVec.getValueType();
45497
45498 // First subvector should be a setcc with a legal result type. The RHS of the
45499 // AND should be a mask with this many bits.
45500 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
45501 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
45502 return SDValue();
45503
45504 EVT SetccVT = SubVec.getOperand(0).getValueType();
45505 if (!TLI.isTypeLegal(SetccVT) ||
45506 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
45507 return SDValue();
45508
45509 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
45510 return SDValue();
45511
45512 // We passed all the checks. Rebuild the concat_vectors with zeroes
45513 // and cast it back to VT.
45514 SDLoc dl(N);
45515 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
45516 DAG.getConstant(0, dl, SubVecVT));
45517 Ops[0] = SubVec;
45518 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
45519 Ops);
45520 return DAG.getBitcast(VT, Concat);
45521}
45522
45523static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
45524 TargetLowering::DAGCombinerInfo &DCI,
45525 const X86Subtarget &Subtarget) {
45526 EVT VT = N->getValueType(0);
45527
45528 // If this is SSE1 only convert to FAND to avoid scalarization.
45529 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45530 return DAG.getBitcast(
45531 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
45532 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
45533 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
45534 }
45535
45536 // Use a 32-bit and+zext if upper bits known zero.
45537 if (VT == MVT::i64 && Subtarget.is64Bit() &&
45538 !isa<ConstantSDNode>(N->getOperand(1))) {
45539 APInt HiMask = APInt::getHighBitsSet(64, 32);
45540 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
45541 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
45542 SDLoc dl(N);
45543 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
45544 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
45545 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
45546 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
45547 }
45548 }
45549
45550 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
45551 // TODO: Support multiple SrcOps.
45552 if (VT == MVT::i1) {
45553 SmallVector<SDValue, 2> SrcOps;
45554 SmallVector<APInt, 2> SrcPartials;
45555 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
45556 SrcOps.size() == 1) {
45557 SDLoc dl(N);
45558 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45559 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45560 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45561 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45562 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45563 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45564 if (Mask) {
45565 assert(SrcPartials[0].getBitWidth() == NumElts &&
45566 "Unexpected partial reduction mask");
45567 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45568 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45569 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
45570 }
45571 }
45572 }
45573
45574 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
45575 return V;
45576
45577 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45578 return R;
45579
45580 if (DCI.isBeforeLegalizeOps())
45581 return SDValue();
45582
45583 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45584 return R;
45585
45586 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45587 return FPLogic;
45588
45589 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
45590 return R;
45591
45592 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
45593 return ShiftRight;
45594
45595 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
45596 return R;
45597
45598 // Attempt to recursively combine a bitmask AND with shuffles.
45599 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45600 SDValue Op(N, 0);
45601 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45602 return Res;
45603 }
45604
45605 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
45606 if ((VT.getScalarSizeInBits() % 8) == 0 &&
45607 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45608 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
45609 SDValue BitMask = N->getOperand(1);
45610 SDValue SrcVec = N->getOperand(0).getOperand(0);
45611 EVT SrcVecVT = SrcVec.getValueType();
45612
45613 // Check that the constant bitmask masks whole bytes.
45614 APInt UndefElts;
45615 SmallVector<APInt, 64> EltBits;
45616 if (VT == SrcVecVT.getScalarType() &&
45617 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
45618 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
45619 llvm::all_of(EltBits, [](const APInt &M) {
45620 return M.isNullValue() || M.isAllOnesValue();
45621 })) {
45622 unsigned NumElts = SrcVecVT.getVectorNumElements();
45623 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
45624 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
45625
45626 // Create a root shuffle mask from the byte mask and the extracted index.
45627 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
45628 for (unsigned i = 0; i != Scale; ++i) {
45629 if (UndefElts[i])
45630 continue;
45631 int VecIdx = Scale * Idx + i;
45632 ShuffleMask[VecIdx] =
45633 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
45634 }
45635
45636 if (SDValue Shuffle = combineX86ShufflesRecursively(
45637 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
45638 X86::MaxShuffleCombineDepth,
45639 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
45640 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
45641 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
45642 N->getOperand(0).getOperand(1));
45643 }
45644 }
45645
45646 return SDValue();
45647}
45648
45649// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
45650static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
45651 const X86Subtarget &Subtarget) {
45652 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
45653
45654 MVT VT = N->getSimpleValueType(0);
45655 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
45656 return SDValue();
45657
45658 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
45659 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
45660 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
45661 return SDValue();
45662
45663 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
45664 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
45665 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
45666 Subtarget.hasVLX();
45667 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
45668 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
45669 return SDValue();
45670
45671 // Attempt to extract constant byte masks.
45672 APInt UndefElts0, UndefElts1;
45673 SmallVector<APInt, 32> EltBits0, EltBits1;
45674 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
45675 false, false))
45676 return SDValue();
45677 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
45678 false, false))
45679 return SDValue();
45680
45681 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
45682 // TODO - add UNDEF elts support.
45683 if (UndefElts0[i] || UndefElts1[i])
45684 return SDValue();
45685 if (EltBits0[i] != ~EltBits1[i])
45686 return SDValue();
45687 }
45688
45689 SDLoc DL(N);
45690
45691 if (UseVPTERNLOG) {
45692 // Emit a VPTERNLOG node directly.
45693 SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
45694 SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
45695 SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
45696 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
45697 return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
45698 }
45699
45700 SDValue X = N->getOperand(0);
45701 SDValue Y =
45702 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
45703 DAG.getBitcast(VT, N1.getOperand(0)));
45704 return DAG.getNode(ISD::OR, DL, VT, X, Y);
45705}
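// Bit-level sketch of the select being canonicalized above (helper name is
// ours): OR(AND(X, M), AND(Y, ~M)) takes each bit from X where M is 1 and
// from Y where M is 0. That per-bit "M ? X : Y" is the ternary function with
// truth table 0xCA, which is why that immediate is used for VPTERNLOG.
static inline unsigned BitSelectModel(unsigned M, unsigned X, unsigned Y) {
  return (X & M) | (Y & ~M); // per-bit select; VPTERNLOG imm 0xCA
}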
45706
45707// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
45708static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
45709 if (N->getOpcode() != ISD::OR)
45710 return false;
45711
45712 SDValue N0 = N->getOperand(0);
45713 SDValue N1 = N->getOperand(1);
45714
45715 // Canonicalize AND to LHS.
45716 if (N1.getOpcode() == ISD::AND)
45717 std::swap(N0, N1);
45718
45719 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
45720 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
45721 return false;
45722
45723 Mask = N1.getOperand(0);
45724 X = N1.getOperand(1);
45725
45726 // Check to see if the mask appeared in both the AND and ANDNP.
45727 if (N0.getOperand(0) == Mask)
45728 Y = N0.getOperand(1);
45729 else if (N0.getOperand(1) == Mask)
45730 Y = N0.getOperand(0);
45731 else
45732 return false;
45733
45734 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
45735 // ANDNP combine allows other combines to happen that prevent matching.
45736 return true;
45737}
45738
45739// Try to fold:
45740// (or (and (m, y), (pandn m, x)))
45741// into:
45742// (vselect m, x, y)
45743// As a special case, try to fold:
45744// (or (and (m, (sub 0, x)), (pandn m, x)))
45745// into:
45746// (sub (xor X, M), M)
45747static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
45748 const X86Subtarget &Subtarget) {
45749 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
45750
45751 EVT VT = N->getValueType(0);
45752 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
45753 (VT.is256BitVector() && Subtarget.hasInt256())))
45754 return SDValue();
45755
45756 SDValue X, Y, Mask;
45757 if (!matchLogicBlend(N, X, Y, Mask))
45758 return SDValue();
45759
45760 // Validate that X, Y, and Mask are bitcasts, and see through them.
45761 Mask = peekThroughBitcasts(Mask);
45762 X = peekThroughBitcasts(X);
45763 Y = peekThroughBitcasts(Y);
45764
45765 EVT MaskVT = Mask.getValueType();
45766 unsigned EltBits = MaskVT.getScalarSizeInBits();
45767
45768 // TODO: Attempt to handle floating point cases as well?
45769 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
45770 return SDValue();
45771
45772 SDLoc DL(N);
45773
45774 // Attempt to combine to conditional negate: (sub (xor X, M), M)
45775 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
45776 DAG, Subtarget))
45777 return Res;
45778
45779 // PBLENDVB is only available on SSE 4.1.
45780 if (!Subtarget.hasSSE41())
45781 return SDValue();
45782
45783 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
45784 if (Subtarget.hasVLX())
45785 return SDValue();
45786
45787 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
45788
45789 X = DAG.getBitcast(BlendVT, X);
45790 Y = DAG.getBitcast(BlendVT, Y);
45791 Mask = DAG.getBitcast(BlendVT, Mask);
45792 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
45793 return DAG.getBitcast(VT, Mask);
45794}
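// Scalar sketch of the conditional-negate special case above (helper name is
// ours): when M is either 0 or all-ones, selecting between -X and X with M
// equals (X ^ M) - M, which needs no blend instruction at all.
static inline int CondNegateModel(int X, int M) {
  // Precondition: M == 0 or M == -1 (all-ones).
  return (X ^ M) - M; // M == -1 yields -X, M == 0 yields X
}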
45795
45796// Helper function for combineOrCmpEqZeroToCtlzSrl
45797// Transforms:
45798// seteq(cmp x, 0)
45799// into:
45800// srl(ctlz x), log2(bitsize(x))
45801// Input pattern is checked by caller.
45802static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
45803 SelectionDAG &DAG) {
45804 SDValue Cmp = Op.getOperand(1);
45805 EVT VT = Cmp.getOperand(0).getValueType();
45806 unsigned Log2b = Log2_32(VT.getSizeInBits());
45807 SDLoc dl(Op);
45808 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
45809 // The result of the shift is true or false, and on X86, the 32-bit
45810 // encoding of shr and lzcnt is more desirable.
45811 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
45812 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
45813 DAG.getConstant(Log2b, dl, MVT::i8));
45814 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
45815}
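// Scalar model of the rewrite above for the 32-bit case (helper name is
// ours): a leading-zero count equals the bit width only for a zero input, so
// shifting the count right by log2(32) = 5 yields the boolean "x == 0". This
// assumes lzcnt semantics, where the count is defined for a zero input.
static inline unsigned IsZeroViaCtlzModel(unsigned X) {
  unsigned Lz = X == 0 ? 32u : (unsigned)__builtin_clz(X); // lzcnt semantics
  return Lz >> 5; // 1 iff X == 0
}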
45816
45817// Try to transform:
45818// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
45819// into:
45820// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
45821// Will also attempt to match more generic cases, eg:
45822// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
45823// Only applies if the target supports the FastLZCNT feature.
45824static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
45825 TargetLowering::DAGCombinerInfo &DCI,
45826 const X86Subtarget &Subtarget) {
45827 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
45828 return SDValue();
45829
45830 auto isORCandidate = [](SDValue N) {
45831 return (N->getOpcode() == ISD::OR && N->hasOneUse());
45832 };
45833
45834 // Check the zero extend is extending to 32-bit or more. The code generated by
45835 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
45836 // instructions to clear the upper bits.
45837 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
45838 !isORCandidate(N->getOperand(0)))
45839 return SDValue();
45840
45841 // Check the node matches: setcc(eq, cmp 0)
45842 auto isSetCCCandidate = [](SDValue N) {
45843 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
45844 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
45845 N->getOperand(1).getOpcode() == X86ISD::CMP &&
45846 isNullConstant(N->getOperand(1).getOperand(1)) &&
45847 N->getOperand(1).getValueType().bitsGE(MVT::i32);
45848 };
45849
45850 SDNode *OR = N->getOperand(0).getNode();
45851 SDValue LHS = OR->getOperand(0);
45852 SDValue RHS = OR->getOperand(1);
45853
45854 // Save nodes matching or(or, setcc(eq, cmp 0)).
45855 SmallVector<SDNode *, 2> ORNodes;
45856 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
45857 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
45858 ORNodes.push_back(OR);
45859 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
45860 LHS = OR->getOperand(0);
45861 RHS = OR->getOperand(1);
45862 }
45863
45864 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
45865 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
45866 !isORCandidate(SDValue(OR, 0)))
45867 return SDValue();
45868
45869 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
45870 // to
45871 // or(srl(ctlz),srl(ctlz)).
45872 // The dag combiner can then fold it into:
45873 // srl(or(ctlz, ctlz)).
45874 EVT VT = OR->getValueType(0);
45875 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
45876 SDValue Ret, NewRHS;
45877 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
45878 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
45879
45880 if (!Ret)
45881 return SDValue();
45882
45883 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
45884 while (ORNodes.size() > 0) {
45885 OR = ORNodes.pop_back_val();
45886 LHS = OR->getOperand(0);
45887 RHS = OR->getOperand(1);
45888 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
45889 if (RHS->getOpcode() == ISD::OR)
45890 std::swap(LHS, RHS);
45891 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
45892 if (!NewRHS)
45893 return SDValue();
45894 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
45895 }
45896
45897 if (Ret)
45898 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
45899
45900 return Ret;
45901}
45902
45903static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
45904 TargetLowering::DAGCombinerInfo &DCI,
45905 const X86Subtarget &Subtarget) {
45906 SDValue N0 = N->getOperand(0);
45907 SDValue N1 = N->getOperand(1);
45908 EVT VT = N->getValueType(0);
45909
45910 // If this is SSE1 only convert to FOR to avoid scalarization.
45911 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45912 return DAG.getBitcast(MVT::v4i32,
45913 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
45914 DAG.getBitcast(MVT::v4f32, N0),
45915 DAG.getBitcast(MVT::v4f32, N1)));
45916 }
45917
45918 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
45919 // TODO: Support multiple SrcOps.
45920 if (VT == MVT::i1) {
45921 SmallVector<SDValue, 2> SrcOps;
45922 SmallVector<APInt, 2> SrcPartials;
45923 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
45924 SrcOps.size() == 1) {
45925 SDLoc dl(N);
45926 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45927 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45928 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45929 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45930 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45931 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45932 if (Mask) {
45933 assert(SrcPartials[0].getBitWidth() == NumElts &&
45934 "Unexpected partial reduction mask");
45935 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
45936 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45937 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45938 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
45939 }
45940 }
45941 }
45942
45943 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45944 return R;
45945
45946 if (DCI.isBeforeLegalizeOps())
45947 return SDValue();
45948
45949 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45950 return R;
45951
45952 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45953 return FPLogic;
45954
45955 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
45956 return R;
45957
45958 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
45959 return R;
45960
45961 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
45962 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
45963 // iff the upper elements of the non-shifted arg are zero.
45964 // KUNPCK requires 16+ bool vector elements.
45965 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
45966 unsigned NumElts = VT.getVectorNumElements();
45967 unsigned HalfElts = NumElts / 2;
45968 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
45969 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
45970 N1.getConstantOperandAPInt(1) == HalfElts &&
45971 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
45972 SDLoc dl(N);
45973 return DAG.getNode(
45974 ISD::CONCAT_VECTORS, dl, VT,
45975 extractSubVector(N0, 0, DAG, dl, HalfElts),
45976 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
45977 }
45978 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
45979 N0.getConstantOperandAPInt(1) == HalfElts &&
45980 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
45981 SDLoc dl(N);
45982 return DAG.getNode(
45983 ISD::CONCAT_VECTORS, dl, VT,
45984 extractSubVector(N1, 0, DAG, dl, HalfElts),
45985 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
45986 }
45987 }
45988
45989 // Attempt to recursively combine an OR of shuffles.
45990 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45991 SDValue Op(N, 0);
45992 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45993 return Res;
45994 }
45995
45996 return SDValue();
45997}
45998
45999/// Try to turn tests against the signbit in the form of:
46000/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
46001/// into:
46002/// SETGT(X, -1)
46003static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
46004 // This is only worth doing if the output type is i8 or i1.
46005 EVT ResultType = N->getValueType(0);
46006 if (ResultType != MVT::i8 && ResultType != MVT::i1)
46007 return SDValue();
46008
46009 SDValue N0 = N->getOperand(0);
46010 SDValue N1 = N->getOperand(1);
46011
46012 // We should be performing an xor against a truncated shift.
46013 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
46014 return SDValue();
46015
46016 // Make sure we are performing an xor against one.
46017 if (!isOneConstant(N1))
46018 return SDValue();
46019
46020 // SetCC on x86 zero extends so only act on this if it's a logical shift.
46021 SDValue Shift = N0.getOperand(0);
46022 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
46023 return SDValue();
46024
46025 // Make sure we are truncating from one of i16, i32 or i64.
46026 EVT ShiftTy = Shift.getValueType();
46027 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
46028 return SDValue();
46029
46030 // Make sure the shift amount extracts the sign bit.
46031 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
46032 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
46033 return SDValue();
46034
46035 // Create a greater-than comparison against -1.
46036 // N.B. Using SETGE against 0 works but we want a canonical-looking
46037 // comparison; using SETGT matches up with what TranslateX86CC does.
46038 SDLoc DL(N);
46039 SDValue ShiftOp = Shift.getOperand(0);
46040 EVT ShiftOpTy = ShiftOp.getValueType();
46041 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46042 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
46043 *DAG.getContext(), ResultType);
46044 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
46045 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
46046 if (SetCCResultType != ResultType)
46047 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
46048 return Cond;
46049}
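// Scalar check of the fold above for the 32-bit case (helper name is ours):
// the logical shift right by 31 isolates the sign bit and the XOR with 1
// inverts it, so the whole expression is just "X is non-negative".
static inline bool SignBitFoldAgrees(int X) {
  unsigned ViaShift = ((unsigned)X >> 31) ^ 1u; // xor(trunc(srl(X, 31)), 1)
  unsigned ViaSetGT = (X > -1) ? 1u : 0u;       // setgt(X, -1)
  return ViaShift == ViaSetGT;                  // holds for every X
}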
46050
46051/// Turn vector tests of the signbit in the form of:
46052/// xor (sra X, elt_size(X)-1), -1
46053/// into:
46054/// pcmpgt X, -1
46055///
46056/// This should be called before type legalization because the pattern may not
46057/// persist after that.
46058static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
46059 const X86Subtarget &Subtarget) {
46060 EVT VT = N->getValueType(0);
46061 if (!VT.isSimple())
46062 return SDValue();
46063
46064 switch (VT.getSimpleVT().SimpleTy) {
46065 default: return SDValue();
46066 case MVT::v16i8:
46067 case MVT::v8i16:
46068 case MVT::v4i32:
46069 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
46070 case MVT::v32i8:
46071 case MVT::v16i16:
46072 case MVT::v8i32:
46073 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
46074 }
46075
46076 // There must be a shift right algebraic before the xor, and the xor must be a
46077 // 'not' operation.
46078 SDValue Shift = N->getOperand(0);
46079 SDValue Ones = N->getOperand(1);
46080 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
46081 !ISD::isBuildVectorAllOnes(Ones.getNode()))
46082 return SDValue();
46083
46084 // The shift should be smearing the sign bit across each vector element.
46085 auto *ShiftAmt =
46086 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
46087 if (!ShiftAmt ||
46088 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
46089 return SDValue();
46090
46091 // Create a greater-than comparison against -1. We don't use the more obvious
46092 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
46093 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
46094}
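// Per-element sketch of the vector fold above (helper name is ours), assuming
// an arithmetic right shift for signed values as the SRA node specifies: the
// shift smears the sign bit across the element and the XOR with all-ones
// inverts it, producing all-ones exactly for the non-negative elements, which
// is the same mask pcmpgt X, -1 produces.
static inline int NotSraModel(int X) {
  return (X >> 31) ^ -1; // -1 (all ones) iff X > -1, otherwise 0
}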
46095
46096/// Detect patterns of truncation with unsigned saturation:
46097///
46098/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
46099/// Return the source value x to be truncated or SDValue() if the pattern was
46100/// not matched.
46101///
46102/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
46103/// where C1 >= 0 and C2 is unsigned max of destination type.
46104///
46105/// (truncate (smax (smin (x, C2), C1)) to dest_type)
46106/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
46107///
46108/// These two patterns are equivalent to:
46109/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
46110/// So return the smax(x, C1) value to be truncated or SDValue() if the
46111/// pattern was not matched.
46112static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
46113 const SDLoc &DL) {
46114 EVT InVT = In.getValueType();
46115
46116 // Saturation with truncation. We truncate from InVT to VT.
46117 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
46118 "Unexpected types for truncate operation");
46119
46120 // Match min/max and return limit value as a parameter.
46121 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
46122 if (V.getOpcode() == Opcode &&
46123 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
46124 return V.getOperand(0);
46125 return SDValue();
46126 };
46127
46128 APInt C1, C2;
46129 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
46130 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
46131 // the element size of the destination type.
46132 if (C2.isMask(VT.getScalarSizeInBits()))
46133 return UMin;
46134
46135 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
46136 if (MatchMinMax(SMin, ISD::SMAX, C1))
46137 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
46138 return SMin;
46139
46140 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
46141 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
46142 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
46143 C2.uge(C1)) {
46144 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
46145 }
46146
46147 return SDValue();
46148}
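// Scalar sketch of pattern 1 above for an i32 -> i8 truncate (helper name is
// ours): clamping with umin against the destination's unsigned max before
// truncating is what makes the truncate saturating.
static inline unsigned char TruncUSatModel(unsigned X) {
  unsigned Clamped = X < 255u ? X : 255u; // umin(x, unsigned_max_of_dest_type)
  return (unsigned char)Clamped;          // truncate after the clamp
}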
46149
46150/// Detect patterns of truncation with signed saturation:
46151/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
46152/// signed_max_of_dest_type)) to dest_type)
46153/// or:
46154/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
46155/// signed_min_of_dest_type)) to dest_type).
46156/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
46157/// Return the source value to be truncated or SDValue() if the pattern was not
46158/// matched.
46159static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
46160 unsigned NumDstBits = VT.getScalarSizeInBits();
46161 unsigned NumSrcBits = In.getScalarValueSizeInBits();
46162 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
46163
46164 auto MatchMinMax = [](SDValue V, unsigned Opcode,
46165 const APInt &Limit) -> SDValue {
46166 APInt C;
46167 if (V.getOpcode() == Opcode &&
46168 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
46169 return V.getOperand(0);
46170 return SDValue();
46171 };
46172
46173 APInt SignedMax, SignedMin;
46174 if (MatchPackUS) {
46175 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
46176 SignedMin = APInt(NumSrcBits, 0);
46177 } else {
46178 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
46179 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
46180 }
46181
46182 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
46183 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
46184 return SMax;
46185
46186 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
46187 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
46188 return SMin;
46189
46190 return SDValue();
46191}
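// Scalar sketch of the signed pattern above for an i32 -> i8 truncate (helper
// name is ours): an smin/smax clamp to the destination's signed range
// followed by the truncate is a saturating truncate.
static inline signed char TruncSSatModel(int X) {
  int Clamped = X > 127 ? 127 : X;           // smin(x, signed_max_of_dest_type)
  Clamped = Clamped < -128 ? -128 : Clamped; // smax(..., signed_min_of_dest_type)
  return (signed char)Clamped;
}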
46192
46193static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
46194 SelectionDAG &DAG,
46195 const X86Subtarget &Subtarget) {
46196 if (!Subtarget.hasSSE2() || !VT.isVector())
46197 return SDValue();
46198
46199 EVT SVT = VT.getVectorElementType();
46200 EVT InVT = In.getValueType();
46201 EVT InSVT = InVT.getVectorElementType();
46202
46203 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
46204 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
46205 // and concatenate at the same time. Then we can use a final vpmovuswb to
46206 // clip to 0-255.
46207 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
46208 InVT == MVT::v16i32 && VT == MVT::v16i8) {
46209 if (auto USatVal = detectSSatPattern(In, VT, true)) {
46210 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
46211 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
46212 DL, DAG, Subtarget);
46213 assert(Mid && "Failed to pack!");
46214 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
46215 }
46216 }
46217
46218 // vXi32 truncate instructions are available with AVX512F.
46219 // vXi16 truncate instructions are only available with AVX512BW.
46220 // For 256-bit or smaller vectors, we require VLX.
46221 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
46222 // If the result type is 256 bits or larger and we have disabled 512-bit
46223 // registers, we should go ahead and use the pack instructions if possible.
46224 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
46225 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
46226 (InVT.getSizeInBits() > 128) &&
46227 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
46228 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
46229
46230 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
46231 VT.getSizeInBits() >= 64 &&
46232 (SVT == MVT::i8 || SVT == MVT::i16) &&
46233 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
46234 if (auto USatVal = detectSSatPattern(In, VT, true)) {
46235 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
46236 // Only do this when the result is at least 64 bits or we'd be leaving
46237 // dangling PACKSSDW nodes.
46238 if (SVT == MVT::i8 && InSVT == MVT::i32) {
46239 EVT MidVT = VT.changeVectorElementType(MVT::i16);
46240 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
46241 DAG, Subtarget);
46242 assert(Mid && "Failed to pack!");
46243 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
46244 Subtarget);
46245 assert(V && "Failed to pack!");
46246 return V;
46247 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
46248 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
46249 Subtarget);
46250 }
46251 if (auto SSatVal = detectSSatPattern(In, VT))
46252 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
46253 Subtarget);
46254 }
46255
46256 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46257 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
46258 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
46259 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
46260 unsigned TruncOpc = 0;
46261 SDValue SatVal;
46262 if (auto SSatVal = detectSSatPattern(In, VT)) {
46263 SatVal = SSatVal;
46264 TruncOpc = X86ISD::VTRUNCS;
46265 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
46266 SatVal = USatVal;
46267 TruncOpc = X86ISD::VTRUNCUS;
46268 }
46269 if (SatVal) {
46270 unsigned ResElts = VT.getVectorNumElements();
46271 // If the input type is less than 512 bits and we don't have VLX, we need
46272 // to widen to 512 bits.
46273 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
46274 unsigned NumConcats = 512 / InVT.getSizeInBits();
46275 ResElts *= NumConcats;
46276 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
46277 ConcatOps[0] = SatVal;
46278 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
46279 NumConcats * InVT.getVectorNumElements());
46280 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
46281 }
46282 // Widen the result if it's narrower than 128 bits.
46283 if (ResElts * SVT.getSizeInBits() < 128)
46284 ResElts = 128 / SVT.getSizeInBits();
46285 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
46286 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
46287 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
46288 DAG.getIntPtrConstant(0, DL));
46289 }
46290 }
46291
46292 return SDValue();
46293}
46294
46295/// This function detects the AVG pattern between vectors of unsigned i8/i16,
46296 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
46297/// X86ISD::AVG instruction.
46298static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
46299 const X86Subtarget &Subtarget,
46300 const SDLoc &DL) {
46301 if (!VT.isVector())
46302 return SDValue();
46303 EVT InVT = In.getValueType();
46304 unsigned NumElems = VT.getVectorNumElements();
46305
46306 EVT ScalarVT = VT.getVectorElementType();
46307 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
46308 return SDValue();
46309
46310 // InScalarVT is the intermediate type in the AVG pattern and it should be greater
46311 // than the original input type (i8/i16).
46312 EVT InScalarVT = InVT.getVectorElementType();
46313 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
46314 return SDValue();
46315
46316 if (!Subtarget.hasSSE2())
46317 return SDValue();
46318
46319 // Detect the following pattern:
46320 //
46321 // %1 = zext <N x i8> %a to <N x i32>
46322 // %2 = zext <N x i8> %b to <N x i32>
46323 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
46324 // %4 = add nuw nsw <N x i32> %3, %2
46325 // %5 = lshr <N x i32> %4, <i32 1 x N>
46326 // %6 = trunc <N x i32> %5 to <N x i8>
46327 //
46328 // In AVX512, the last instruction can also be a trunc store.
46329 if (In.getOpcode() != ISD::SRL)
46330 return SDValue();
46331
46332 // A lambda checking the given SDValue is a constant vector and each element
46333 // is in the range [Min, Max].
46334 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
46335 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
46336 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
46337 });
46338 };
46339
46340 // Check if each element of the vector is right-shifted by one.
46341 SDValue LHS = In.getOperand(0);
46342 SDValue RHS = In.getOperand(1);
46343 if (!IsConstVectorInRange(RHS, 1, 1))
46344 return SDValue();
46345 if (LHS.getOpcode() != ISD::ADD)
46346 return SDValue();
46347
46348 // Detect a pattern of a + b + 1 where the order doesn't matter.
46349 SDValue Operands[3];
46350 Operands[0] = LHS.getOperand(0);
46351 Operands[1] = LHS.getOperand(1);
46352
46353 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46354 ArrayRef<SDValue> Ops) {
46355 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
46356 };
46357
46358 auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
46359 // Pad to a power-of-2 vector, split+apply and extract the original vector.
46360 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
46361 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
46362 if (NumElemsPow2 != NumElems) {
46363 SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
46364 SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
46365 for (unsigned i = 0; i != NumElems; ++i) {
46366 SDValue Idx = DAG.getIntPtrConstant(i, DL);
46367 Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
46368 Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
46369 }
46370 Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
46371 Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
46372 }
46373 SDValue Res =
46374 SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
46375 if (NumElemsPow2 == NumElems)
46376 return Res;
46377 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
46378 DAG.getIntPtrConstant(0, DL));
46379 };
46380
46381 // Take care of the case when one of the operands is a constant vector whose
46382 // element is in the range [1, 256] (or [1, 65536] for i16 elements).
46383 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
46384 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
46385 Operands[0].getOperand(0).getValueType() == VT) {
46386 // The pattern is detected. Subtract one from the constant vector, then
46387 // demote it and emit X86ISD::AVG instruction.
46388 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
46389 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
46390 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
46391 return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
46392 }
46393
46394 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
46395 // Match the or case only if its 'add-like' - can be replaced by an add.
46396 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
46397 if (ISD::ADD == V.getOpcode()) {
46398 Op0 = V.getOperand(0);
46399 Op1 = V.getOperand(1);
46400 return true;
46401 }
46402 if (ISD::ZERO_EXTEND != V.getOpcode())
46403 return false;
46404 V = V.getOperand(0);
46405 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
46406 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
46407 return false;
46408 Op0 = V.getOperand(0);
46409 Op1 = V.getOperand(1);
46410 return true;
46411 };
46412
46413 SDValue Op0, Op1;
46414 if (FindAddLike(Operands[0], Op0, Op1))
46415 std::swap(Operands[0], Operands[1]);
46416 else if (!FindAddLike(Operands[1], Op0, Op1))
46417 return SDValue();
46418 Operands[2] = Op0;
46419 Operands[1] = Op1;
46420
46421 // Now we have three operands of two additions. Check that one of them is a
46422 // constant vector with ones, and the other two can be promoted from i8/i16.
46423 for (int i = 0; i < 3; ++i) {
46424 if (!IsConstVectorInRange(Operands[i], 1, 1))
46425 continue;
46426 std::swap(Operands[i], Operands[2]);
46427
46428 // Check if Operands[0] and Operands[1] are results of type promotion.
46429 for (int j = 0; j < 2; ++j)
46430 if (Operands[j].getValueType() != VT) {
46431 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
46432 Operands[j].getOperand(0).getValueType() != VT)
46433 return SDValue();
46434 Operands[j] = Operands[j].getOperand(0);
46435 }
46436
46437 // The pattern is detected, emit X86ISD::AVG instruction(s).
46438 return AVGSplitter(Operands[0], Operands[1]);
46439 }
46440
46441 return SDValue();
46442}
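// Scalar model of the averaging pattern above (helper name is ours): the IR
// widens to i32 so the sum A + B + 1 cannot wrap, shifts right by one, and
// truncates back. This rounding-up average of two u8 values is exactly what
// PAVGB/PAVGW (X86ISD::AVG) compute per element.
static inline unsigned char AvgRoundUpModel(unsigned char A, unsigned char B) {
  unsigned Wide = (unsigned)A + (unsigned)B + 1u; // add in a wider type
  return (unsigned char)(Wide >> 1);              // lshr 1, then truncate
}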
46443
46444static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
46445 TargetLowering::DAGCombinerInfo &DCI,
46446 const X86Subtarget &Subtarget) {
46447 LoadSDNode *Ld = cast<LoadSDNode>(N);
46448 EVT RegVT = Ld->getValueType(0);
46449 EVT MemVT = Ld->getMemoryVT();
46450 SDLoc dl(Ld);
46451 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46452
46453 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
46454 // into two 16-byte operations. Also split non-temporal aligned loads on
46455 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
46456 ISD::LoadExtType Ext = Ld->getExtensionType();
46457 bool Fast;
46458 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
46459 Ext == ISD::NON_EXTLOAD &&
46460 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
46461 Ld->getAlignment() >= 16) ||
46462 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
46463 *Ld->getMemOperand(), &Fast) &&
46464 !Fast))) {
46465 unsigned NumElems = RegVT.getVectorNumElements();
46466 if (NumElems < 2)
46467 return SDValue();
46468
46469 unsigned HalfOffset = 16;
46470 SDValue Ptr1 = Ld->getBasePtr();
46471 SDValue Ptr2 =
46472 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
46473 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
46474 NumElems / 2);
46475 SDValue Load1 =
46476 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
46477 Ld->getOriginalAlign(),
46478 Ld->getMemOperand()->getFlags());
46479 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
46480 Ld->getPointerInfo().getWithOffset(HalfOffset),
46481 Ld->getOriginalAlign(),
46482 Ld->getMemOperand()->getFlags());
46483 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
46484 Load1.getValue(1), Load2.getValue(1));
46485
46486 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
46487 return DCI.CombineTo(N, NewVec, TF, true);
46488 }
46489
46490 // Bool vector load - attempt to cast to an integer, as we have good
46491 // (vXiY *ext(vXi1 bitcast(iX))) handling.
46492 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
46493 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
46494 unsigned NumElts = RegVT.getVectorNumElements();
46495 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
46496 if (TLI.isTypeLegal(IntVT)) {
46497 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
46498 Ld->getPointerInfo(),
46499 Ld->getOriginalAlign(),
46500 Ld->getMemOperand()->getFlags());
46501 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
46502 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
46503 }
46504 }
46505
46506 // If we also broadcast this as a subvector to a wider type, then just extract
46507 // the lowest subvector.
46508 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
46509 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
46510 SDValue Ptr = Ld->getBasePtr();
46511 SDValue Chain = Ld->getChain();
46512 for (SDNode *User : Ptr->uses()) {
46513 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
46514 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
46515 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
46516 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
46517 MemVT.getSizeInBits() &&
46518 !User->hasAnyUseOfValue(1) &&
46519 User->getValueSizeInBits(0).getFixedSize() >
46520 RegVT.getFixedSizeInBits()) {
46521 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
46522 RegVT.getSizeInBits());
46523 Extract = DAG.getBitcast(RegVT, Extract);
46524 return DCI.CombineTo(N, Extract, SDValue(User, 1));
46525 }
46526 }
46527 }
46528
46529 // Cast ptr32 and ptr64 pointers to the default address space before a load.
46530 unsigned AddrSpace = Ld->getAddressSpace();
46531 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46532 AddrSpace == X86AS::PTR32_UPTR) {
46533 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46534 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
46535 SDValue Cast =
46536 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
46537 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
46538 Ld->getOriginalAlign(),
46539 Ld->getMemOperand()->getFlags());
46540 }
46541 }
46542
46543 return SDValue();
46544}
46545
46546/// If V is a build vector of boolean constants and exactly one of those
46547/// constants is true, return the operand index of that true element.
46548/// Otherwise, return -1.
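/// For example (illustrative): <i1 0, i1 undef, i1 1, i1 0> returns 2, while
/// <i1 1, i1 0, i1 1, i1 0> returns -1 because more than one element is true.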
46549static int getOneTrueElt(SDValue V) {
46550 // This needs to be a build vector of booleans.
46551 // TODO: Checking for the i1 type matches the IR definition for the mask,
46552 // but the mask check could be loosened to i8 or other types. That might
46553 // also require checking more than 'allOnesValue'; e.g., the x86 HW
46554 // instructions only require that the MSB is set for each mask element.
46555 // The ISD::MSTORE comments/definition do not specify how the mask operand
46556 // is formatted.
46557 auto *BV = dyn_cast<BuildVectorSDNode>(V);
46558 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
46559 return -1;
46560
46561 int TrueIndex = -1;
46562 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
46563 for (unsigned i = 0; i < NumElts; ++i) {
46564 const SDValue &Op = BV->getOperand(i);
46565 if (Op.isUndef())
46566 continue;
46567 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
46568 if (!ConstNode)
46569 return -1;
46570 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
46571 // If we already found a one, this is too many.
46572 if (TrueIndex >= 0)
46573 return -1;
46574 TrueIndex = i;
46575 }
46576 }
46577 return TrueIndex;
46578}
46579
46580/// Given a masked memory load/store operation, return true if it has exactly
46581/// one mask bit set. If so, also return the memory address of the scalar
46582/// element to load/store, the vector index to insert/extract that scalar
46583/// element, and the alignment for the scalar memory access.
46584static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
46585 SelectionDAG &DAG, SDValue &Addr,
46586 SDValue &Index, Align &Alignment,
46587 unsigned &Offset) {
46588 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
46589 if (TrueMaskElt < 0)
46590 return false;
46591
46592 // Get the address of the one scalar element that is specified by the mask
46593 // using the appropriate offset from the base pointer.
46594 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
46595 Offset = 0;
46596 Addr = MaskedOp->getBasePtr();
46597 if (TrueMaskElt != 0) {
46598 Offset = TrueMaskElt * EltVT.getStoreSize();
46599 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
46600 SDLoc(MaskedOp));
46601 }
46602
46603 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
46604 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
46605 EltVT.getStoreSize());
46606 return true;
46607}
46608
46609/// If exactly one element of the mask is set for a non-extending masked load,
46610/// it is a scalar load and vector insert.
46611/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46612/// mask have already been optimized in IR, so we don't bother with those here.
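/// For example (illustrative): a masked load of v4f32 with mask <0,0,1,0>
/// becomes a scalar f32 load from BasePtr+8 followed by an INSERT_VECTOR_ELT
/// of the loaded value into the pass-through vector at index 2.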
46613static SDValue
46614reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
46615 TargetLowering::DAGCombinerInfo &DCI,
46616 const X86Subtarget &Subtarget) {
46617 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
46618 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46619 // However, some target hooks may need to be added to know when the transform
46620 // is profitable. Endianness would also have to be considered.
46621
46622 SDValue Addr, VecIndex;
46623 Align Alignment;
46624 unsigned Offset;
46625 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
46626 return SDValue();
46627
46628 // Load the one scalar element that is specified by the mask using the
46629 // appropriate offset from the base pointer.
46630 SDLoc DL(ML);
46631 EVT VT = ML->getValueType(0);
46632 EVT EltVT = VT.getVectorElementType();
46633
46634 EVT CastVT = VT;
46635 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46636 EltVT = MVT::f64;
46637 CastVT = VT.changeVectorElementType(EltVT);
46638 }
46639
46640 SDValue Load =
46641 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
46642 ML->getPointerInfo().getWithOffset(Offset),
46643 Alignment, ML->getMemOperand()->getFlags());
46644
46645 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
46646
46647 // Insert the loaded element into the appropriate place in the vector.
46648 SDValue Insert =
46649 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
46650 Insert = DAG.getBitcast(VT, Insert);
46651 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
46652}
46653
46654static SDValue
46655combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
46656 TargetLowering::DAGCombinerInfo &DCI) {
46657 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
46658 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
46659 return SDValue();
46660
46661 SDLoc DL(ML);
46662 EVT VT = ML->getValueType(0);
46663
46664 // If we are loading the first and last elements of a vector, it is safe and
46665 // always faster to load the whole vector. Replace the masked load with a
46666 // vector load and select.
46667 unsigned NumElts = VT.getVectorNumElements();
46668 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
46669 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
46670 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
46671 if (LoadFirstElt && LoadLastElt) {
46672 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
46673 ML->getMemOperand());
46674 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
46675 ML->getPassThru());
46676 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
46677 }
46678
46679 // Convert a masked load with a constant mask into a masked load and a select.
46680 // This allows the select operation to use a faster kind of select instruction
46681 // (for example, vblendvps -> vblendps).
46682
46683 // Don't try this if the pass-through operand is already undefined. That would
46684 // cause an infinite loop because that's what we're about to create.
46685 if (ML->getPassThru().isUndef())
46686 return SDValue();
46687
46688 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
46689 return SDValue();
46690
46691 // The new masked load has an undef pass-through operand. The select uses the
46692 // original pass-through operand.
46693 SDValue NewML = DAG.getMaskedLoad(
46694 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
46695 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
46696 ML->getAddressingMode(), ML->getExtensionType());
46697 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
46698 ML->getPassThru());
46699
46700 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
46701}
46702
46703static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
46704 TargetLowering::DAGCombinerInfo &DCI,
46705 const X86Subtarget &Subtarget) {
46706 auto *Mld = cast<MaskedLoadSDNode>(N);
46707
46708 // TODO: Expanding load with constant mask may be optimized as well.
46709 if (Mld->isExpandingLoad())
46710 return SDValue();
46711
46712 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
46713 if (SDValue ScalarLoad =
46714 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
46715 return ScalarLoad;
46716
46717 // TODO: Do some AVX512 subsets benefit from this transform?
46718 if (!Subtarget.hasAVX512())
46719 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
46720 return Blend;
46721 }
46722
46723 // If the mask value has been legalized to a non-boolean vector, try to
46724 // simplify ops leading up to it. We only demand the MSB of each lane.
46725 SDValue Mask = Mld->getMask();
46726 if (Mask.getScalarValueSizeInBits() != 1) {
46727 EVT VT = Mld->getValueType(0);
46728 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46729 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46730 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46731 if (N->getOpcode() != ISD::DELETED_NODE)
46732 DCI.AddToWorklist(N);
46733 return SDValue(N, 0);
46734 }
46735 if (SDValue NewMask =
46736 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46737 return DAG.getMaskedLoad(
46738 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
46739 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
46740 Mld->getAddressingMode(), Mld->getExtensionType());
46741 }
46742
46743 return SDValue();
46744}
46745
46746/// If exactly one element of the mask is set for a non-truncating masked store,
46747/// it is a vector extract and scalar store.
46748/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46749/// mask have already been optimized in IR, so we don't bother with those here.
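/// For example (illustrative): a masked store of v4i32 with mask <0,1,0,0>
/// becomes an EXTRACT_VECTOR_ELT of element 1 followed by a scalar i32 store
/// to BasePtr+4.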
46750static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
46751 SelectionDAG &DAG,
46752 const X86Subtarget &Subtarget) {
46753 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46754 // However, some target hooks may need to be added to know when the transform
46755 // is profitable. Endianness would also have to be considered.
46756
46757 SDValue Addr, VecIndex;
46758 Align Alignment;
46759 unsigned Offset;
46760 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
46761 return SDValue();
46762
46763 // Extract the one scalar element that is actually being stored.
46764 SDLoc DL(MS);
46765 SDValue Value = MS->getValue();
46766 EVT VT = Value.getValueType();
46767 EVT EltVT = VT.getVectorElementType();
46768 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46769 EltVT = MVT::f64;
46770 EVT CastVT = VT.changeVectorElementType(EltVT);
46771 Value = DAG.getBitcast(CastVT, Value);
46772 }
46773 SDValue Extract =
46774 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
46775
46776 // Store that element at the appropriate offset from the base pointer.
46777 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
46778 MS->getPointerInfo().getWithOffset(Offset),
46779 Alignment, MS->getMemOperand()->getFlags());
46780}
46781
46782static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
46783 TargetLowering::DAGCombinerInfo &DCI,
46784 const X86Subtarget &Subtarget) {
46785 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
46786 if (Mst->isCompressingStore())
46787 return SDValue();
46788
46789 EVT VT = Mst->getValue().getValueType();
46790 SDLoc dl(Mst);
46791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46792
46793 if (Mst->isTruncatingStore())
46794 return SDValue();
46795
46796 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
46797 return ScalarStore;
46798
46799 // If the mask value has been legalized to a non-boolean vector, try to
46800 // simplify ops leading up to it. We only demand the MSB of each lane.
46801 SDValue Mask = Mst->getMask();
46802 if (Mask.getScalarValueSizeInBits() != 1) {
46803 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46804 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46805 if (N->getOpcode() != ISD::DELETED_NODE)
46806 DCI.AddToWorklist(N);
46807 return SDValue(N, 0);
46808 }
46809 if (SDValue NewMask =
46810 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46811 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
46812 Mst->getBasePtr(), Mst->getOffset(), NewMask,
46813 Mst->getMemoryVT(), Mst->getMemOperand(),
46814 Mst->getAddressingMode());
46815 }
46816
46817 SDValue Value = Mst->getValue();
46818 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
46819 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
46820 Mst->getMemoryVT())) {
46821 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
46822 Mst->getBasePtr(), Mst->getOffset(), Mask,
46823 Mst->getMemoryVT(), Mst->getMemOperand(),
46824 Mst->getAddressingMode(), true);
46825 }
46826
46827 return SDValue();
46828}
46829
46830static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
46831 TargetLowering::DAGCombinerInfo &DCI,
46832 const X86Subtarget &Subtarget) {
46833 StoreSDNode *St = cast<StoreSDNode>(N);
46834 EVT StVT = St->getMemoryVT();
46835 SDLoc dl(St);
46836 SDValue StoredVal = St->getValue();
46837 EVT VT = StoredVal.getValueType();
46838 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46839
46840 // Convert a store of vXi1 into a store of iX and a bitcast.
46841 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
46842 VT.getVectorElementType() == MVT::i1) {
46843
46844 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46845 StoredVal = DAG.getBitcast(NewVT, StoredVal);
46846
46847 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46848 St->getPointerInfo(), St->getOriginalAlign(),
46849 St->getMemOperand()->getFlags());
46850 }
46851
46852 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
46853 // This will avoid a copy to k-register.
46854 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
46855 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46856 StoredVal.getOperand(0).getValueType() == MVT::i8) {
46857 SDValue Val = StoredVal.getOperand(0);
46858 // We must store zeros to the unused bits.
46859 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
46860 return DAG.getStore(St->getChain(), dl, Val,
46861 St->getBasePtr(), St->getPointerInfo(),
46862 St->getOriginalAlign(),
46863 St->getMemOperand()->getFlags());
46864 }
46865
46866 // Widen v2i1/v4i1 stores to v8i1.
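  // For example (illustrative): a v2i1 value is concatenated with three v2i1
  // zero vectors to form v8i1 before being stored, so the unused upper lanes
  // are written as zeros.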
46867 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
46868 Subtarget.hasAVX512()) {
46869 unsigned NumConcats = 8 / VT.getVectorNumElements();
46870 // We must store zeros to the unused bits.
46871 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
46872 Ops[0] = StoredVal;
46873 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
46874 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46875 St->getPointerInfo(), St->getOriginalAlign(),
46876 St->getMemOperand()->getFlags());
46877 }
46878
46879 // Turn vXi1 stores of constants into a scalar store.
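  // For example (illustrative): a constant v16i1 build_vector is bit-packed
  // into an i16 constant and emitted as a single scalar i16 store.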
46880 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
46881 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
46882 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
46883 // If it's a v64i1 store without 64-bit support, we need two stores.
46884 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
46885 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
46886 StoredVal->ops().slice(0, 32));
46887 Lo = combinevXi1ConstantToInteger(Lo, DAG);
46888 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
46889 StoredVal->ops().slice(32, 32));
46890 Hi = combinevXi1ConstantToInteger(Hi, DAG);
46891
46892 SDValue Ptr0 = St->getBasePtr();
46893 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
46894
46895 SDValue Ch0 =
46896 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
46897 St->getOriginalAlign(),
46898 St->getMemOperand()->getFlags());
46899 SDValue Ch1 =
46900 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
46901 St->getPointerInfo().getWithOffset(4),
46902 St->getOriginalAlign(),
46903 St->getMemOperand()->getFlags());
46904 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
46905 }
46906
46907 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
46908 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46909 St->getPointerInfo(), St->getOriginalAlign(),
46910 St->getMemOperand()->getFlags());
46911 }
46912
46913 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
46914 // Sandy Bridge, perform two 16-byte stores.
46915 bool Fast;
46916 if (VT.is256BitVector() && StVT == VT &&
46917 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46918 *St->getMemOperand(), &Fast) &&
46919 !Fast) {
46920 unsigned NumElems = VT.getVectorNumElements();
46921 if (NumElems < 2)
46922 return SDValue();
46923
46924 return splitVectorStore(St, DAG);
46925 }
46926
46927 // Split under-aligned vector non-temporal stores.
46928 if (St->isNonTemporal() && StVT == VT &&
46929 St->getAlignment() < VT.getStoreSize()) {
46930 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
46931 // vectors or the legalizer can scalarize it to use MOVNTI.
46932 if (VT.is256BitVector() || VT.is512BitVector()) {
46933 unsigned NumElems = VT.getVectorNumElements();
46934 if (NumElems < 2)
46935 return SDValue();
46936 return splitVectorStore(St, DAG);
46937 }
46938
46939 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
46940 // to use MOVNTI.
46941 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
46942 MVT NTVT = Subtarget.hasSSE4A()
46943 ? MVT::v2f64
46944 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
46945 return scalarizeVectorStore(St, NTVT, DAG);
46946 }
46947 }
46948
46949 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
46950 // supported, but AVX512F is, by extending to v16i32 and truncating.
46951 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
46952 St->getValue().getOpcode() == ISD::TRUNCATE &&
46953 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
46954 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
46955 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
46956 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
46957 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
46958 MVT::v16i8, St->getMemOperand());
46959 }
46960
46961 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
46962 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
46963 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
46964 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
46965 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
46966 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
46967 return EmitTruncSStore(IsSigned, St->getChain(),
46968 dl, StoredVal.getOperand(0), St->getBasePtr(),
46969 VT, St->getMemOperand(), DAG);
46970 }
46971
46972 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
46973 if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
46974 auto IsExtractedElement = [](SDValue V) {
46975 if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
46976 V = V.getOperand(0);
46977 unsigned Opc = V.getOpcode();
46978 if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
46979 if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
46980 return V.getOperand(0);
46981 }
46982 return SDValue();
46983 };
46984 if (SDValue Extract = IsExtractedElement(StoredVal)) {
46985 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
46986 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
46987 SDValue Src = Trunc.getOperand(0);
46988 MVT DstVT = Trunc.getSimpleValueType();
46989 MVT SrcVT = Src.getSimpleValueType();
46990 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46991 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
46992 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
46993 if (NumTruncBits == VT.getSizeInBits() &&
46994 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
46995 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
46996 TruncVT, St->getMemOperand());
46997 }
46998 }
46999 }
47000 }
47001
47002 // Optimize trunc store (of multiple scalars) to shuffle and store.
47003 // First, pack all of the elements in one place. Next, store to memory
47004 // in fewer chunks.
47005 if (St->isTruncatingStore() && VT.isVector()) {
47006 // Check if we can detect an AVG pattern from the truncation. If yes,
47007 // replace the trunc store by a normal store with the result of X86ISD::AVG
47008 // instruction.
47009 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
47010 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
47011 Subtarget, dl))
47012 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
47013 St->getPointerInfo(), St->getOriginalAlign(),
47014 St->getMemOperand()->getFlags());
47015
47016 if (TLI.isTruncStoreLegal(VT, StVT)) {
47017 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
47018 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
47019 dl, Val, St->getBasePtr(),
47020 St->getMemoryVT(), St->getMemOperand(), DAG);
47021 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
47022 DAG, dl))
47023 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
47024 dl, Val, St->getBasePtr(),
47025 St->getMemoryVT(), St->getMemOperand(), DAG);
47026 }
47027
47028 return SDValue();
47029 }
47030
47031 // Cast ptr32 and ptr64 pointers to the default address space before a store.
47032 unsigned AddrSpace = St->getAddressSpace();
47033 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
47034 AddrSpace == X86AS::PTR32_UPTR) {
47035 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
47036 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
47037 SDValue Cast =
47038 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
47039 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
47040 St->getPointerInfo(), St->getOriginalAlign(),
47041 St->getMemOperand()->getFlags(), St->getAAInfo());
47042 }
47043 }
47044
47045 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
47046 // the FP state in cases where an emms may be missing.
47047 // A preferable solution to the general problem is to figure out the right
47048 // places to insert EMMS. This qualifies as a quick hack.
47049
47050 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
47051 if (VT.getSizeInBits() != 64)
47052 return SDValue();
47053
47054 const Function &F = DAG.getMachineFunction().getFunction();
47055 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
47056 bool F64IsLegal =
47057 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
47058 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
47059 isa<LoadSDNode>(St->getValue()) &&
47060 cast<LoadSDNode>(St->getValue())->isSimple() &&
47061 St->getChain().hasOneUse() && St->isSimple()) {
47062 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
47063
47064 if (!ISD::isNormalLoad(Ld))
47065 return SDValue();
47066
47067 // Avoid the transformation if there are multiple uses of the loaded value.
47068 if (!Ld->hasNUsesOfValue(1, 0))
47069 return SDValue();
47070
47071 SDLoc LdDL(Ld);
47072 SDLoc StDL(N);
47073 // Lower to a single movq load/store pair.
47074 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
47075 Ld->getBasePtr(), Ld->getMemOperand());
47076
47077 // Make sure new load is placed in same chain order.
47078 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
47079 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
47080 St->getMemOperand());
47081 }
47082
47083 // This is similar to the above case, but here we handle a scalar 64-bit
47084 // integer store that is extracted from a vector on a 32-bit target.
47085 // If we have SSE2, then we can treat it like a floating-point double
47086 // to get past legalization. The execution dependencies fixup pass will
47087 // choose the optimal machine instruction for the store if this really is
47088 // an integer or v2f32 rather than an f64.
47089 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
47090 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
47091 SDValue OldExtract = St->getOperand(1);
47092 SDValue ExtOp0 = OldExtract.getOperand(0);
47093 unsigned VecSize = ExtOp0.getValueSizeInBits();
47094 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
47095 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
47096 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
47097 BitCast, OldExtract.getOperand(1));
47098 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
47099 St->getPointerInfo(), St->getOriginalAlign(),
47100 St->getMemOperand()->getFlags());
47101 }
47102
47103 return SDValue();
47104}
47105
47106static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
47107 TargetLowering::DAGCombinerInfo &DCI,
47108 const X86Subtarget &Subtarget) {
47109 auto *St = cast<MemIntrinsicSDNode>(N);
47110
47111 SDValue StoredVal = N->getOperand(1);
47112 MVT VT = StoredVal.getSimpleValueType();
47113 EVT MemVT = St->getMemoryVT();
47114
47115 // Figure out which elements we demand.
47116 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
47117 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
47118
47119 APInt KnownUndef, KnownZero;
47120 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47121 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
47122 KnownZero, DCI)) {
47123 if (N->getOpcode() != ISD::DELETED_NODE)
47124 DCI.AddToWorklist(N);
47125 return SDValue(N, 0);
47126 }
47127
47128 return SDValue();
47129}
47130
47131/// Return 'true' if this vector operation is "horizontal"
47132/// and return the operands for the horizontal operation in LHS and RHS. A
47133/// horizontal operation performs the binary operation on successive elements
47134/// of its first operand, then on successive elements of its second operand,
47135/// returning the resulting values in a vector. For example, if
47136/// A = < float a0, float a1, float a2, float a3 >
47137/// and
47138/// B = < float b0, float b1, float b2, float b3 >
47139/// then the result of doing a horizontal operation on A and B is
47140/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
47141/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
47142/// A horizontal-op B, for some already available A and B, and if so then LHS is
47143/// set to A, RHS to B, and the routine returns 'true'.
47144static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
47145 SelectionDAG &DAG, const X86Subtarget &Subtarget,
47146 bool IsCommutative,
47147 SmallVectorImpl<int> &PostShuffleMask) {
47148 // If either operand is undef, bail out. The binop should be simplified.
47149 if (LHS.isUndef() || RHS.isUndef())
47150 return false;
47151
47152 // Look for the following pattern:
47153 // A = < float a0, float a1, float a2, float a3 >
47154 // B = < float b0, float b1, float b2, float b3 >
47155 // and
47156 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
47157 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
47158 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
47159 // which is A horizontal-op B.
47160
47161 MVT VT = LHS.getSimpleValueType();
47162 assert((VT.is128BitVector() || VT.is256BitVector()) &&
47163 "Unsupported vector type for horizontal add/sub");
47164 unsigned NumElts = VT.getVectorNumElements();
47165
47166 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
47167 SmallVectorImpl<int> &ShuffleMask) {
47168 bool UseSubVector = false;
47169 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47170 Op.getOperand(0).getValueType().is256BitVector() &&
47171 llvm::isNullConstant(Op.getOperand(1))) {
47172 Op = Op.getOperand(0);
47173 UseSubVector = true;
47174 }
47175 SmallVector<SDValue, 2> SrcOps;
47176 SmallVector<int, 16> SrcMask, ScaledMask;
47177 SDValue BC = peekThroughBitcasts(Op);
47178 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
47179 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
47180 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
47181 })) {
47182 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
47183 if (!UseSubVector && SrcOps.size() <= 2 &&
47184 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
47185 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
47186 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
47187 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
47188 }
47189 if (UseSubVector && SrcOps.size() == 1 &&
47190 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
47191 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
47192 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
47193 ShuffleMask.assign(Mask.begin(), Mask.end());
47194 }
47195 }
47196 };
47197
47198 // View LHS in the form
47199 // LHS = VECTOR_SHUFFLE A, B, LMask
47200 // If LHS is not a shuffle, then pretend it is the identity shuffle:
47201 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
47202 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
47203 SDValue A, B;
47204 SmallVector<int, 16> LMask;
47205 GetShuffle(LHS, A, B, LMask);
47206
47207 // Likewise, view RHS in the form
47208 // RHS = VECTOR_SHUFFLE C, D, RMask
47209 SDValue C, D;
47210 SmallVector<int, 16> RMask;
47211 GetShuffle(RHS, C, D, RMask);
47212
47213 // At least one of the operands should be a vector shuffle.
47214 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
47215 if (NumShuffles == 0)
47216 return false;
47217
47218 if (LMask.empty()) {
47219 A = LHS;
47220 for (unsigned i = 0; i != NumElts; ++i)
47221 LMask.push_back(i);
47222 }
47223
47224 if (RMask.empty()) {
47225 C = RHS;
47226 for (unsigned i = 0; i != NumElts; ++i)
47227 RMask.push_back(i);
47228 }
47229
47230 // If we have a unary mask, ensure the other op is set to null.
47231 if (isUndefOrInRange(LMask, 0, NumElts))
47232 B = SDValue();
47233 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
47234 A = SDValue();
47235
47236 if (isUndefOrInRange(RMask, 0, NumElts))
47237 D = SDValue();
47238 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
47239 C = SDValue();
47240
47241 // If A and B occur in reverse order in RHS, then canonicalize by commuting
47242 // RHS operands and shuffle mask.
47243 if (A != C) {
47244 std::swap(C, D);
47245 ShuffleVectorSDNode::commuteMask(RMask);
47246 }
47247 // Check that the shuffles are both shuffling the same vectors.
47248 if (!(A == C && B == D))
47249 return false;
47250
47251 PostShuffleMask.clear();
47252 PostShuffleMask.append(NumElts, SM_SentinelUndef);
47253
47254 // LHS and RHS are now:
47255 // LHS = shuffle A, B, LMask
47256 // RHS = shuffle A, B, RMask
47257 // Check that the masks correspond to performing a horizontal operation.
47258 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
47259 // so we just repeat the inner loop if this is a 256-bit op.
47260 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
47261 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
47262 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
47263 assert((NumEltsPer128BitChunk % 2 == 0) &&
47264 "Vector type should have an even number of elements in each lane");
47265 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
47266 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
47267 // Ignore undefined components.
47268 int LIdx = LMask[i + j], RIdx = RMask[i + j];
47269 if (LIdx < 0 || RIdx < 0 ||
47270 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
47271 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
47272 continue;
47273
47274 // Check that successive odd/even elements are being operated on. If not,
47275 // this is not a horizontal operation.
47276 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
47277 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
47278 return false;
47279
47280 // Compute the post-shuffle mask index based on where the element
47281 // is stored in the HOP result, and where it needs to be moved to.
47282 int Base = LIdx & ~1u;
47283 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
47284 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
47285
47286 // The low half of the 128-bit result must choose from A.
47287 // The high half of the 128-bit result must choose from B,
47288 // unless B is undef. In that case, we are always choosing from A.
47289 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
47290 Index += NumEltsPer64BitChunk;
47291 PostShuffleMask[i + j] = Index;
47292 }
47293 }
47294
47295 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
47296 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
47297
47298 bool IsIdentityPostShuffle =
47299 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
47300 if (IsIdentityPostShuffle)
47301 PostShuffleMask.clear();
47302
47303 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
47304 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
47305 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
47306 return false;
47307
47308 // If the source nodes are already used in HorizOps then always accept this.
47309 // Shuffle folding should merge these back together.
47310 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
47311 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
47312 });
47313 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
47314 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
47315 });
47316 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
47317
47318 // Assume a SingleSource HOP if we only shuffle one input and don't need to
47319 // shuffle the result.
47320 if (!ForceHorizOp &&
47321 !shouldUseHorizontalOp(NewLHS == NewRHS &&
47322 (NumShuffles < 2 || !IsIdentityPostShuffle),
47323 DAG, Subtarget))
47324 return false;
47325
47326 LHS = DAG.getBitcast(VT, NewLHS);
47327 RHS = DAG.getBitcast(VT, NewRHS);
47328 return true;
47329}
47330
47331// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
47332static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
47333 const X86Subtarget &Subtarget) {
47334 EVT VT = N->getValueType(0);
47335 unsigned Opcode = N->getOpcode();
47336 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
47337 SmallVector<int, 8> PostShuffleMask;
47338
47339 switch (Opcode) {
47340 case ISD::FADD:
47341 case ISD::FSUB:
47342 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
47343 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
47344 SDValue LHS = N->getOperand(0);
47345 SDValue RHS = N->getOperand(1);
47346 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
47347 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
47348 PostShuffleMask)) {
47349 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
47350 if (!PostShuffleMask.empty())
47351 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
47352 DAG.getUNDEF(VT), PostShuffleMask);
47353 return HorizBinOp;
47354 }
47355 }
47356 break;
47357 case ISD::ADD:
47358 case ISD::SUB:
47359 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
47360 VT == MVT::v16i16 || VT == MVT::v8i32)) {
47361 SDValue LHS = N->getOperand(0);
47362 SDValue RHS = N->getOperand(1);
47363 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
47364 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
47365 PostShuffleMask)) {
47366 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
47367 ArrayRef<SDValue> Ops) {
47368 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
47369 };
47370 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
47371 {LHS, RHS}, HOpBuilder);
47372 if (!PostShuffleMask.empty())
47373 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
47374 DAG.getUNDEF(VT), PostShuffleMask);
47375 return HorizBinOp;
47376 }
47377 }
47378 break;
47379 }
47380
47381 return SDValue();
47382}
47383
47384// Try to combine the following nodes
47385// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
47386// <i32 -2147483648[float -0.000000e+00]> 0
47387// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
47388// <(load 4 from constant-pool)> t0, t29
47389// [t30: v16i32 = bitcast t27]
47390// t6: v16i32 = xor t7, t27[t30]
47391// t11: v16f32 = bitcast t6
47392// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
47393// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
47394// t22: v16f32 = bitcast t7
47395// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
47396// t24: v32f16 = bitcast t23
47397static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
47398 const X86Subtarget &Subtarget) {
47399 EVT VT = N->getValueType(0);
47400 SDValue LHS = N->getOperand(0);
47401 SDValue RHS = N->getOperand(1);
47402 int CombineOpcode =
47403 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
47404 auto isConjugationConstant = [](const Constant *c) {
47405 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
47406 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
47407 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
47408 switch (CI->getBitWidth()) {
47409 case 16:
47410 return false;
47411 case 32:
47412 return CI->getValue() == ConjugationInt32;
47413 case 64:
47414 return CI->getValue() == ConjugationInt64;
47415 default:
47416 llvm_unreachable("Unexpected bit width");
47417 }
47418 }
47419 if (const auto *CF = dyn_cast<ConstantFP>(c))
47420 return CF->isNegativeZeroValue();
47421 return false;
47422 };
47423 auto combineConjugation = [&](SDValue &r) {
47424 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
47425 SDValue XOR = LHS.getOperand(0);
47426 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
47427 SDValue XORRHS = XOR.getOperand(1);
47428 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
47429 XORRHS = XORRHS.getOperand(0);
47430 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
47431 XORRHS.getOperand(1).getNumOperands()) {
47432 ConstantPoolSDNode *CP =
47433 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
47434 if (CP && isConjugationConstant(CP->getConstVal())) {
47435 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
47436 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
47437 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
47438 r = DAG.getBitcast(VT, FCMulC);
47439 return true;
47440 }
47441 }
47442 }
47443 }
47444 return false;
47445 };
47446 SDValue Res;
47447 if (combineConjugation(Res))
47448 return Res;
47449 std::swap(LHS, RHS);
47450 if (combineConjugation(Res))
47451 return Res;
47452 return Res;
47453}
47454
47455// Try to combine the following nodes
47456// t21: v16f32 = X86ISD::VFMULC/VFCMULC t7, t8
47457// t15: v32f16 = bitcast t21
47458// t16: v32f16 = fadd nnan ninf nsz arcp contract afn reassoc t15, t2
47459// into X86ISD::VFMADDC/VFCMADDC if possible:
47460// t22: v16f32 = bitcast t2
47461// t23: v16f32 = nnan ninf nsz arcp contract afn reassoc
47462// X86ISD::VFMADDC/VFCMADDC t7, t8, t22
47463// t24: v32f16 = bitcast t23
47464static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
47465 const X86Subtarget &Subtarget) {
47466 auto AllowContract = [&DAG](SDNode *N) {
47467 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
47468 N->getFlags().hasAllowContract();
47469 };
47470 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() || !AllowContract(N))
47471 return SDValue();
47472
47473 EVT VT = N->getValueType(0);
47474 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
47475 return SDValue();
47476
47477 SDValue LHS = N->getOperand(0);
47478 SDValue RHS = N->getOperand(1);
47479 SDValue CFmul, FAddOp1;
47480 auto GetCFmulFrom = [&CFmul, &AllowContract](SDValue N) -> bool {
47481 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
47482 return false;
47483 SDValue Op0 = N.getOperand(0);
47484 unsigned Opcode = Op0.getOpcode();
47485 if (Op0.hasOneUse() && AllowContract(Op0.getNode()) &&
47486 (Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC))
47487 CFmul = Op0;
47488 return !!CFmul;
47489 };
47490
47491 if (GetCFmulFrom(LHS))
47492 FAddOp1 = RHS;
47493 else if (GetCFmulFrom(RHS))
47494 FAddOp1 = LHS;
47495 else
47496 return SDValue();
47497
47498 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
47499 assert(CFmul->getValueType(0) == CVT && "Complex type mismatch");
47500 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
47501 unsigned newOp = CFmul.getOpcode() == X86ISD::VFMULC ? X86ISD::VFMADDC
47502 : X86ISD::VFCMADDC;
47503 // FIXME: How do we handle when fast math flags of FADD are different from
47504 // CFMUL's?
47505 CFmul = DAG.getNode(newOp, SDLoc(N), CVT, FAddOp1, CFmul.getOperand(0),
47506 CFmul.getOperand(1), N->getFlags());
47507 return DAG.getBitcast(VT, CFmul);
47508}
47509
47510/// Do target-specific dag combines on floating-point adds/subs.
47511static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
47512 const X86Subtarget &Subtarget) {
47513 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
47514 return HOp;
47515
47516 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
47517 return COp;
47518
47519 return SDValue();
47520}
47521
47522/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
47523/// the codegen.
47524/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
47525/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
47526/// anything that is guaranteed to be transformed by DAGCombiner.
47527static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
47528 const X86Subtarget &Subtarget,
47529 const SDLoc &DL) {
47530 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
47531 SDValue Src = N->getOperand(0);
47532 unsigned SrcOpcode = Src.getOpcode();
47533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47534
47535 EVT VT = N->getValueType(0);
47536 EVT SrcVT = Src.getValueType();
47537
47538 auto IsFreeTruncation = [VT](SDValue Op) {
47539 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
47540
47541 // See if this has been extended from a smaller/equal size to
47542 // the truncation size, allowing a truncation to combine with the extend.
47543 unsigned Opcode = Op.getOpcode();
47544 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
47545 Opcode == ISD::ZERO_EXTEND) &&
47546 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
47547 return true;
47548
47549 // See if this is a single use constant which can be constant folded.
47550 // NOTE: We don't peek through bitcasts here because there is currently
47551 // no support for constant folding truncate+bitcast+vector_of_constants. So
47552 // we'll just end up with a truncate on both operands which will
47553 // get turned back into (truncate (binop)) causing an infinite loop.
47554 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
47555 };
47556
47557 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
47558 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
47559 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
47560 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
47561 };
47562
47563 // Don't combine if the operation has other uses.
47564 if (!Src.hasOneUse())
47565 return SDValue();
47566
47567 // Only support vector truncation for now.
47568 // TODO: i64 scalar math would benefit as well.
47569 if (!VT.isVector())
47570 return SDValue();
47571
47572 // In most cases it's only worth pre-truncating if we're only facing the cost
47573 // of one truncation.
47574 // i.e. if one of the inputs will constant fold or the input is repeated.
47575 switch (SrcOpcode) {
47576 case ISD::MUL:
47577 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
47578 // better to truncate if we have the chance.
47579 if (SrcVT.getScalarType() == MVT::i64 &&
47580 TLI.isOperationLegal(SrcOpcode, VT) &&
47581 !TLI.isOperationLegal(SrcOpcode, SrcVT))
47582 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
47583 LLVM_FALLTHROUGH;
47584 case ISD::AND:
47585 case ISD::XOR:
47586 case ISD::OR:
47587 case ISD::ADD:
47588 case ISD::SUB: {
47589 SDValue Op0 = Src.getOperand(0);
47590 SDValue Op1 = Src.getOperand(1);
47591 if (TLI.isOperationLegal(SrcOpcode, VT) &&
47592 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
47593 return TruncateArithmetic(Op0, Op1);
47594 break;
47595 }
47596 }
47597
47598 return SDValue();
47599}
47600
47601/// Truncate using ISD::AND mask and X86ISD::PACKUS.
47602/// e.g. trunc <8 x i32> X to <8 x i16> -->
47603/// MaskX = X & 0xffff (clear high bits to prevent saturation)
47604/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
47605static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
47606 const X86Subtarget &Subtarget,
47607 SelectionDAG &DAG) {
47608 SDValue In = N->getOperand(0);
47609 EVT InVT = In.getValueType();
47610 EVT OutVT = N->getValueType(0);
47611
47612 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
47613 OutVT.getScalarSizeInBits());
47614 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
47615 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
47616}
47617
47618/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
47619static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
47620 const X86Subtarget &Subtarget,
47621 SelectionDAG &DAG) {
47622 SDValue In = N->getOperand(0);
47623 EVT InVT = In.getValueType();
47624 EVT OutVT = N->getValueType(0);
47625 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
47626 DAG.getValueType(OutVT));
47627 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
47628}
47629
47630/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
47631/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
47632/// legalization the truncation will be translated into a BUILD_VECTOR with each
47633/// element that is extracted from a vector and then truncated, and it is
47634/// difficult to do this optimization based on them.
47635static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
47636 const X86Subtarget &Subtarget) {
47637 EVT OutVT = N->getValueType(0);
47638 if (!OutVT.isVector())
47639 return SDValue();
47640
47641 SDValue In = N->getOperand(0);
47642 if (!In.getValueType().isSimple())
47643 return SDValue();
47644
47645 EVT InVT = In.getValueType();
47646 unsigned NumElems = OutVT.getVectorNumElements();
47647
47648 // AVX512 provides fast truncate ops.
47649 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47650 return SDValue();
47651
47652 EVT OutSVT = OutVT.getVectorElementType();
47653 EVT InSVT = InVT.getVectorElementType();
47654 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
47655 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
47656 NumElems >= 8))
47657 return SDValue();
47658
47659 // SSSE3's pshufb results in fewer instructions in the cases below.
47660 if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
47661 return SDValue();
47662
47663 SDLoc DL(N);
47664 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
47665 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
47666 // truncate 2 x v4i32 to v8i16.
47667 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
47668 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
47669 if (InSVT == MVT::i32)
47670 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
47671
47672 return SDValue();
47673}
47674
47675/// This function transforms vector truncation of 'extended sign-bits' or
47676/// 'extended zero-bits' values (i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32)
47677/// into X86ISD::PACKSS/PACKUS operations.
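/// For example (illustrative): truncating a vector comparison result (whose
/// elements are all sign bits) from vXi32 to vXi16 can use X86ISD::PACKSS,
/// while truncating a value zero-extended in-register can use X86ISD::PACKUS.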
47678static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
47679 SelectionDAG &DAG,
47680 const X86Subtarget &Subtarget) {
47681 // Requires SSE2.
47682 if (!Subtarget.hasSSE2())
47683 return SDValue();
47684
47685 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
47686 return SDValue();
47687
47688 SDValue In = N->getOperand(0);
47689 if (!In.getValueType().isSimple())
47690 return SDValue();
47691
47692 MVT VT = N->getValueType(0).getSimpleVT();
47693 MVT SVT = VT.getScalarType();
47694
47695 MVT InVT = In.getValueType().getSimpleVT();
47696 MVT InSVT = InVT.getScalarType();
47697
47698 // Check we have a truncation suited for PACKSS/PACKUS.
47699 if (!isPowerOf2_32(VT.getVectorNumElements()))
47700 return SDValue();
47701 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
47702 return SDValue();
47703 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
47704 return SDValue();
47705
47706 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
47707 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
47708 return SDValue();
47709
47710 // AVX512 has fast truncate, but if the input is already going to be split,
47711 // there's no harm in trying pack.
47712 if (Subtarget.hasAVX512() &&
47713 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
47714 InVT.is512BitVector())) {
47715 // PACK should still be worth it for 128-bit vectors if the sources were
47716 // originally concatenated from subvectors.
47717 SmallVector<SDValue> ConcatOps;
47718 if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
47719 return SDValue();
47720 }
47721
47722 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
47723 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
47724
47725 // Use PACKUS if the input has zero-bits that extend all the way to the
47726 // packed/truncated value. e.g. masks, zext_in_reg, etc.
47727 KnownBits Known = DAG.computeKnownBits(In);
47728 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
47729 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
47730 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
47731
47732 // Use PACKSS if the input has sign-bits that extend all the way to the
47733 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
47734 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
47735
47736 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
47737 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
47738 // on and combines/simplifications can't then use it.
47739 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
47740 return SDValue();
47741
47742 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
47743 if (NumSignBits > MinSignBits)
47744 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
47745
47746 // If we have a srl that only generates signbits that we will discard in
47747 // the truncation then we can use PACKSS by converting the srl to a sra.
47748 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
47749 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
47750 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
47751 In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
47752 if (*ShAmt == MinSignBits) {
47753 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
47754 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
47755 Subtarget);
47756 }
47757 }
47758
47759 return SDValue();
47760}
47761
47762// Try to form a MULHU or MULHS node by looking for
47763// (trunc (srl (mul ext, ext), 16))
47764// TODO: This is X86 specific because we want to be able to handle wide types
47765// before type legalization. But we can only do it if the vector will be
47766// legalized via widening/splitting. Type legalization can't handle promotion
47767// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47768// combiner.
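// For example (illustrative):
//   (v8i16 trunc (v8i32 srl (v8i32 mul (sext A), (sext B)), 16))
// becomes (v8i16 mulhs A, B); the zero-extended form becomes mulhu.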
47769static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
47770 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47771 // First instruction should be a right shift of a multiply.
47772 if (Src.getOpcode() != ISD::SRL ||
47773 Src.getOperand(0).getOpcode() != ISD::MUL)
47774 return SDValue();
47775
47776 if (!Subtarget.hasSSE2())
47777 return SDValue();
47778
47779 // Only handle vXi16 types that are at least 128-bits unless they will be
47780 // widened.
47781 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
47782 return SDValue();
47783
47784 // Input type should be at least vXi32.
47785 EVT InVT = Src.getValueType();
47786 if (InVT.getVectorElementType().getSizeInBits() < 32)
47787 return SDValue();
47788
47789 // Need a shift by 16.
47790 APInt ShiftAmt;
47791 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
47792 ShiftAmt != 16)
47793 return SDValue();
47794
47795 SDValue LHS = Src.getOperand(0).getOperand(0);
47796 SDValue RHS = Src.getOperand(0).getOperand(1);
47797
47798 unsigned ExtOpc = LHS.getOpcode();
47799 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47800 RHS.getOpcode() != ExtOpc)
47801 return SDValue();
47802
47803 // Peek through the extends.
47804 LHS = LHS.getOperand(0);
47805 RHS = RHS.getOperand(0);
47806
47807 // Ensure the input types match.
47808 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
47809 return SDValue();
47810
47811 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47812 return DAG.getNode(Opc, DL, VT, LHS, RHS);
47813}
47814
47815// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
47816// from one vector with signed bytes from another vector, adds together
47817// adjacent pairs of 16-bit products, and saturates the result before
47818// truncating to 16-bits.
47819//
47820// Which looks something like this:
47821// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
47822 // (mul (zext (odd elts (i8 A))), (sext (odd elts (i8 B)))))))
47823static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
47824 const X86Subtarget &Subtarget,
47825 const SDLoc &DL) {
47826 if (!VT.isVector() || !Subtarget.hasSSSE3())
47827 return SDValue();
47828
47829 unsigned NumElems = VT.getVectorNumElements();
47830 EVT ScalarVT = VT.getVectorElementType();
47831 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
47832 return SDValue();
47833
47834 SDValue SSatVal = detectSSatPattern(In, VT);
47835 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
47836 return SDValue();
47837
47838 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
47839 // of multiplies from even/odd elements.
47840 SDValue N0 = SSatVal.getOperand(0);
47841 SDValue N1 = SSatVal.getOperand(1);
47842
47843 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
47844 return SDValue();
47845
47846 SDValue N00 = N0.getOperand(0);
47847 SDValue N01 = N0.getOperand(1);
47848 SDValue N10 = N1.getOperand(0);
47849 SDValue N11 = N1.getOperand(1);
47850
47851 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
47852 // Canonicalize zero_extend to LHS.
47853 if (N01.getOpcode() == ISD::ZERO_EXTEND)
47854 std::swap(N00, N01);
47855 if (N11.getOpcode() == ISD::ZERO_EXTEND)
47856 std::swap(N10, N11);
47857
47858 // Ensure we have a zero_extend and a sign_extend.
47859 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
47860 N01.getOpcode() != ISD::SIGN_EXTEND ||
47861 N10.getOpcode() != ISD::ZERO_EXTEND ||
47862 N11.getOpcode() != ISD::SIGN_EXTEND)
47863 return SDValue();
47864
47865 // Peek through the extends.
47866 N00 = N00.getOperand(0);
47867 N01 = N01.getOperand(0);
47868 N10 = N10.getOperand(0);
47869 N11 = N11.getOperand(0);
47870
47871 // Ensure the extend is from vXi8.
47872 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
47873 N01.getValueType().getVectorElementType() != MVT::i8 ||
47874 N10.getValueType().getVectorElementType() != MVT::i8 ||
47875 N11.getValueType().getVectorElementType() != MVT::i8)
47876 return SDValue();
47877
47878 // All inputs should be build_vectors.
47879 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
47880 N01.getOpcode() != ISD::BUILD_VECTOR ||
47881 N10.getOpcode() != ISD::BUILD_VECTOR ||
47882 N11.getOpcode() != ISD::BUILD_VECTOR)
47883 return SDValue();
47884
47885 // N00/N10 are zero extended. N01/N11 are sign extended.
47886
47887 // For each element, we need to ensure we have an odd element from one vector
47888 // multiplied by the odd element of another vector and the even element from
47889 // one of the same vectors being multiplied by the even element from the
47890 // other vector. So we need to make sure that for each element i, this
47891 // operation is performed:
47892 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
47893 SDValue ZExtIn, SExtIn;
47894 for (unsigned i = 0; i != NumElems; ++i) {
47895 SDValue N00Elt = N00.getOperand(i);
47896 SDValue N01Elt = N01.getOperand(i);
47897 SDValue N10Elt = N10.getOperand(i);
47898 SDValue N11Elt = N11.getOperand(i);
47899 // TODO: Be more tolerant to undefs.
47900 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47901 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47902 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47903 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
47904 return SDValue();
47905 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
47906 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
47907 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
47908 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
47909 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
47910 return SDValue();
47911 unsigned IdxN00 = ConstN00Elt->getZExtValue();
47912 unsigned IdxN01 = ConstN01Elt->getZExtValue();
47913 unsigned IdxN10 = ConstN10Elt->getZExtValue();
47914 unsigned IdxN11 = ConstN11Elt->getZExtValue();
47915 // Add is commutative so indices can be reordered.
47916 if (IdxN00 > IdxN10) {
47917 std::swap(IdxN00, IdxN10);
47918 std::swap(IdxN01, IdxN11);
47919 }
47920 // N0 indices must be the even element. N1 indices must be the next odd element.
47921 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
47922 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
47923 return SDValue();
47924 SDValue N00In = N00Elt.getOperand(0);
47925 SDValue N01In = N01Elt.getOperand(0);
47926 SDValue N10In = N10Elt.getOperand(0);
47927 SDValue N11In = N11Elt.getOperand(0);
47928 // The first time we find an input, capture it.
47929 if (!ZExtIn) {
47930 ZExtIn = N00In;
47931 SExtIn = N01In;
47932 }
47933 if (ZExtIn != N00In || SExtIn != N01In ||
47934 ZExtIn != N10In || SExtIn != N11In)
47935 return SDValue();
47936 }
47937
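// All checks passed: ZExtIn and SExtIn are the original vXi8 inputs. Build
// the X86ISD::VPMADDUBSW node (split to legal widths by SplitOpsAndApply)
// with the zero-extended input first and the sign-extended input second,
// e.g. (v8i16 VPMADDUBSW (v16i8 A), (v16i8 B)).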
47938 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47939 ArrayRef<SDValue> Ops) {
47940 // Shrink by adding truncate nodes and let DAGCombine fold with the
47941 // sources.
47942 EVT InVT = Ops[0].getValueType();
47943 assert(InVT.getScalarType() == MVT::i8 &&
47944 "Unexpected scalar element type");
47945 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
47946 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
47947 InVT.getVectorNumElements() / 2);
47948 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
47949 };
47950 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
47951 PMADDBuilder);
47952}
47953
47954static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
47955 const X86Subtarget &Subtarget) {
47956 EVT VT = N->getValueType(0);
47957 SDValue Src = N->getOperand(0);
47958 SDLoc DL(N);
47959
47960 // Attempt to pre-truncate inputs to arithmetic ops instead.
47961 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
47962 return V;
47963
47964 // Try to detect AVG pattern first.
47965 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
47966 return Avg;
47967
47968 // Try to detect PMADDUBSW.
47969 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
47970 return PMAdd;
47971
47972 // Try to combine truncation with signed/unsigned saturation.
47973 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
47974 return Val;
47975
47976 // Try to combine PMULHUW/PMULHW for vXi16.
47977 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
47978 return V;
47979
47980 // The bitcast source is a direct mmx result.
47981 // Detect bitcasts from x86mmx being truncated to i32.
47982 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
47983 SDValue BCSrc = Src.getOperand(0);
47984 if (BCSrc.getValueType() == MVT::x86mmx)
47985 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
47986 }
47987
47988 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
47989 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
47990 return V;
47991
47992 return combineVectorTruncation(N, DAG, Subtarget);
47993}
47994
47995static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
47996 TargetLowering::DAGCombinerInfo &DCI) {
47997 EVT VT = N->getValueType(0);
47998 SDValue In = N->getOperand(0);
47999 SDLoc DL(N);
48000
48001 if (auto SSatVal = detectSSatPattern(In, VT))
48002 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
48003 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
48004 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
48005
48006 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48007 APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
48008 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48009 return SDValue(N, 0);
48010
48011 return SDValue();
48012}
48013
48014/// Returns the negated value if the node \p N flips sign of FP value.
48015///
48016/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
48017/// or FSUB(0, x)
48018/// AVX512F does not have FXOR, so FNEG is lowered as
48019/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
48020 /// In this case we go through all bitcasts.
48021/// This also recognizes splat of a negated value and returns the splat of that
48022/// value.
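/// For example, for v4f32 this recognizes
/// (v4f32 (bitcast (xor (bitcast X), (splat 0x80000000)))) as a negation and
/// returns X, peeking through any bitcasts.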
48023static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
48024 if (N->getOpcode() == ISD::FNEG)
48025 return N->getOperand(0);
48026
48027 // Don't recurse exponentially.
48028 if (Depth > SelectionDAG::MaxRecursionDepth)
48029 return SDValue();
48030
48031 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
48032
48033 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
48034 EVT VT = Op->getValueType(0);
48035
48036 // Make sure the element size doesn't change.
48037 if (VT.getScalarSizeInBits() != ScalarSize)
48038 return SDValue();
48039
48040 unsigned Opc = Op.getOpcode();
48041 switch (Opc) {
48042 case ISD::VECTOR_SHUFFLE: {
48043 // For a VECTOR_SHUFFLE(VEC1, VEC2), if VEC2 is undef, then the negate
48044 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
48045 if (!Op.getOperand(1).isUndef())
48046 return SDValue();
48047 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
48048 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
48049 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
48050 cast<ShuffleVectorSDNode>(Op)->getMask());
48051 break;
48052 }
48053 case ISD::INSERT_VECTOR_ELT: {
48054 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
48055 // -V, INDEX).
48056 SDValue InsVector = Op.getOperand(0);
48057 SDValue InsVal = Op.getOperand(1);
48058 if (!InsVector.isUndef())
48059 return SDValue();
48060 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
48061 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
48062 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
48063 NegInsVal, Op.getOperand(2));
48064 break;
48065 }
48066 case ISD::FSUB:
48067 case ISD::XOR:
48068 case X86ISD::FXOR: {
48069 SDValue Op1 = Op.getOperand(1);
48070 SDValue Op0 = Op.getOperand(0);
48071
48072 // For XOR and FXOR, we want to check if constant
48073 // bits of Op1 are sign bit masks. For FSUB, we
48074 // have to check if constant bits of Op0 are sign
48075 // bit masks and hence we swap the operands.
48076 if (Opc == ISD::FSUB)
48077 std::swap(Op0, Op1);
48078
48079 APInt UndefElts;
48080 SmallVector<APInt, 16> EltBits;
48081 // Extract constant bits and see if they are all
48082 // sign bit masks. Ignore the undef elements.
48083 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
48084 /* AllowWholeUndefs */ true,
48085 /* AllowPartialUndefs */ false)) {
48086 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
48087 if (!UndefElts[I] && !EltBits[I].isSignMask())
48088 return SDValue();
48089
48090 return peekThroughBitcasts(Op0);
48091 }
48092 }
48093 }
48094
48095 return SDValue();
48096}
48097
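// Map an FMA-family opcode to the variant with its multiply operands,
// accumulator and/or result negated. For example, negating the multiply of
// ISD::FMA gives X86ISD::FNMADD, and additionally negating the accumulator
// gives X86ISD::FNMSUB.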
48098static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
48099 bool NegRes) {
48100 if (NegMul) {
48101 switch (Opcode) {
48102 default: llvm_unreachable("Unexpected opcode");
48103 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
48104 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
48105 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
48106 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
48107 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
48108 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
48109 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
48110 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
48111 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
48112 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
48113 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
48114 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
48115 }
48116 }
48117
48118 if (NegAcc) {
48119 switch (Opcode) {
48120 default: llvm_unreachable("Unexpected opcode");
48121 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
48122 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
48123 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
48124 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
48125 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
48126 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
48127 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
48128 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
48129 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
48130 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
48131 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
48132 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
48133 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
48134 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
48135 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
48136 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
48137 }
48138 }
48139
48140 if (NegRes) {
48141 switch (Opcode) {
48142 // For accuracy reasons, we never combine fneg and fma under strict FP.
48143 default: llvm_unreachable("Unexpected opcode");
48144 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
48145 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
48146 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
48147 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
48148 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
48149 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
48150 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
48151 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
48152 }
48153 }
48154
48155 return Opcode;
48156}
48157
48158/// Do target-specific dag combines on floating point negations.
48159static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
48160 TargetLowering::DAGCombinerInfo &DCI,
48161 const X86Subtarget &Subtarget) {
48162 EVT OrigVT = N->getValueType(0);
48163 SDValue Arg = isFNEG(DAG, N);
48164 if (!Arg)
48165 return SDValue();
48166
48167 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48168 EVT VT = Arg.getValueType();
48169 EVT SVT = VT.getScalarType();
48170 SDLoc DL(N);
48171
48172 // Let legalize expand this if it isn't a legal type yet.
48173 if (!TLI.isTypeLegal(VT))
48174 return SDValue();
48175
48176 // If we're negating a FMUL node on a target with FMA, then we can avoid the
48177 // use of a constant by performing (-0 - A*B) instead.
48178 // FIXME: Check rounding control flags as well once it becomes available.
48179 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
48180 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
48181 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
48182 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
48183 Arg.getOperand(1), Zero);
48184 return DAG.getBitcast(OrigVT, NewNode);
48185 }
48186
48187 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48188 bool LegalOperations = !DCI.isBeforeLegalizeOps();
48189 if (SDValue NegArg =
48190 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
48191 return DAG.getBitcast(OrigVT, NegArg);
48192
48193 return SDValue();
48194}
48195
48196SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
48197 bool LegalOperations,
48198 bool ForCodeSize,
48199 NegatibleCost &Cost,
48200 unsigned Depth) const {
48201 // fneg patterns are removable even if they have multiple uses.
48202 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
48203 Cost = NegatibleCost::Cheaper;
48204 return DAG.getBitcast(Op.getValueType(), Arg);
48205 }
48206
48207 EVT VT = Op.getValueType();
48208 EVT SVT = VT.getScalarType();
48209 unsigned Opc = Op.getOpcode();
48210 SDNodeFlags Flags = Op.getNode()->getFlags();
48211 switch (Opc) {
48212 case ISD::FMA:
48213 case X86ISD::FMSUB:
48214 case X86ISD::FNMADD:
48215 case X86ISD::FNMSUB:
48216 case X86ISD::FMADD_RND:
48217 case X86ISD::FMSUB_RND:
48218 case X86ISD::FNMADD_RND:
48219 case X86ISD::FNMSUB_RND: {
48220 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
48221 !(SVT == MVT::f32 || SVT == MVT::f64) ||
48222 !isOperationLegal(ISD::FMA, VT))
48223 break;
48224
48225 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
48226 // if it may have signed zeros.
48227 if (!Flags.hasNoSignedZeros())
48228 break;
48229
48230 // This is always negatible for free but we might be able to remove some
48231 // extra operand negations as well.
48232 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
48233 for (int i = 0; i != 3; ++i)
48234 NewOps[i] = getCheaperNegatedExpression(
48235 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
48236
48237 bool NegA = !!NewOps[0];
48238 bool NegB = !!NewOps[1];
48239 bool NegC = !!NewOps[2];
48240 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
48241
48242 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
48243 : NegatibleCost::Neutral;
48244
48245 // Fill in the non-negated ops with the original values.
48246 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
48247 if (!NewOps[i])
48248 NewOps[i] = Op.getOperand(i);
48249 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
48250 }
48251 case X86ISD::FRCP:
48252 if (SDValue NegOp0 =
48253 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
48254 ForCodeSize, Cost, Depth + 1))
48255 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
48256 break;
48257 }
48258
48259 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
48260 ForCodeSize, Cost, Depth);
48261}
48262
48263static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
48264 const X86Subtarget &Subtarget) {
48265 MVT VT = N->getSimpleValueType(0);
48266 // If we have integer vector types available, use the integer opcodes.
48267 if (!VT.isVector() || !Subtarget.hasSSE2())
48268 return SDValue();
48269
48270 SDLoc dl(N);
48271
48272 unsigned IntBits = VT.getScalarSizeInBits();
48273 MVT IntSVT = MVT::getIntegerVT(IntBits);
48274 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
48275
48276 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
48277 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
48278 unsigned IntOpcode;
48279 switch (N->getOpcode()) {
48280 default: llvm_unreachable("Unexpected FP logic op");
48281 case X86ISD::FOR: IntOpcode = ISD::OR; break;
48282 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
48283 case X86ISD::FAND: IntOpcode = ISD::AND; break;
48284 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
48285 }
48286 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
48287 return DAG.getBitcast(VT, IntOp);
48288}
48289
48290
48291/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
48292static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
48293 if (N->getOpcode() != ISD::XOR)
48294 return SDValue();
48295
48296 SDValue LHS = N->getOperand(0);
48297 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
48298 return SDValue();
48299
48300 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
48301 X86::CondCode(LHS->getConstantOperandVal(0)));
48302 SDLoc DL(N);
48303 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
48304}
48305
48306static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
48307 TargetLowering::DAGCombinerInfo &DCI,
48308 const X86Subtarget &Subtarget) {
48309 SDValue N0 = N->getOperand(0);
48310 SDValue N1 = N->getOperand(1);
48311 EVT VT = N->getValueType(0);
48312
48313 // If this is SSE1-only, convert to FXOR to avoid scalarization.
48314 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
48315 return DAG.getBitcast(MVT::v4i32,
48316 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
48317 DAG.getBitcast(MVT::v4f32, N0),
48318 DAG.getBitcast(MVT::v4f32, N1)));
48319 }
48320
48321 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
48322 return Cmp;
48323
48324 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
48325 return R;
48326
48327 if (DCI.isBeforeLegalizeOps())
48328 return SDValue();
48329
48330 if (SDValue SetCC = foldXor1SetCC(N, DAG))
48331 return SetCC;
48332
48333 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
48334 return RV;
48335
48336 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
48337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48338 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
48339 N0.getOperand(0).getValueType().isVector() &&
48340 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
48341 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
48342 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
48343 N0.getOperand(0).getValueType()));
48344 }
48345
48346 // Handle AVX512 mask widening.
48347 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
48348 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
48349 VT.getVectorElementType() == MVT::i1 &&
48350 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
48351 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
48352 return DAG.getNode(
48353 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
48354 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
48355 N0.getOperand(2));
48356 }
48357
48358 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
48359 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
48360 // TODO: Under what circumstances could this be performed in DAGCombine?
48361 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
48362 N0.getOperand(0).getOpcode() == N->getOpcode()) {
48363 SDValue TruncExtSrc = N0.getOperand(0);
48364 auto *N1C = dyn_cast<ConstantSDNode>(N1);
48365 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
48366 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
48367 SDLoc DL(N);
48368 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
48369 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
48370 return DAG.getNode(ISD::XOR, DL, VT, LHS,
48371 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
48372 }
48373 }
48374
48375 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
48376 return FPLogic;
48377
48378 return combineFneg(N, DAG, DCI, Subtarget);
48379}
48380
48381static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
48382 TargetLowering::DAGCombinerInfo &DCI,
48383 const X86Subtarget &Subtarget) {
48384 EVT VT = N->getValueType(0);
48385 unsigned NumBits = VT.getSizeInBits();
48386
48387 // TODO - Constant Folding.
48388
48389 // Simplify the inputs.
48390 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48391 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48392 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48393 return SDValue(N, 0);
48394
48395 return SDValue();
48396}
48397
48398static bool isNullFPScalarOrVectorConst(SDValue V) {
48399 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
48400}
48401
48402/// If a value is a scalar FP zero or a vector FP zero (potentially including
48403/// undefined elements), return a zero constant that may be used to fold away
48404/// that value. In the case of a vector, the returned constant will not contain
48405/// undefined elements even if the input parameter does. This makes it suitable
48406 /// to be used as a replacement operand with operations (e.g., bitwise-and) where
48407/// an undef should not propagate.
48408static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
48409 const X86Subtarget &Subtarget) {
48410 if (!isNullFPScalarOrVectorConst(V))
48411 return SDValue();
48412
48413 if (V.getValueType().isVector())
48414 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
48415
48416 return V;
48417}
48418
48419static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
48420 const X86Subtarget &Subtarget) {
48421 SDValue N0 = N->getOperand(0);
48422 SDValue N1 = N->getOperand(1);
48423 EVT VT = N->getValueType(0);
48424 SDLoc DL(N);
48425
48426 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
48427 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
48428 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
48429 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
48430 return SDValue();
48431
48432 auto isAllOnesConstantFP = [](SDValue V) {
48433 if (V.getSimpleValueType().isVector())
48434 return ISD::isBuildVectorAllOnes(V.getNode());
48435 auto *C = dyn_cast<ConstantFPSDNode>(V);
48436 return C && C->getConstantFPValue()->isAllOnesValue();
48437 };
48438
48439 // fand (fxor X, -1), Y --> fandn X, Y
48440 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
48441 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
48442
48443 // fand X, (fxor Y, -1) --> fandn Y, X
48444 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
48445 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
48446
48447 return SDValue();
48448}
48449
48450/// Do target-specific dag combines on X86ISD::FAND nodes.
48451static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
48452 const X86Subtarget &Subtarget) {
48453 // FAND(0.0, x) -> 0.0
48454 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
48455 return V;
48456
48457 // FAND(x, 0.0) -> 0.0
48458 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
48459 return V;
48460
48461 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
48462 return V;
48463
48464 return lowerX86FPLogicOp(N, DAG, Subtarget);
48465}
48466
48467/// Do target-specific dag combines on X86ISD::FANDN nodes.
48468static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
48469 const X86Subtarget &Subtarget) {
48470 // FANDN(0.0, x) -> x
48471 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
48472 return N->getOperand(1);
48473
48474 // FANDN(x, 0.0) -> 0.0
48475 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
48476 return V;
48477
48478 return lowerX86FPLogicOp(N, DAG, Subtarget);
48479}
48480
48481/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
48482static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
48483 TargetLowering::DAGCombinerInfo &DCI,
48484 const X86Subtarget &Subtarget) {
48485 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
48486
48487 // F[X]OR(0.0, x) -> x
48488 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
48489 return N->getOperand(1);
48490
48491 // F[X]OR(x, 0.0) -> x
48492 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
48493 return N->getOperand(0);
48494
48495 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
48496 return NewVal;
48497
48498 return lowerX86FPLogicOp(N, DAG, Subtarget);
48499}
48500
48501/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
48502static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
48503 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
48504
48505 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
48506 if (!DAG.getTarget().Options.NoNaNsFPMath ||
48507 !DAG.getTarget().Options.NoSignedZerosFPMath)
48508 return SDValue();
48509
48510 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
48511 // into FMINC and FMAXC, which are commutative operations.
48512 unsigned NewOp = 0;
48513 switch (N->getOpcode()) {
48514 default: llvm_unreachable("unknown opcode");
48515 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
48516 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
48517 }
48518
48519 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
48520 N->getOperand(0), N->getOperand(1));
48521}
48522
48523static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
48524 const X86Subtarget &Subtarget) {
48525 if (Subtarget.useSoftFloat())
48526 return SDValue();
48527
48528 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48529
48530 EVT VT = N->getValueType(0);
48531 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
48532 (Subtarget.hasSSE2() && VT == MVT::f64) ||
48533 (Subtarget.hasFP16() && VT == MVT::f16) ||
48534 (VT.isVector() && TLI.isTypeLegal(VT))))
48535 return SDValue();
48536
48537 SDValue Op0 = N->getOperand(0);
48538 SDValue Op1 = N->getOperand(1);
48539 SDLoc DL(N);
48540 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
48541
48542 // If we don't have to respect NaN inputs, this is a direct translation to x86
48543 // min/max instructions.
48544 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
48545 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
48546
48547 // If one of the operands is known non-NaN, use the native min/max instructions
48548 // with the non-NaN input as the second operand.
48549 if (DAG.isKnownNeverNaN(Op1))
48550 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
48551 if (DAG.isKnownNeverNaN(Op0))
48552 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
48553
48554 // If we have to respect NaN inputs, this takes at least 3 instructions.
48555 // Favor a library call when operating on a scalar and minimizing code size.
48556 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
48557 return SDValue();
48558
48559 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
48560 VT);
48561
48562 // There are 4 possibilities involving NaN inputs, and these are the required
48563 // outputs:
48564 // Op1
48565 // Num NaN
48566 // ----------------
48567 // Num | Max | Op0 |
48568 // Op0 ----------------
48569 // NaN | Op1 | NaN |
48570 // ----------------
48571 //
48572 // The SSE FP max/min instructions were not designed for this case, but rather
48573 // to implement:
48574 // Min = Op1 < Op0 ? Op1 : Op0
48575 // Max = Op1 > Op0 ? Op1 : Op0
48576 //
48577 // So they always return Op0 if either input is a NaN. However, we can still
48578 // use those instructions for fmaxnum by selecting away a NaN input.
48579
48580 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
48581 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
48582 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
48583
48584 // If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both operands
48585 // are NaN, the NaN value of Op1 is the result.
48586 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
48587}
48588
48589static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
48590 TargetLowering::DAGCombinerInfo &DCI) {
48591 EVT VT = N->getValueType(0);
48592 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48593
48594 APInt KnownUndef, KnownZero;
48595 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
48596 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
48597 KnownZero, DCI))
48598 return SDValue(N, 0);
48599
48600 // Convert a full vector load into vzload when not all bits are needed.
48601 SDValue In = N->getOperand(0);
48602 MVT InVT = In.getSimpleValueType();
48603 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
48604 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
48605 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
48606 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
48607 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
48608 MVT MemVT = MVT::getIntegerVT(NumBits);
48609 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
48610 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
48611 SDLoc dl(N);
48612 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
48613 DAG.getBitcast(InVT, VZLoad));
48614 DCI.CombineTo(N, Convert);
48615 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
48616 DCI.recursivelyDeleteUnusedNodes(LN);
48617 return SDValue(N, 0);
48618 }
48619 }
48620
48621 return SDValue();
48622}
48623
48624static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
48625 TargetLowering::DAGCombinerInfo &DCI) {
48626 bool IsStrict = N->isTargetStrictFPOpcode();
48627 EVT VT = N->getValueType(0);
48628
48629 // Convert a full vector load into vzload when not all bits are needed.
48630 SDValue In = N->getOperand(IsStrict ? 1 : 0);
48631 MVT InVT = In.getSimpleValueType();
48632 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
48633 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
48634 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
48635 LoadSDNode *LN = cast<LoadSDNode>(In);
48636 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
48637 MVT MemVT = MVT::getFloatingPointVT(NumBits);
48638 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
48639 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
48640 SDLoc dl(N);
48641 if (IsStrict) {
48642 SDValue Convert =
48643 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
48644 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
48645 DCI.CombineTo(N, Convert, Convert.getValue(1));
48646 } else {
48647 SDValue Convert =
48648 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
48649 DCI.CombineTo(N, Convert);
48650 }
48651 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
48652 DCI.recursivelyDeleteUnusedNodes(LN);
48653 return SDValue(N, 0);
48654 }
48655 }
48656
48657 return SDValue();
48658}
48659
48660/// Do target-specific dag combines on X86ISD::ANDNP nodes.
48661static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
48662 TargetLowering::DAGCombinerInfo &DCI,
48663 const X86Subtarget &Subtarget) {
48664 MVT VT = N->getSimpleValueType(0);
48665
48666 // ANDNP(0, x) -> x
48667 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
48668 return N->getOperand(1);
48669
48670 // ANDNP(x, 0) -> 0
48671 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
48672 return DAG.getConstant(0, SDLoc(N), VT);
48673
48674 // Turn ANDNP back to AND if input is inverted.
48675 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
48676 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
48677 N->getOperand(1));
48678
48679 // Attempt to recursively combine a bitmask ANDNP with shuffles.
48680 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
48681 SDValue Op(N, 0);
48682 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48683 return Res;
48684 }
48685
48686 return SDValue();
48687}
48688
48689static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
48690 TargetLowering::DAGCombinerInfo &DCI) {
48691 SDValue N1 = N->getOperand(1);
48692
48693 // BT ignores high bits in the bit index operand.
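// For example, a 32-bit index operand only has its low 5 bits
// (Log2_32(32) == 5) demanded below.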
48694 unsigned BitWidth = N1.getValueSizeInBits();
48695 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
48696 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
48697 if (N->getOpcode() != ISD::DELETED_NODE)
48698 DCI.AddToWorklist(N);
48699 return SDValue(N, 0);
48700 }
48701
48702 return SDValue();
48703}
48704
48705static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
48706 TargetLowering::DAGCombinerInfo &DCI) {
48707 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
48708 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
48709
48710 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
48711 APInt KnownUndef, KnownZero;
48712 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48713 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
48714 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
48715 DCI)) {
48716 if (N->getOpcode() != ISD::DELETED_NODE)
48717 DCI.AddToWorklist(N);
48718 return SDValue(N, 0);
48719 }
48720
48721 // Convert a full vector load into vzload when not all bits are needed.
48722 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
48723 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
48724 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
48725 SDLoc dl(N);
48726 if (IsStrict) {
48727 SDValue Convert = DAG.getNode(
48728 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
48729 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
48730 DCI.CombineTo(N, Convert, Convert.getValue(1));
48731 } else {
48732 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
48733 DAG.getBitcast(MVT::v8i16, VZLoad));
48734 DCI.CombineTo(N, Convert);
48735 }
48736
48737 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
48738 DCI.recursivelyDeleteUnusedNodes(LN);
48739 return SDValue(N, 0);
48740 }
48741 }
48742 }
48743
48744 return SDValue();
48745}
48746
48747// Try to combine sext_in_reg of a cmov of constants by extending the constants.
48748static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
48749 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
48750
48751 EVT DstVT = N->getValueType(0);
48752
48753 SDValue N0 = N->getOperand(0);
48754 SDValue N1 = N->getOperand(1);
48755 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
48756
48757 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
48758 return SDValue();
48759
48760 // Look through single use any_extends / truncs.
48761 SDValue IntermediateBitwidthOp;
48762 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
48763 N0.hasOneUse()) {
48764 IntermediateBitwidthOp = N0;
48765 N0 = N0.getOperand(0);
48766 }
48767
48768 // See if we have a single use cmov.
48769 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
48770 return SDValue();
48771
48772 SDValue CMovOp0 = N0.getOperand(0);
48773 SDValue CMovOp1 = N0.getOperand(1);
48774
48775 // Make sure both operands are constants.
48776 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48777 !isa<ConstantSDNode>(CMovOp1.getNode()))
48778 return SDValue();
48779
48780 SDLoc DL(N);
48781
48782 // If we looked through an any_extend/trunc above, apply it to the constants too.
48783 if (IntermediateBitwidthOp) {
48784 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
48785 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
48786 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
48787 }
48788
48789 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
48790 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
48791
48792 EVT CMovVT = DstVT;
48793 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
48794 if (DstVT == MVT::i16) {
48795 CMovVT = MVT::i32;
48796 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
48797 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
48798 }
48799
48800 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
48801 N0.getOperand(2), N0.getOperand(3));
48802
48803 if (CMovVT != DstVT)
48804 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
48805
48806 return CMov;
48807}
48808
48809static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
48810 const X86Subtarget &Subtarget) {
48811 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
48812
48813 if (SDValue V = combineSextInRegCmov(N, DAG))
48814 return V;
48815
48816 EVT VT = N->getValueType(0);
48817 SDValue N0 = N->getOperand(0);
48818 SDValue N1 = N->getOperand(1);
48819 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
48820 SDLoc dl(N);
48821
48822 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
48823 // both SSE and AVX2 since there is no sign-extended shift right
48824 // operation on a vector with 64-bit elements.
48825 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
48826 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
48827 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
48828 N0.getOpcode() == ISD::SIGN_EXTEND)) {
48829 SDValue N00 = N0.getOperand(0);
48830
48831 // EXTLOAD has a better solution on AVX2; it may be replaced with an
48832 // X86ISD::VSEXT node.
48833 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
48834 if (!ISD::isNormalLoad(N00.getNode()))
48835 return SDValue();
48836
48837 // Attempt to promote any comparison mask ops before the
48838 // SIGN_EXTEND_INREG gets in the way.
48839 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
48840 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
48841
48842 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
48843 SDValue Tmp =
48844 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
48845 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
48846 }
48847 }
48848 return SDValue();
48849}
48850
48851/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
48852/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
48853/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
48854/// opportunities to combine math ops, use an LEA, or use a complex addressing
48855/// mode. This can eliminate extend, add, and shift instructions.
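/// For example, (i64 (sext (add nsw (i32 x), 5))) becomes
/// (i64 (add nsw (sext x), 5)) when the extended value feeds another add or
/// a shl.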
48856static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
48857 const X86Subtarget &Subtarget) {
48858 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
48859 Ext->getOpcode() != ISD::ZERO_EXTEND)
48860 return SDValue();
48861
48862 // TODO: This should be valid for other integer types.
48863 EVT VT = Ext->getValueType(0);
48864 if (VT != MVT::i64)
48865 return SDValue();
48866
48867 SDValue Add = Ext->getOperand(0);
48868 if (Add.getOpcode() != ISD::ADD)
48869 return SDValue();
48870
48871 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
48872 bool NSW = Add->getFlags().hasNoSignedWrap();
48873 bool NUW = Add->getFlags().hasNoUnsignedWrap();
48874
48875 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
48876 // into the 'zext'.
48877 if ((Sext && !NSW) || (!Sext && !NUW))
48878 return SDValue();
48879
48880 // Having a constant operand to the 'add' ensures that we are not increasing
48881 // the instruction count because the constant is extended for free below.
48882 // A constant operand can also become the displacement field of an LEA.
48883 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
48884 if (!AddOp1)
48885 return SDValue();
48886
48887 // Don't make the 'add' bigger if there's no hope of combining it with some
48888 // other 'add' or 'shl' instruction.
48889 // TODO: It may be profitable to generate simpler LEA instructions in place
48890 // of single 'add' instructions, but the cost model for selecting an LEA
48891 // currently has a high threshold.
48892 bool HasLEAPotential = false;
48893 for (auto *User : Ext->uses()) {
48894 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
48895 HasLEAPotential = true;
48896 break;
48897 }
48898 }
48899 if (!HasLEAPotential)
48900 return SDValue();
48901
48902 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
48903 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
48904 SDValue AddOp0 = Add.getOperand(0);
48905 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
48906 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
48907
48908 // The wider add is guaranteed to not wrap because both operands are
48909 // sign-extended.
48910 SDNodeFlags Flags;
48911 Flags.setNoSignedWrap(NSW);
48912 Flags.setNoUnsignedWrap(NUW);
48913 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
48914}
48915
48916// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
48917// operands and the result of CMOV is not used anywhere else - promote CMOV
48918// itself instead of promoting its result. This could be beneficial, because:
48919// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
48920// (or more) pseudo-CMOVs only when they go one-after-another and
48921// getting rid of result extension code after CMOV will help that.
48922// 2) Promotion of constant CMOV arguments is free, hence the
48923// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
48924 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
48925 // promotion is also good in terms of code-size.
48926 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
48927 // promotion).
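// For example, (i32 (zext (i16 (cmov C1, C2, cc)))) becomes
// (i32 (cmov (zext C1), (zext C2), cc)) when the cmov has no other uses.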
48928static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
48929 SDValue CMovN = Extend->getOperand(0);
48930 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
48931 return SDValue();
48932
48933 EVT TargetVT = Extend->getValueType(0);
48934 unsigned ExtendOpcode = Extend->getOpcode();
48935 SDLoc DL(Extend);
48936
48937 EVT VT = CMovN.getValueType();
48938 SDValue CMovOp0 = CMovN.getOperand(0);
48939 SDValue CMovOp1 = CMovN.getOperand(1);
48940
48941 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48942 !isa<ConstantSDNode>(CMovOp1.getNode()))
48943 return SDValue();
48944
48945 // Only extend to i32 or i64.
48946 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
48947 return SDValue();
48948
48949 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
48950 // are free.
48951 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
48952 return SDValue();
48953
48954 // If this is a zero extend to i64, we should only extend to i32 and use a free
48955 // zero extend to finish.
48956 EVT ExtendVT = TargetVT;
48957 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
48958 ExtendVT = MVT::i32;
48959
48960 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
48961 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
48962
48963 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
48964 CMovN.getOperand(2), CMovN.getOperand(3));
48965
48966 // Finish extending if needed.
48967 if (ExtendVT != TargetVT)
48968 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
48969
48970 return Res;
48971}
48972
48973// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
48974// This is more or less the reverse of combineBitcastvxi1.
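// For example, (v16i8 (sext (v16i1 (bitcast (i16 X))))) is lowered by
// broadcasting X across the vector, masking bit i into element i, and
// comparing against the mask so that each element becomes all-ones or
// all-zeros depending on the corresponding bit of X.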
48975static SDValue
48976combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
48977 TargetLowering::DAGCombinerInfo &DCI,
48978 const X86Subtarget &Subtarget) {
48979 unsigned Opcode = N->getOpcode();
48980 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
48981 Opcode != ISD::ANY_EXTEND)
48982 return SDValue();
48983 if (!DCI.isBeforeLegalizeOps())
48984 return SDValue();
48985 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
48986 return SDValue();
48987
48988 SDValue N0 = N->getOperand(0);
48989 EVT VT = N->getValueType(0);
48990 EVT SVT = VT.getScalarType();
48991 EVT InSVT = N0.getValueType().getScalarType();
48992 unsigned EltSizeInBits = SVT.getSizeInBits();
48993
48994 // Input type must be extending a bool vector (bit-casted from a scalar
48995 // integer) to legal integer types.
48996 if (!VT.isVector())
48997 return SDValue();
48998 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
48999 return SDValue();
49000 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
49001 return SDValue();
49002
49003 SDValue N00 = N0.getOperand(0);
49004 EVT SclVT = N0.getOperand(0).getValueType();
49005 if (!SclVT.isScalarInteger())
49006 return SDValue();
49007
49008 SDLoc DL(N);
49009 SDValue Vec;
49010 SmallVector<int, 32> ShuffleMask;
49011 unsigned NumElts = VT.getVectorNumElements();
49012 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
49013
49014 // Broadcast the scalar integer to the vector elements.
49015 if (NumElts > EltSizeInBits) {
49016 // If the scalar integer is greater than the vector element size, then we
49017 // must split it down into sub-sections for broadcasting. For example:
49018 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
49019 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
49020 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
49021 unsigned Scale = NumElts / EltSizeInBits;
49022 EVT BroadcastVT =
49023 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
49024 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
49025 Vec = DAG.getBitcast(VT, Vec);
49026
49027 for (unsigned i = 0; i != Scale; ++i)
49028 ShuffleMask.append(EltSizeInBits, i);
49029 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
49030 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
49031 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
49032 // If we have register broadcast instructions, use the scalar size as the
49033 // element type for the shuffle. Then cast to the wider element type. The
49034 // widened bits won't be used, and this might allow the use of a broadcast
49035 // load.
49036 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
49037 unsigned Scale = EltSizeInBits / NumElts;
49038 EVT BroadcastVT =
49039 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
49040 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
49041 ShuffleMask.append(NumElts * Scale, 0);
49042 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
49043 Vec = DAG.getBitcast(VT, Vec);
49044 } else {
49045 // For a smaller scalar integer, we can simply any-extend it to the vector
49046 // element size (we don't care about the upper bits) and broadcast it to all
49047 // elements.
49048 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
49049 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
49050 ShuffleMask.append(NumElts, 0);
49051 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
49052 }
49053
49054 // Now, mask the relevant bit in each element.
49055 SmallVector<SDValue, 32> Bits;
49056 for (unsigned i = 0; i != NumElts; ++i) {
49057 int BitIdx = (i % EltSizeInBits);
49058 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
49059 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
49060 }
49061 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
49062 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
49063
49064 // Compare against the bitmask and extend the result.
49065 EVT CCVT = VT.changeVectorElementType(MVT::i1);
49066 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
49067 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
49068
49069 // For SEXT, this is now done, otherwise shift the result down for
49070 // zero-extension.
49071 if (Opcode == ISD::SIGN_EXTEND)
49072 return Vec;
49073 return DAG.getNode(ISD::SRL, DL, VT, Vec,
49074 DAG.getConstant(EltSizeInBits - 1, DL, VT));
49075}
49076
49077// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
49078// result type.
49079static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
49080 const X86Subtarget &Subtarget) {
49081 SDValue N0 = N->getOperand(0);
49082 EVT VT = N->getValueType(0);
49083 SDLoc dl(N);
49084
49085 // Only do this combine with AVX512 for vector extends.
49086 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
49087 return SDValue();
49088
49089 // Only combine legal element types.
49090 EVT SVT = VT.getVectorElementType();
49091 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
49092 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
49093 return SDValue();
49094
49095 // We don't have a CMPP instruction for vXf16.
49096 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
49097 return SDValue();
49098 // We can only do this if the vector size is 256 bits or less.
49099 unsigned Size = VT.getSizeInBits();
49100 if (Size > 256 && Subtarget.useAVX512Regs())
49101 return SDValue();
49102
49103 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
49104 // those are the only integer compares we have.
49105 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49106 if (ISD::isUnsignedIntSetCC(CC))
49107 return SDValue();
49108
49109 // Only do this combine if the extension will be fully consumed by the setcc.
49110 EVT N00VT = N0.getOperand(0).getValueType();
49111 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
49112 if (Size != MatchingVecType.getSizeInBits())
49113 return SDValue();
49114
49115 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
49116
49117 if (N->getOpcode() == ISD::ZERO_EXTEND)
49118 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
49119
49120 return Res;
49121}
49122
49123static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
49124 TargetLowering::DAGCombinerInfo &DCI,
49125 const X86Subtarget &Subtarget) {
49126 SDValue N0 = N->getOperand(0);
49127 EVT VT = N->getValueType(0);
49128 SDLoc DL(N);
49129
49130 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
49131 if (!DCI.isBeforeLegalizeOps() &&
49132 N0.getOpcode() == X86ISD::SETCC_CARRY) {
49133 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
49134 N0->getOperand(1));
49135 bool ReplaceOtherUses = !N0.hasOneUse();
49136 DCI.CombineTo(N, Setcc);
49137 // Replace other uses with a truncate of the widened setcc_carry.
49138 if (ReplaceOtherUses) {
49139 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
49140 N0.getValueType(), Setcc);
49141 DCI.CombineTo(N0.getNode(), Trunc);
49142 }
49143
49144 return SDValue(N, 0);
49145 }
49146
49147 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
49148 return NewCMov;
49149
49150 if (!DCI.isBeforeLegalizeOps())
49151 return SDValue();
49152
49153 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
49154 return V;
49155
49156 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
49157 return V;
49158
49159 if (VT.isVector()) {
49160 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
49161 return R;
49162
49163 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
49164 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
49165 }
49166
49167 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
49168 return NewAdd;
49169
49170 return SDValue();
49171}
49172
49173static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
49174 TargetLowering::DAGCombinerInfo &DCI,
49175 const X86Subtarget &Subtarget) {
49176 SDLoc dl(N);
49177 EVT VT = N->getValueType(0);
49178 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
49179
49180 // Let legalize expand this if it isn't a legal type yet.
49181 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49182 if (!TLI.isTypeLegal(VT))
49183 return SDValue();
49184
49185 SDValue A = N->getOperand(IsStrict ? 1 : 0);
49186 SDValue B = N->getOperand(IsStrict ? 2 : 1);
49187 SDValue C = N->getOperand(IsStrict ? 3 : 2);
49188
49189 // If the operation allows fast-math and the target does not support FMA,
49190 // split this into mul+add to avoid libcall(s).
49191 SDNodeFlags Flags = N->getFlags();
49192 if (!IsStrict && Flags.hasAllowReassociation() &&
49193 TLI.isOperationExpand(ISD::FMA, VT)) {
49194 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
49195 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
49196 }
49197
49198 EVT ScalarVT = VT.getScalarType();
49199 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
49200 !Subtarget.hasAnyFMA()) &&
49201 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
49202 return SDValue();
49203
49204 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
49205 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
49206 bool LegalOperations = !DCI.isBeforeLegalizeOps();
49207 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
49208 CodeSize)) {
49209 V = NegV;
49210 return true;
49211 }
49212 // Look through extract_vector_elts. If it comes from an FNEG, create a
49213 // new extract from the FNEG input.
49214 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49215 isNullConstant(V.getOperand(1))) {
49216 SDValue Vec = V.getOperand(0);
49217 if (SDValue NegV = TLI.getCheaperNegatedExpression(
49218 Vec, DAG, LegalOperations, CodeSize)) {
49219 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
49220 NegV, V.getOperand(1));
49221 return true;
49222 }
49223 }
49224
49225 return false;
49226 };
49227
49228 // Do not convert the passthru input of scalar intrinsics.
49229 // FIXME: We could allow negations of the lower element only.
49230 bool NegA = invertIfNegative(A);
49231 bool NegB = invertIfNegative(B);
49232 bool NegC = invertIfNegative(C);
49233
49234 if (!NegA && !NegB && !NegC)
49235 return SDValue();
49236
49237 unsigned NewOpcode =
49238 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
49239
49240 // Propagate fast-math-flags to new FMA node.
49241 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
49242 if (IsStrict) {
49243 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
49244 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
49245 {N->getOperand(0), A, B, C});
49246 } else {
49247 if (N->getNumOperands() == 4)
49248 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
49249 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
49250 }
49251}
49252
49253// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
49254// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
49255static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
49256 TargetLowering::DAGCombinerInfo &DCI) {
49257 SDLoc dl(N);
49258 EVT VT = N->getValueType(0);
49259 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49260 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
49261 bool LegalOperations = !DCI.isBeforeLegalizeOps();
49262
49263 SDValue N2 = N->getOperand(2);
49264
49265 SDValue NegN2 =
49266 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
49267 if (!NegN2)
49268 return SDValue();
49269 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
49270
49271 if (N->getNumOperands() == 4)
49272 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
49273 NegN2, N->getOperand(3));
49274 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
49275 NegN2);
49276}
49277
49278static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
49279 TargetLowering::DAGCombinerInfo &DCI,
49280 const X86Subtarget &Subtarget) {
49281 SDLoc dl(N);
49282 SDValue N0 = N->getOperand(0);
49283 EVT VT = N->getValueType(0);
49284
49285 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
49286 // FIXME: Is this needed? We don't seem to have any tests for it.
49287 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
49288 N0.getOpcode() == X86ISD::SETCC_CARRY) {
49289 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
49290 N0->getOperand(1));
49291 bool ReplaceOtherUses = !N0.hasOneUse();
49292 DCI.CombineTo(N, Setcc);
49293 // Replace other uses with a truncate of the widened setcc_carry.
49294 if (ReplaceOtherUses) {
49295 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
49296 N0.getValueType(), Setcc);
49297 DCI.CombineTo(N0.getNode(), Trunc);
49298 }
49299
49300 return SDValue(N, 0);
49301 }
49302
49303 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
49304 return NewCMov;
49305
49306 if (DCI.isBeforeLegalizeOps())
49307 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
49308 return V;
49309
49310 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
49311 return V;
49312
49313 if (VT.isVector())
49314 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
49315 return R;
49316
49317 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
49318 return NewAdd;
49319
49320 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
49321 return R;
49322
49323 // TODO: Combine with any target/faux shuffle.
49324 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
49325 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
49326 SDValue N00 = N0.getOperand(0);
49327 SDValue N01 = N0.getOperand(1);
49328 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
49329 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
49330 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
49331 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
49332 return concatSubVectors(N00, N01, DAG, dl);
49333 }
49334 }
49335
49336 return SDValue();
49337}
49338
49339/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
49340/// recognizable memcmp expansion.
49341static bool isOrXorXorTree(SDValue X, bool Root = true) {
49342 if (X.getOpcode() == ISD::OR)
49343 return isOrXorXorTree(X.getOperand(0), false) &&
49344 isOrXorXorTree(X.getOperand(1), false);
49345 if (Root)
49346 return false;
49347 return X.getOpcode() == ISD::XOR;
49348}
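// For example (illustrative addresses), the memcmp expansion of a 32-byte
// equality test as one oversized compare has the shape
//   (seteq (or (xor (load p), (load q)), (xor (load p+16), (load q+16))), 0)
// whose left-hand side is exactly the OR-of-XOR tree recognized above.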
49349
49350/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
49351/// expansion.
49352template<typename F>
49353static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
49354 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
49355 SDValue Op0 = X.getOperand(0);
49356 SDValue Op1 = X.getOperand(1);
49357 if (X.getOpcode() == ISD::OR) {
49358 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
49359 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
49360 if (VecVT != CmpVT)
49361 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
49362 if (HasPT)
49363 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
49364 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
49365 } else if (X.getOpcode() == ISD::XOR) {
49366 SDValue A = SToV(Op0);
49367 SDValue B = SToV(Op1);
49368 if (VecVT != CmpVT)
49369 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
49370 if (HasPT)
49371 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
49372 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
49373 }
49374   llvm_unreachable("Impossible");
49375}
49376
49377/// Try to map a 128-bit or larger integer comparison to vector instructions
49378/// before type legalization splits it up into chunks.
49379static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
49380 const X86Subtarget &Subtarget) {
49381 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
49382   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
49383
49384 // We're looking for an oversized integer equality comparison.
49385 SDValue X = SetCC->getOperand(0);
49386 SDValue Y = SetCC->getOperand(1);
49387 EVT OpVT = X.getValueType();
49388 unsigned OpSize = OpVT.getSizeInBits();
49389 if (!OpVT.isScalarInteger() || OpSize < 128)
49390 return SDValue();
49391
49392 // Ignore a comparison with zero because that gets special treatment in
49393 // EmitTest(). But make an exception for the special case of a pair of
49394 // logically-combined vector-sized operands compared to zero. This pattern may
49395 // be generated by the memcmp expansion pass with oversized integer compares
49396 // (see PR33325).
49397 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
49398 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
49399 return SDValue();
49400
49401 // Don't perform this combine if constructing the vector will be expensive.
49402 auto IsVectorBitCastCheap = [](SDValue X) {
49403 X = peekThroughBitcasts(X);
49404 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
49405 X.getOpcode() == ISD::LOAD;
49406 };
49407 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
49408 !IsOrXorXorTreeCCZero)
49409 return SDValue();
49410
49411 EVT VT = SetCC->getValueType(0);
49412 SDLoc DL(SetCC);
49413
49414 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
49415 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
49416 // Otherwise use PCMPEQ (plus AND) and mask testing.
49417 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
49418 (OpSize == 256 && Subtarget.hasAVX()) ||
49419 (OpSize == 512 && Subtarget.useAVX512Regs())) {
49420 bool HasPT = Subtarget.hasSSE41();
49421
49422     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
49423 // vector registers are essentially free. (Technically, widening registers
49424 // prevents load folding, but the tradeoff is worth it.)
49425 bool PreferKOT = Subtarget.preferMaskRegisters();
49426 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
49427
49428 EVT VecVT = MVT::v16i8;
49429 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
49430 if (OpSize == 256) {
49431 VecVT = MVT::v32i8;
49432 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
49433 }
49434 EVT CastVT = VecVT;
49435 bool NeedsAVX512FCast = false;
49436 if (OpSize == 512 || NeedZExt) {
49437 if (Subtarget.hasBWI()) {
49438 VecVT = MVT::v64i8;
49439 CmpVT = MVT::v64i1;
49440 if (OpSize == 512)
49441 CastVT = VecVT;
49442 } else {
49443 VecVT = MVT::v16i32;
49444 CmpVT = MVT::v16i1;
49445 CastVT = OpSize == 512 ? VecVT :
49446 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
49447 NeedsAVX512FCast = true;
49448 }
49449 }
49450
49451 auto ScalarToVector = [&](SDValue X) -> SDValue {
49452 bool TmpZext = false;
49453 EVT TmpCastVT = CastVT;
49454 if (X.getOpcode() == ISD::ZERO_EXTEND) {
49455 SDValue OrigX = X.getOperand(0);
49456 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
49457 if (OrigSize < OpSize) {
49458 if (OrigSize == 128) {
49459 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
49460 X = OrigX;
49461 TmpZext = true;
49462 } else if (OrigSize == 256) {
49463 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
49464 X = OrigX;
49465 TmpZext = true;
49466 }
49467 }
49468 }
49469 X = DAG.getBitcast(TmpCastVT, X);
49470 if (!NeedZExt && !TmpZext)
49471 return X;
49472 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
49473 DAG.getConstant(0, DL, VecVT), X,
49474 DAG.getVectorIdxConstant(0, DL));
49475 };
49476
49477 SDValue Cmp;
49478 if (IsOrXorXorTreeCCZero) {
49479 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
49480 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
49481 // Use 2 vector equality compares and 'and' the results before doing a
49482 // MOVMSK.
49483 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
49484 } else {
49485 SDValue VecX = ScalarToVector(X);
49486 SDValue VecY = ScalarToVector(Y);
49487 if (VecVT != CmpVT) {
49488 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
49489 } else if (HasPT) {
49490 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
49491 } else {
49492 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
49493 }
49494 }
49495 // AVX512 should emit a setcc that will lower to kortest.
49496 if (VecVT != CmpVT) {
49497 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
49498 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
49499 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
49500 DAG.getConstant(0, DL, KRegVT), CC);
49501 }
49502 if (HasPT) {
49503 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
49504 Cmp);
49505 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
49506 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
49507 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
49508 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
49509 }
49510 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
49511 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
49512 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
49513     assert(Cmp.getValueType() == MVT::v16i8 &&
49514            "Non 128-bit vector on pre-SSE41 target");
49515 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
49516 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
49517 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
49518 }
49519
49520 return SDValue();
49521}
49522
49523static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
49524 TargetLowering::DAGCombinerInfo &DCI,
49525 const X86Subtarget &Subtarget) {
49526 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
49527 const SDValue LHS = N->getOperand(0);
49528 const SDValue RHS = N->getOperand(1);
49529 EVT VT = N->getValueType(0);
49530 EVT OpVT = LHS.getValueType();
49531 SDLoc DL(N);
49532
49533 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
49534 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
49535 return V;
49536
49537 if (VT == MVT::i1 && isNullConstant(RHS)) {
49538 SDValue X86CC;
49539 if (SDValue V =
49540 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
49541 return DAG.getNode(ISD::TRUNCATE, DL, VT,
49542 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
49543 }
49544
49545 if (OpVT.isScalarInteger()) {
49546 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
49547 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
49548 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
49549 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
49550 if (N0.getOperand(0) == N1)
49551 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
49552 N0.getOperand(1));
49553 if (N0.getOperand(1) == N1)
49554 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
49555 N0.getOperand(0));
49556 }
49557 return SDValue();
49558 };
49559 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
49560 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49561 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
49562 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49563
49564 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
49565 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
49566 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
49567 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
49568 if (N0.getOperand(0) == N1)
49569 return DAG.getNode(ISD::AND, DL, OpVT, N1,
49570 DAG.getNOT(DL, N0.getOperand(1), OpVT));
49571 if (N0.getOperand(1) == N1)
49572 return DAG.getNode(ISD::AND, DL, OpVT, N1,
49573 DAG.getNOT(DL, N0.getOperand(0), OpVT));
49574 }
49575 return SDValue();
49576 };
49577 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
49578 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49579 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
49580 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49581
49582 // cmpeq(trunc(x),0) --> cmpeq(x,0)
49583 // cmpne(trunc(x),0) --> cmpne(x,0)
49584 // iff x upper bits are zero.
49585 // TODO: Add support for RHS to be truncate as well?
49586 if (LHS.getOpcode() == ISD::TRUNCATE &&
49587 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
49588 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
49589 EVT SrcVT = LHS.getOperand(0).getValueType();
49590 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
49591 OpVT.getScalarSizeInBits());
49592 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49593 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
49594 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
49595 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
49596 DAG.getConstant(0, DL, SrcVT), CC);
49597 }
49598 }
49599 }
49600
49601 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
49602 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
49603 // Using temporaries to avoid messing up operand ordering for later
49604 // transformations if this doesn't work.
49605 SDValue Op0 = LHS;
49606 SDValue Op1 = RHS;
49607 ISD::CondCode TmpCC = CC;
49608 // Put build_vector on the right.
49609 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
49610 std::swap(Op0, Op1);
49611 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
49612 }
49613
49614 bool IsSEXT0 =
49615 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
49616 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
49617 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
49618
49619 if (IsSEXT0 && IsVZero1) {
49620       assert(VT == Op0.getOperand(0).getValueType() &&
49621              "Unexpected operand type");
49622 if (TmpCC == ISD::SETGT)
49623 return DAG.getConstant(0, DL, VT);
49624 if (TmpCC == ISD::SETLE)
49625 return DAG.getConstant(1, DL, VT);
49626 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
49627 return DAG.getNOT(DL, Op0.getOperand(0), VT);
49628
49629       assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
49630              "Unexpected condition code!");
49631 return Op0.getOperand(0);
49632 }
49633 }
49634
49635   // If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
49636 // pre-promote its result type since vXi1 vectors don't get promoted
49637 // during type legalization.
49638 // NOTE: The element count check is to ignore operand types that need to
49639 // go through type promotion to a 128-bit vector.
49640 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
49641 VT.getVectorElementType() == MVT::i1 &&
49642 (OpVT.getVectorElementType() == MVT::i8 ||
49643 OpVT.getVectorElementType() == MVT::i16)) {
49644 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
49645 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
49646 }
49647
49648 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
49649 // to avoid scalarization via legalization because v4i32 is not a legal type.
49650 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
49651 LHS.getValueType() == MVT::v4f32)
49652 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
49653
49654 return SDValue();
49655}
49656
49657static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
49658 TargetLowering::DAGCombinerInfo &DCI,
49659 const X86Subtarget &Subtarget) {
49660 SDValue Src = N->getOperand(0);
49661 MVT SrcVT = Src.getSimpleValueType();
49662 MVT VT = N->getSimpleValueType(0);
49663 unsigned NumBits = VT.getScalarSizeInBits();
49664 unsigned NumElts = SrcVT.getVectorNumElements();
49665
49666 // Perform constant folding.
49667 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
49668     assert(VT == MVT::i32 && "Unexpected result type");
49669 APInt Imm(32, 0);
49670 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
49671 if (!Src.getOperand(Idx).isUndef() &&
49672 Src.getConstantOperandAPInt(Idx).isNegative())
49673 Imm.setBit(Idx);
49674 }
49675 return DAG.getConstant(Imm, SDLoc(N), VT);
49676 }
49677
49678 // Look through int->fp bitcasts that don't change the element width.
49679 unsigned EltWidth = SrcVT.getScalarSizeInBits();
49680 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
49681 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
49682 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
49683
49684 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
49685 // with scalar comparisons.
49686 if (SDValue NotSrc = IsNOT(Src, DAG)) {
49687 SDLoc DL(N);
49688 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
49689 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
49690 return DAG.getNode(ISD::XOR, DL, VT,
49691 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
49692 DAG.getConstant(NotMask, DL, VT));
49693 }
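  // Worked example of the fold above (illustrative): for a v4f32 source,
  // MOVMSK defines only the low 4 bits of the i32 result, so
  //   movmsk (not x) --> (xor (movmsk x), 0b1111)
  // flipping just those NumElts low bits rather than all 32.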
49694
49695 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
49696 // results with scalar comparisons.
49697 if (Src.getOpcode() == X86ISD::PCMPGT &&
49698 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
49699 SDLoc DL(N);
49700 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
49701 return DAG.getNode(ISD::XOR, DL, VT,
49702 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
49703 DAG.getConstant(NotMask, DL, VT));
49704 }
49705
49706 // Simplify the inputs.
49707 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49708 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
49709 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
49710 return SDValue(N, 0);
49711
49712 return SDValue();
49713}
49714
49715static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
49716 TargetLowering::DAGCombinerInfo &DCI) {
49717 // With vector masks we only demand the upper bit of the mask.
49718 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
49719 if (Mask.getScalarValueSizeInBits() != 1) {
49720 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49721 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
49722 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
49723 if (N->getOpcode() != ISD::DELETED_NODE)
49724 DCI.AddToWorklist(N);
49725 return SDValue(N, 0);
49726 }
49727 }
49728
49729 return SDValue();
49730}
49731
49732static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
49733 SDValue Index, SDValue Base, SDValue Scale,
49734 SelectionDAG &DAG) {
49735 SDLoc DL(GorS);
49736
49737 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
49738 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
49739 Gather->getMask(), Base, Index, Scale } ;
49740 return DAG.getMaskedGather(Gather->getVTList(),
49741 Gather->getMemoryVT(), DL, Ops,
49742 Gather->getMemOperand(),
49743 Gather->getIndexType(),
49744 Gather->getExtensionType());
49745 }
49746 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
49747 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
49748 Scatter->getMask(), Base, Index, Scale };
49749 return DAG.getMaskedScatter(Scatter->getVTList(),
49750 Scatter->getMemoryVT(), DL,
49751 Ops, Scatter->getMemOperand(),
49752 Scatter->getIndexType(),
49753 Scatter->isTruncatingStore());
49754}
49755
49756static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
49757 TargetLowering::DAGCombinerInfo &DCI) {
49758 SDLoc DL(N);
49759 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
49760 SDValue Index = GorS->getIndex();
49761 SDValue Base = GorS->getBasePtr();
49762 SDValue Scale = GorS->getScale();
49763
49764 if (DCI.isBeforeLegalize()) {
49765 unsigned IndexWidth = Index.getScalarValueSizeInBits();
49766
49767 // Shrink constant indices if they are larger than 32-bits.
49768 // Only do this before legalize types since v2i64 could become v2i32.
49769 // FIXME: We could check that the type is legal if we're after legalize
49770 // types, but then we would need to construct test cases where that happens.
49771 // FIXME: We could support more than just constant vectors, but we need to
49772     // be careful with costing. A truncate that can be optimized out would be fine.
49773 // Otherwise we might only want to create a truncate if it avoids a split.
49774 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
49775 if (BV->isConstant() && IndexWidth > 32 &&
49776 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
49777 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
49778 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
49779 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
49780 }
49781 }
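    // E.g. (illustrative) a gather with a v2i64 constant index <0, 4> has all
    // elements representable in 32 bits, so it can be rebuilt with a v2i32
    // index before type legalization, keeping the index computation narrow.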
49782
49783     // Shrink any sign/zero extends that widen an index from 32 bits or smaller
49784     // to larger than 32 bits, if there are sufficient sign bits. Only do this
49785     // before legalize types to avoid creating illegal types in the truncate.
49786 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
49787 Index.getOpcode() == ISD::ZERO_EXTEND) &&
49788 IndexWidth > 32 &&
49789 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
49790 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
49791 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
49792 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
49793 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
49794 }
49795 }
49796
49797 if (DCI.isBeforeLegalizeOps()) {
49798 unsigned IndexWidth = Index.getScalarValueSizeInBits();
49799
49800 // Make sure the index is either i32 or i64
49801 if (IndexWidth != 32 && IndexWidth != 64) {
49802 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
49803 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
49804 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
49805 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
49806 }
49807 }
49808
49809 // With vector masks we only demand the upper bit of the mask.
49810 SDValue Mask = GorS->getMask();
49811 if (Mask.getScalarValueSizeInBits() != 1) {
49812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49813 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
49814 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
49815 if (N->getOpcode() != ISD::DELETED_NODE)
49816 DCI.AddToWorklist(N);
49817 return SDValue(N, 0);
49818 }
49819 }
49820
49821 return SDValue();
49822}
49823
49824// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
49825static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
49826 const X86Subtarget &Subtarget) {
49827 SDLoc DL(N);
49828 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
49829 SDValue EFLAGS = N->getOperand(1);
49830
49831 // Try to simplify the EFLAGS and condition code operands.
49832 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
49833 return getSETCC(CC, Flags, DL, DAG);
49834
49835 return SDValue();
49836}
49837
49838/// Optimize branch condition evaluation.
49839static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
49840 const X86Subtarget &Subtarget) {
49841 SDLoc DL(N);
49842 SDValue EFLAGS = N->getOperand(3);
49843 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
49844
49845 // Try to simplify the EFLAGS and condition code operands.
49846 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
49847 // RAUW them under us.
49848 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
49849 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
49850 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
49851 N->getOperand(1), Cond, Flags);
49852 }
49853
49854 return SDValue();
49855}
49856
49857// TODO: Could we move this to DAGCombine?
49858static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
49859 SelectionDAG &DAG) {
49860 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
49861   // to optimize away the operation when it is applied to a constant.
49862 //
49863 // The general transformation is:
49864 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
49865 // AND(VECTOR_CMP(x,y), constant2)
49866 // constant2 = UNARYOP(constant)
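  //
  // A concrete instance (illustrative) for sint_to_fp:
  //    (v4f32 sint_to_fp (and (vector_cmp x, y), splat (i32 1)))
  //    --> (and (vector_cmp x, y), bitcast (splat (f32 1.0)))
  // Each compare lane is all-ones or all-zero, so the AND yields either the
  // pre-converted constant or +0.0.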
49867
49868 // Early exit if this isn't a vector operation, the operand of the
49869 // unary operation isn't a bitwise AND, or if the sizes of the operations
49870 // aren't the same.
49871 EVT VT = N->getValueType(0);
49872 bool IsStrict = N->isStrictFPOpcode();
49873 unsigned NumEltBits = VT.getScalarSizeInBits();
49874 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49875 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
49876 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
49877 VT.getSizeInBits() != Op0.getValueSizeInBits())
49878 return SDValue();
49879
49880 // Now check that the other operand of the AND is a constant. We could
49881 // make the transformation for non-constant splats as well, but it's unclear
49882 // that would be a benefit as it would not eliminate any operations, just
49883 // perform one more step in scalar code before moving to the vector unit.
49884 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
49885 // Bail out if the vector isn't a constant.
49886 if (!BV->isConstant())
49887 return SDValue();
49888
49889 // Everything checks out. Build up the new and improved node.
49890 SDLoc DL(N);
49891 EVT IntVT = BV->getValueType(0);
49892 // Create a new constant of the appropriate type for the transformed
49893 // DAG.
49894 SDValue SourceConst;
49895 if (IsStrict)
49896 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
49897 {N->getOperand(0), SDValue(BV, 0)});
49898 else
49899 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
49900 // The AND node needs bitcasts to/from an integer vector type around it.
49901 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
49902 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
49903 MaskConst);
49904 SDValue Res = DAG.getBitcast(VT, NewAnd);
49905 if (IsStrict)
49906 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
49907 return Res;
49908 }
49909
49910 return SDValue();
49911}
49912
49913/// If we are converting a value to floating-point, try to replace scalar
49914/// truncate of an extracted vector element with a bitcast. This tries to keep
49915/// the sequence on XMM registers rather than moving between vector and GPRs.
49916static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
49917 // TODO: This is currently only used by combineSIntToFP, but it is generalized
49918 // to allow being called by any similar cast opcode.
49919 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
49920 SDValue Trunc = N->getOperand(0);
49921 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
49922 return SDValue();
49923
49924 SDValue ExtElt = Trunc.getOperand(0);
49925 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49926 !isNullConstant(ExtElt.getOperand(1)))
49927 return SDValue();
49928
49929 EVT TruncVT = Trunc.getValueType();
49930 EVT SrcVT = ExtElt.getValueType();
49931 unsigned DestWidth = TruncVT.getSizeInBits();
49932 unsigned SrcWidth = SrcVT.getSizeInBits();
49933 if (SrcWidth % DestWidth != 0)
49934 return SDValue();
49935
49936 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
49937 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
49938 unsigned VecWidth = SrcVecVT.getSizeInBits();
49939 unsigned NumElts = VecWidth / DestWidth;
49940 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
49941 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
49942 SDLoc DL(N);
49943 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
49944 BitcastVec, ExtElt.getOperand(1));
49945 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
49946}
49947
49948static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
49949 const X86Subtarget &Subtarget) {
49950 bool IsStrict = N->isStrictFPOpcode();
49951 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49952 EVT VT = N->getValueType(0);
49953 EVT InVT = Op0.getValueType();
49954
49955 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
49956 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
49957 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
49958 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
49959 unsigned ScalarSize = InVT.getScalarSizeInBits();
49960 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
49961 return SDValue();
49962 SDLoc dl(N);
49963 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
49964 ScalarSize < 16 ? MVT::i16
49965 : ScalarSize < 32 ? MVT::i32
49966 : MVT::i64,
49967 InVT.getVectorNumElements());
49968 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
49969 if (IsStrict)
49970 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
49971 {N->getOperand(0), P});
49972 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
49973 }
49974
49975 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
49976 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
49977 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
49978 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
49979 VT.getScalarType() != MVT::f16) {
49980 SDLoc dl(N);
49981 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49982 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
49983
49984 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
49985 if (IsStrict)
49986 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49987 {N->getOperand(0), P});
49988 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49989 }
49990
49991 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
49992 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
49993 // the optimization here.
49994 if (DAG.SignBitIsZero(Op0)) {
49995 if (IsStrict)
49996 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
49997 {N->getOperand(0), Op0});
49998 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
49999 }
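  // An illustrative instance of the fold above: (uint_to_fp (and X, 0x7fffffff))
  // can be emitted as (sint_to_fp (and X, 0x7fffffff)), since the masked value
  // is provably non-negative and both conversions then produce the same result.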
50000
50001 return SDValue();
50002}
50003
50004static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
50005 TargetLowering::DAGCombinerInfo &DCI,
50006 const X86Subtarget &Subtarget) {
50007 // First try to optimize away the conversion entirely when it's
50008 // conditionally from a constant. Vectors only.
50009 bool IsStrict = N->isStrictFPOpcode();
50010 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
50011 return Res;
50012
50013 // Now move on to more general possibilities.
50014 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
50015 EVT VT = N->getValueType(0);
50016 EVT InVT = Op0.getValueType();
50017
50018 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
50019 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
50020 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
50021 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
50022 unsigned ScalarSize = InVT.getScalarSizeInBits();
50023 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
50024 return SDValue();
50025 SDLoc dl(N);
50026 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
50027 ScalarSize < 16 ? MVT::i16
50028 : ScalarSize < 32 ? MVT::i32
50029 : MVT::i64,
50030 InVT.getVectorNumElements());
50031 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
50032 if (IsStrict)
50033 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
50034 {N->getOperand(0), P});
50035 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
50036 }
50037
50038 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
50039 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
50040 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
50041 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
50042 VT.getScalarType() != MVT::f16) {
50043 SDLoc dl(N);
50044 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
50045 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
50046 if (IsStrict)
50047 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
50048 {N->getOperand(0), P});
50049 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
50050 }
50051
50052 // Without AVX512DQ we only support i64 to float scalar conversion. For both
50053 // vectors and scalars, see if we know that the upper bits are all the sign
50054 // bit, in which case we can truncate the input to i32 and convert from that.
50055 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
50056 unsigned BitWidth = InVT.getScalarSizeInBits();
50057 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
50058 if (NumSignBits >= (BitWidth - 31)) {
50059 EVT TruncVT = MVT::i32;
50060 if (InVT.isVector())
50061 TruncVT = InVT.changeVectorElementType(TruncVT);
50062 SDLoc dl(N);
50063 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
50064 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
50065 if (IsStrict)
50066 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
50067 {N->getOperand(0), Trunc});
50068 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
50069 }
50070 // If we're after legalize and the type is v2i32 we need to shuffle and
50071 // use CVTSI2P.
50072       assert(InVT == MVT::v2i64 && "Unexpected VT!");
50073 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
50074 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
50075 { 0, 2, -1, -1 });
50076 if (IsStrict)
50077 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
50078 {N->getOperand(0), Shuf});
50079 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
50080 }
50081 }
50082
50083 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
50084 // a 32-bit target where SSE doesn't support i64->FP operations.
50085 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
50086 Op0.getOpcode() == ISD::LOAD) {
50087 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
50088
50089 // This transformation is not supported if the result type is f16 or f128.
50090 if (VT == MVT::f16 || VT == MVT::f128)
50091 return SDValue();
50092
50093 // If we have AVX512DQ we can use packed conversion instructions unless
50094 // the VT is f80.
50095 if (Subtarget.hasDQI() && VT != MVT::f80)
50096 return SDValue();
50097
50098 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
50099 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
50100 std::pair<SDValue, SDValue> Tmp =
50101 Subtarget.getTargetLowering()->BuildFILD(
50102 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
50103 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
50104 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
50105 return Tmp.first;
50106 }
50107 }
50108
50109 if (IsStrict)
50110 return SDValue();
50111
50112 if (SDValue V = combineToFPTruncExtElt(N, DAG))
50113 return V;
50114
50115 return SDValue();
50116}
50117
50118static bool needCarryOrOverflowFlag(SDValue Flags) {
50119   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
50120
50121 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
50122 UI != UE; ++UI) {
50123 SDNode *User = *UI;
50124
50125 X86::CondCode CC;
50126 switch (User->getOpcode()) {
50127 default:
50128 // Be conservative.
50129 return true;
50130 case X86ISD::SETCC:
50131 case X86ISD::SETCC_CARRY:
50132 CC = (X86::CondCode)User->getConstantOperandVal(0);
50133 break;
50134 case X86ISD::BRCOND:
50135 CC = (X86::CondCode)User->getConstantOperandVal(2);
50136 break;
50137 case X86ISD::CMOV:
50138 CC = (X86::CondCode)User->getConstantOperandVal(2);
50139 break;
50140 }
50141
50142 switch (CC) {
50143 default: break;
50144 case X86::COND_A: case X86::COND_AE:
50145 case X86::COND_B: case X86::COND_BE:
50146 case X86::COND_O: case X86::COND_NO:
50147 case X86::COND_G: case X86::COND_GE:
50148 case X86::COND_L: case X86::COND_LE:
50149 return true;
50150 }
50151 }
50152
50153 return false;
50154}
50155
50156static bool onlyZeroFlagUsed(SDValue Flags) {
50157   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
50158
50159 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
50160 UI != UE; ++UI) {
50161 SDNode *User = *UI;
50162
50163 unsigned CCOpNo;
50164 switch (User->getOpcode()) {
50165 default:
50166 // Be conservative.
50167 return false;
50168 case X86ISD::SETCC: CCOpNo = 0; break;
50169 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
50170 case X86ISD::BRCOND: CCOpNo = 2; break;
50171 case X86ISD::CMOV: CCOpNo = 2; break;
50172 }
50173
50174 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
50175 if (CC != X86::COND_E && CC != X86::COND_NE)
50176 return false;
50177 }
50178
50179 return true;
50180}
50181
50182static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
50183 // Only handle test patterns.
50184 if (!isNullConstant(N->getOperand(1)))
50185 return SDValue();
50186
50187 // If we have a CMP of a truncated binop, see if we can make a smaller binop
50188 // and use its flags directly.
50189 // TODO: Maybe we should try promoting compares that only use the zero flag
50190   // first if we can prove the upper bits are zero with computeKnownBits?
50191 SDLoc dl(N);
50192 SDValue Op = N->getOperand(0);
50193 EVT VT = Op.getValueType();
50194
50195 // If we have a constant logical shift that's only used in a comparison
50196 // against zero turn it into an equivalent AND. This allows turning it into
50197 // a TEST instruction later.
50198 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
50199 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
50200 onlyZeroFlagUsed(SDValue(N, 0))) {
50201 unsigned BitWidth = VT.getSizeInBits();
50202 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
50203 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
50204 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
50205 APInt Mask = Op.getOpcode() == ISD::SRL
50206 ? APInt::getHighBitsSet(BitWidth, MaskBits)
50207 : APInt::getLowBitsSet(BitWidth, MaskBits);
50208 if (Mask.isSignedIntN(32)) {
50209 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
50210 DAG.getConstant(Mask, dl, VT));
50211 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
50212 DAG.getConstant(0, dl, VT));
50213 }
50214 }
50215 }
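  // Worked example of the rewrite above: an i32 compare of (srl X, 8) against
  // zero only needs the Z flag, so it becomes
  //   (X86ISD::CMP (and X, 0xFFFFFF00), 0)
  // which isel can then select as a TEST instruction.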
50216
50217 // Look for a truncate.
50218 if (Op.getOpcode() != ISD::TRUNCATE)
50219 return SDValue();
50220
50221 SDValue Trunc = Op;
50222 Op = Op.getOperand(0);
50223
50224 // See if we can compare with zero against the truncation source,
50225 // which should help using the Z flag from many ops. Only do this for
50226 // i32 truncated op to prevent partial-reg compares of promoted ops.
50227 EVT OpVT = Op.getValueType();
50228 APInt UpperBits =
50229 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
50230 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
50231 onlyZeroFlagUsed(SDValue(N, 0))) {
50232 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
50233 DAG.getConstant(0, dl, OpVT));
50234 }
50235
50236 // After this the truncate and arithmetic op must have a single use.
50237 if (!Trunc.hasOneUse() || !Op.hasOneUse())
50238 return SDValue();
50239
50240 unsigned NewOpc;
50241 switch (Op.getOpcode()) {
50242 default: return SDValue();
50243 case ISD::AND:
50244     // Skip AND with a constant. We have special handling for AND with an
50245     // immediate during isel to generate TEST instructions.
50246 if (isa<ConstantSDNode>(Op.getOperand(1)))
50247 return SDValue();
50248 NewOpc = X86ISD::AND;
50249 break;
50250 case ISD::OR: NewOpc = X86ISD::OR; break;
50251 case ISD::XOR: NewOpc = X86ISD::XOR; break;
50252 case ISD::ADD:
50253 // If the carry or overflow flag is used, we can't truncate.
50254 if (needCarryOrOverflowFlag(SDValue(N, 0)))
50255 return SDValue();
50256 NewOpc = X86ISD::ADD;
50257 break;
50258 case ISD::SUB:
50259 // If the carry or overflow flag is used, we can't truncate.
50260 if (needCarryOrOverflowFlag(SDValue(N, 0)))
50261 return SDValue();
50262 NewOpc = X86ISD::SUB;
50263 break;
50264 }
50265
50266 // We found an op we can narrow. Truncate its inputs.
50267 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
50268 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
50269
50270 // Use a X86 specific opcode to avoid DAG combine messing with it.
50271 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50272 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
50273
50274 // For AND, keep a CMP so that we can match the test pattern.
50275 if (NewOpc == X86ISD::AND)
50276 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
50277 DAG.getConstant(0, dl, VT));
50278
50279 // Return the flags.
50280 return Op.getValue(1);
50281}
50282
50283static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
50284 TargetLowering::DAGCombinerInfo &DCI) {
50285   assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
50286          "Expected X86ISD::ADD or X86ISD::SUB");
50287
50288 SDLoc DL(N);
50289 SDValue LHS = N->getOperand(0);
50290 SDValue RHS = N->getOperand(1);
50291 MVT VT = LHS.getSimpleValueType();
50292 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
50293
50294 // If we don't use the flag result, simplify back to a generic ADD/SUB.
50295 if (!N->hasAnyUseOfValue(1)) {
50296 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
50297 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
50298 }
50299
50300 // Fold any similar generic ADD/SUB opcodes to reuse this node.
50301 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
50302 SDValue Ops[] = {N0, N1};
50303 SDVTList VTs = DAG.getVTList(N->getValueType(0));
50304 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
50305 SDValue Op(N, 0);
50306 if (Negate)
50307 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
50308 DCI.CombineTo(GenericAddSub, Op);
50309 }
50310 };
50311 MatchGeneric(LHS, RHS, false);
50312 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
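  // Illustrative effect of MatchGeneric for X86ISD::SUB: a generic
  // (sub LHS, RHS) already in the DAG is replaced with value 0 of this node,
  // and a generic (sub RHS, LHS) is replaced with (sub 0, <this node>), i.e.
  // the negated result.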
50313
50314 return SDValue();
50315}
50316
50317static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
50318 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
50319 MVT VT = N->getSimpleValueType(0);
50320 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50321 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
50322 N->getOperand(0), N->getOperand(1),
50323 Flags);
50324 }
50325
50326 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
50327 // iff the flag result is dead.
50328 SDValue Op0 = N->getOperand(0);
50329 SDValue Op1 = N->getOperand(1);
50330 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
50331 !N->hasAnyUseOfValue(1))
50332 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
50333 Op0.getOperand(1), N->getOperand(2));
50334
50335 return SDValue();
50336}
50337
50338// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
50339static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
50340 TargetLowering::DAGCombinerInfo &DCI) {
50341 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
50342 // the result is either zero or one (depending on the input carry bit).
50343 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
50344 if (X86::isZeroNode(N->getOperand(0)) &&
50345 X86::isZeroNode(N->getOperand(1)) &&
50346 // We don't have a good way to replace an EFLAGS use, so only do this when
50347 // dead right now.
50348 SDValue(N, 1).use_empty()) {
50349 SDLoc DL(N);
50350 EVT VT = N->getValueType(0);
50351 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
50352 SDValue Res1 =
50353 DAG.getNode(ISD::AND, DL, VT,
50354 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50355 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50356 N->getOperand(2)),
50357 DAG.getConstant(1, DL, VT));
50358 return DCI.CombineTo(N, Res1, CarryOut);
50359 }
50360
50361 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
50362 MVT VT = N->getSimpleValueType(0);
50363 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50364 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
50365 N->getOperand(0), N->getOperand(1),
50366 Flags);
50367 }
50368
50369 return SDValue();
50370}
50371
50372/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50373/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50374/// with CMP+{ADC, SBB}.
50375static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50376 bool IsSub = N->getOpcode() == ISD::SUB;
50377 SDValue X = N->getOperand(0);
50378 SDValue Y = N->getOperand(1);
50379
50380 // If this is an add, canonicalize a zext operand to the RHS.
50381 // TODO: Incomplete? What if both sides are zexts?
50382 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
50383 Y.getOpcode() != ISD::ZERO_EXTEND)
50384 std::swap(X, Y);
50385
50386 // Look through a one-use zext.
50387 bool PeekedThroughZext = false;
50388 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
50389 Y = Y.getOperand(0);
50390 PeekedThroughZext = true;
50391 }
50392
50393 // If this is an add, canonicalize a setcc operand to the RHS.
50394 // TODO: Incomplete? What if both sides are setcc?
50395 // TODO: Should we allow peeking through a zext of the other operand?
50396 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
50397 Y.getOpcode() != X86ISD::SETCC)
50398 std::swap(X, Y);
50399
50400 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
50401 return SDValue();
50402
50403 SDLoc DL(N);
50404 EVT VT = N->getValueType(0);
50405 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
50406
50407 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50408 // the general case below.
50409 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50410 if (ConstantX) {
50411 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
50412 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
50413 // This is a complicated way to get -1 or 0 from the carry flag:
50414 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50415 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50416 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50417 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50418 Y.getOperand(1));
50419 }
50420
50421 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
50422 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
50423 SDValue EFLAGS = Y->getOperand(1);
50424 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50425 EFLAGS.getValueType().isInteger() &&
50426 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50427 // Swap the operands of a SUB, and we have the same pattern as above.
50428 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50429 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50430 SDValue NewSub = DAG.getNode(
50431 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50432 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50433 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50434 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50435 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50436 NewEFLAGS);
50437 }
50438 }
50439 }
50440
50441 if (CC == X86::COND_B) {
50442 // X + SETB Z --> adc X, 0
50443 // X - SETB Z --> sbb X, 0
50444 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50445 DAG.getVTList(VT, MVT::i32), X,
50446 DAG.getConstant(0, DL, VT), Y.getOperand(1));
50447 }
50448
50449 if (CC == X86::COND_A) {
50450 SDValue EFLAGS = Y.getOperand(1);
50451 // Try to convert COND_A into COND_B in an attempt to facilitate
50452 // materializing "setb reg".
50453 //
50454     // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
50455 // cannot take an immediate as its first operand.
50456 //
50457 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50458 EFLAGS.getValueType().isInteger() &&
50459 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50460 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
50461 EFLAGS.getNode()->getVTList(),
50462 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50463 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50464 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50465 DAG.getVTList(VT, MVT::i32), X,
50466 DAG.getConstant(0, DL, VT), NewEFLAGS);
50467 }
50468 }
50469
50470 if (CC == X86::COND_AE) {
50471 // X + SETAE --> sbb X, -1
50472 // X - SETAE --> adc X, -1
50473 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50474 DAG.getVTList(VT, MVT::i32), X,
50475 DAG.getConstant(-1, DL, VT), Y.getOperand(1));
50476 }
50477
50478 if (CC == X86::COND_BE) {
50479 // X + SETBE --> sbb X, -1
50480 // X - SETBE --> adc X, -1
50481 SDValue EFLAGS = Y.getOperand(1);
50482 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50483 // materializing "setae reg".
50484 //
50485     // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
50486 // cannot take an immediate as its first operand.
50487 //
50488 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50489 EFLAGS.getValueType().isInteger() &&
50490 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50491 SDValue NewSub = DAG.getNode(
50492 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50493 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50494 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50495 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50496 DAG.getVTList(VT, MVT::i32), X,
50497 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50498 }
50499 }
50500
50501 if (CC != X86::COND_E && CC != X86::COND_NE)
50502 return SDValue();
50503
50504 SDValue Cmp = Y.getOperand(1);
50505 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
50506 !X86::isZeroNode(Cmp.getOperand(1)) ||
50507 !Cmp.getOperand(0).getValueType().isInteger())
50508 return SDValue();
50509
50510 SDValue Z = Cmp.getOperand(0);
50511 EVT ZVT = Z.getValueType();
50512
50513 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50514 // the general case below.
50515 if (ConstantX) {
50516 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50517 // fake operands:
50518 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50519 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50520 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
50521 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
50522 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50523 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50524 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50525 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50526 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50527 SDValue(Neg.getNode(), 1));
50528 }
50529
50530 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50531 // with fake operands:
50532 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50533 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50534 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
50535 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
50536 SDValue One = DAG.getConstant(1, DL, ZVT);
50537 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50538 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50539 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50540 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50541 Cmp1.getValue(1));
50542 }
50543 }
50544
50545 // (cmp Z, 1) sets the carry flag if Z is 0.
50546 SDValue One = DAG.getConstant(1, DL, ZVT);
50547 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50548 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50549
50550 // Add the flags type for ADC/SBB nodes.
50551 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50552
50553 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50554 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50555 if (CC == X86::COND_NE)
50556 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50557 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50558
50559 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50560 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50561 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50562 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50563}
50564
50565static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
50566 const SDLoc &DL, EVT VT,
50567 const X86Subtarget &Subtarget) {
50568 // Example of pattern we try to detect:
50569 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
50570 //(add (build_vector (extract_elt t, 0),
50571 // (extract_elt t, 2),
50572 // (extract_elt t, 4),
50573 // (extract_elt t, 6)),
50574 // (build_vector (extract_elt t, 1),
50575 // (extract_elt t, 3),
50576 // (extract_elt t, 5),
50577 // (extract_elt t, 7)))
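  // Once the operands are shrunk back to vXi16, this maps onto X86ISD::VPMADDWD,
  // which multiplies adjacent pairs of signed i16 elements and sums each pair
  // into a single i32 lane.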
50578
50579 if (!Subtarget.hasSSE2())
50580 return SDValue();
50581
50582 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
50583 Op1.getOpcode() != ISD::BUILD_VECTOR)
50584 return SDValue();
50585
50586 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
50587 VT.getVectorNumElements() < 4 ||
50588 !isPowerOf2_32(VT.getVectorNumElements()))
50589 return SDValue();
50590
50591 // Check if one of Op0,Op1 is of the form:
50592 // (build_vector (extract_elt Mul, 0),
50593 // (extract_elt Mul, 2),
50594 // (extract_elt Mul, 4),
50595 // ...
50596 // the other is of the form:
50597 // (build_vector (extract_elt Mul, 1),
50598 // (extract_elt Mul, 3),
50599 // (extract_elt Mul, 5),
50600 // ...
50601 // and identify Mul.
50602 SDValue Mul;
50603 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
50604 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
50605 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
50606     // TODO: Be more tolerant of undefs.
50607 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50608 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50609 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50610 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
50611 return SDValue();
50612 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
50613 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
50614 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
50615 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
50616 if (!Const0L || !Const1L || !Const0H || !Const1H)
50617 return SDValue();
50618 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
50619 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
50620 // Commutativity of mul allows factors of a product to reorder.
50621 if (Idx0L > Idx1L)
50622 std::swap(Idx0L, Idx1L);
50623 if (Idx0H > Idx1H)
50624 std::swap(Idx0H, Idx1H);
50625 // Commutativity of add allows pairs of factors to reorder.
50626 if (Idx0L > Idx0H) {
50627 std::swap(Idx0L, Idx0H);
50628 std::swap(Idx1L, Idx1H);
50629 }
50630 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
50631 Idx1H != 2 * i + 3)
50632 return SDValue();
50633 if (!Mul) {
50634 // First time an extract_elt's source vector is visited. Must be a MUL
50635       // with twice the number of vector elements of the BUILD_VECTOR.
50636       // Both extracts must be from the same MUL.
50637 Mul = Op0L->getOperand(0);
50638 if (Mul->getOpcode() != ISD::MUL ||
50639 Mul.getValueType().getVectorNumElements() != 2 * e)
50640 return SDValue();
50641 }
50642 // Check that the extract is from the same MUL previously seen.
50643 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
50644 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
50645 return SDValue();
50646 }
50647
50648 // Check if the Mul source can be safely shrunk.
50649 ShrinkMode Mode;
50650 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
50651 Mode == ShrinkMode::MULU16)
50652 return SDValue();
50653
50654 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50655 VT.getVectorNumElements() * 2);
50656 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
50657 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
50658
50659 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50660 ArrayRef<SDValue> Ops) {
50661 EVT InVT = Ops[0].getValueType();
50662     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
50663 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
50664 InVT.getVectorNumElements() / 2);
50665 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
50666 };
50667 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
50668}
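// Illustrative sketch, not part of the original source: a scalar reference for
// the X86ISD::VPMADDWD node that matchPMADDWD (and matchPMADDWD_2 below)
// ultimately emits. Each i32 result lane is the sum of products of one
// adjacent pair of signed i16 lanes. The helper name is hypothetical.
static void referenceVPMADDWD(const int16_t *A, const int16_t *B,
                              int32_t *Res, unsigned NumResElts) {
  for (unsigned I = 0; I != NumResElts; ++I) {
    int64_t Sum = int64_t(A[2 * I]) * B[2 * I] +
                  int64_t(A[2 * I + 1]) * B[2 * I + 1];
    Res[I] = int32_t(Sum); // i32 result lane (wraps only in the INT16_MIN corner case)
  }
}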
50669
50670// Attempt to turn this pattern into PMADDWD.
50671// (add (mul (sext (build_vector)), (sext (build_vector))),
50672// (mul (sext (build_vector)), (sext (build_vector))))
50673static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
50674 const SDLoc &DL, EVT VT,
50675 const X86Subtarget &Subtarget) {
50676 if (!Subtarget.hasSSE2())
50677 return SDValue();
50678
50679 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
50680 return SDValue();
50681
50682 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
50683 VT.getVectorNumElements() < 4 ||
50684 !isPowerOf2_32(VT.getVectorNumElements()))
50685 return SDValue();
50686
50687 SDValue N00 = N0.getOperand(0);
50688 SDValue N01 = N0.getOperand(1);
50689 SDValue N10 = N1.getOperand(0);
50690 SDValue N11 = N1.getOperand(1);
50691
50692 // All inputs need to be sign extends.
50693 // TODO: Support ZERO_EXTEND from known positive?
50694 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
50695 N01.getOpcode() != ISD::SIGN_EXTEND ||
50696 N10.getOpcode() != ISD::SIGN_EXTEND ||
50697 N11.getOpcode() != ISD::SIGN_EXTEND)
50698 return SDValue();
50699
50700 // Peek through the extends.
50701 N00 = N00.getOperand(0);
50702 N01 = N01.getOperand(0);
50703 N10 = N10.getOperand(0);
50704 N11 = N11.getOperand(0);
50705
50706 // Must be extending from vXi16.
50707 EVT InVT = N00.getValueType();
50708 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
50709 N10.getValueType() != InVT || N11.getValueType() != InVT)
50710 return SDValue();
50711
50712 // All inputs should be build_vectors.
50713 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
50714 N01.getOpcode() != ISD::BUILD_VECTOR ||
50715 N10.getOpcode() != ISD::BUILD_VECTOR ||
50716 N11.getOpcode() != ISD::BUILD_VECTOR)
50717 return SDValue();
50718
50719 // For each element, we need to ensure we have an odd element from one vector
50720 // multiplied by the odd element of another vector and the even element from
50721 // one of the same vectors being multiplied by the even element from the
50722 // other vector. So we need to make sure for each element i, this operation
50723 // is being performed:
50724 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
50725 SDValue In0, In1;
50726 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
50727 SDValue N00Elt = N00.getOperand(i);
50728 SDValue N01Elt = N01.getOperand(i);
50729 SDValue N10Elt = N10.getOperand(i);
50730 SDValue N11Elt = N11.getOperand(i);
50731 // TODO: Be more tolerant to undefs.
50732 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50733 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50734 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50735 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
50736 return SDValue();
50737 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
50738 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
50739 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
50740 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
50741 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
50742 return SDValue();
50743 unsigned IdxN00 = ConstN00Elt->getZExtValue();
50744 unsigned IdxN01 = ConstN01Elt->getZExtValue();
50745 unsigned IdxN10 = ConstN10Elt->getZExtValue();
50746 unsigned IdxN11 = ConstN11Elt->getZExtValue();
50747 // Add is commutative so indices can be reordered.
50748 if (IdxN00 > IdxN10) {
50749 std::swap(IdxN00, IdxN10);
50750 std::swap(IdxN01, IdxN11);
50751 }
50752 // N0 indices must be the even element. N1 indices must be the next odd element.
50753 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
50754 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
50755 return SDValue();
50756 SDValue N00In = N00Elt.getOperand(0);
50757 SDValue N01In = N01Elt.getOperand(0);
50758 SDValue N10In = N10Elt.getOperand(0);
50759 SDValue N11In = N11Elt.getOperand(0);
50760
50761 // First time we find an input capture it.
50762 if (!In0) {
50763 In0 = N00In;
50764 In1 = N01In;
50765
50766 // The input vectors must be at least as wide as the output.
50767 // If they are larger than the output, we extract a subvector below.
50768 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
50769 In1.getValueSizeInBits() < VT.getSizeInBits())
50770 return SDValue();
50771 }
50772 // Mul is commutative so the input vectors can be in any order.
50773 // Canonicalize to make the compares easier.
50774 if (In0 != N00In)
50775 std::swap(N00In, N01In);
50776 if (In0 != N10In)
50777 std::swap(N10In, N11In);
50778 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
50779 return SDValue();
50780 }
50781
50782 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50783 ArrayRef<SDValue> Ops) {
50784 EVT OpVT = Ops[0].getValueType();
50785 assert(OpVT.getScalarType() == MVT::i16 &&
50786 "Unexpected scalar element type");
50787 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
50788 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
50789 OpVT.getVectorNumElements() / 2);
50790 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
50791 };
50792
50793 // If the output is narrower than an input, extract the low part of the input
50794 // vector.
50795 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50796 VT.getVectorNumElements() * 2);
50797 if (OutVT16.bitsLT(In0.getValueType())) {
50798 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
50799 DAG.getIntPtrConstant(0, DL));
50800 }
50801 if (OutVT16.bitsLT(In1.getValueType())) {
50802 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
50803 DAG.getIntPtrConstant(0, DL));
50804 }
50805 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
50806 PMADDBuilder);
50807}
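// Illustrative sketch, not part of the original source: a concrete v4i32
// instance of the pattern matchPMADDWD_2 accepts, in DAG shorthand, where
// ak = (extract_elt A, k) and bk = (extract_elt B, k) for v8i16 vectors A, B:
//   N0 = mul (sext (build_vector a0,a2,a4,a6)), (sext (build_vector b0,b2,b4,b6))
//   N1 = mul (sext (build_vector a1,a3,a5,a7)), (sext (build_vector b1,b3,b5,b7))
//   add N0, N1  -->  X86ISD::VPMADDWD A, B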
50808
50809/// CMOV of constants requires materializing constant operands in registers.
50810/// Try to fold those constants into an 'add' instruction to reduce instruction
50811/// count. We do this with CMOV rather than the generic 'select' because there are
50812/// earlier folds that may be used to turn select-of-constants into logic hacks.
50813static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
50814 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
50815 // better because we eliminate 1-2 instructions. This transform is still
50816 // an improvement without zero operands because we trade 2 move constants and
50817 // 1 add for 2 adds (LEA) as long as the constants can be represented as
50818 // immediate asm operands (fit in 32-bits).
50819 auto isSuitableCmov = [](SDValue V) {
50820 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
50821 return false;
50822 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
50823 !isa<ConstantSDNode>(V.getOperand(1)))
50824 return false;
50825 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
50826 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
50827 V.getConstantOperandAPInt(1).isSignedIntN(32));
50828 };
50829
50830 // Match an appropriate CMOV as the first operand of the add.
50831 SDValue Cmov = N->getOperand(0);
50832 SDValue OtherOp = N->getOperand(1);
50833 if (!isSuitableCmov(Cmov))
50834 std::swap(Cmov, OtherOp);
50835 if (!isSuitableCmov(Cmov))
50836 return SDValue();
50837
50838 EVT VT = N->getValueType(0);
50839 SDLoc DL(N);
50840 SDValue FalseOp = Cmov.getOperand(0);
50841 SDValue TrueOp = Cmov.getOperand(1);
50842
50843 // We will push the add through the select, but we can potentially do better
50844 // if we know there is another add in the sequence and this is pointer math.
50845 // In that case, we can absorb an add into the trailing memory op and avoid
50846 // a 3-operand LEA which is likely slower than a 2-operand LEA.
50847 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
50848 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
50849 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
50850 all_of(N->uses(), [&](SDNode *Use) {
50851 auto *MemNode = dyn_cast<MemSDNode>(Use);
50852 return MemNode && MemNode->getBasePtr().getNode() == N;
50853 })) {
50854 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
50855 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
50856 // it is possible that choosing op1 might be better.
50857 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
50858 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
50859 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
50860 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
50861 Cmov.getOperand(2), Cmov.getOperand(3));
50862 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
50863 }
50864
50865 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
50866 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
50867 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
50868 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
50869 Cmov.getOperand(3));
50870}
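// Illustrative sketch, not part of the original source: the scalar analogue of
// the fold above, with hypothetical names and constants. Pushing the add into
// both CMOV arms lets each arm be formed with an immediate (ADD/LEA) instead
// of materializing the constants in registers first.
static long addOfCmovOfConstsExample(bool Cond, long OtherOp) {
  // Before: long C = Cond ? 42 : 7; return OtherOp + C;
  // After:  return Cond ? OtherOp + 42 : OtherOp + 7;
  return Cond ? OtherOp + 42 : OtherOp + 7;
}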
50871
50872static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
50873 TargetLowering::DAGCombinerInfo &DCI,
50874 const X86Subtarget &Subtarget) {
50875 EVT VT = N->getValueType(0);
50876 SDValue Op0 = N->getOperand(0);
50877 SDValue Op1 = N->getOperand(1);
50878
50879 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
50880 return Select;
50881
50882 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
50883 return MAdd;
50884 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
50885 return MAdd;
50886
50887 // Try to synthesize horizontal adds from adds of shuffles.
50888 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
50889 return V;
50890
50891 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
50892 // (sub Y, (sext (vXi1 X))).
50893 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
50894 // generic DAG combine without a legal type check, but adding this there
50895 // caused regressions.
50896 if (VT.isVector()) {
50897 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50898 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
50899 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50900 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
50901 SDLoc DL(N);
50902 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
50903 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
50904 }
50905
50906 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
50907 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50908 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
50909 SDLoc DL(N);
50910 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
50911 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
50912 }
50913 }
50914
50915 return combineAddOrSubToADCOrSBB(N, DAG);
50916}
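// Illustrative sketch, not part of the original source: the lane-wise identity
// behind the (add (zext (vXi1 X)), Y) -> (sub Y, (sext (vXi1 X))) fold in
// combineAdd above. The helper name is hypothetical.
static bool zextToSextAddIdentityHolds(uint32_t Y, bool X) {
  uint32_t ZExt = X ? 1u : 0u;           // i1 zero-extended: 0 or 1
  uint32_t SExt = X ? ~uint32_t(0) : 0u; // i1 sign-extended: 0 or -1 (mod 2^32)
  return Y + ZExt == Y - SExt;           // wraps exactly like the vector lanes
}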
50917
50918static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
50919 TargetLowering::DAGCombinerInfo &DCI,
50920 const X86Subtarget &Subtarget) {
50921 SDValue Op0 = N->getOperand(0);
50922 SDValue Op1 = N->getOperand(1);
50923
50924 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
50925 auto IsNonOpaqueConstant = [&](SDValue Op) {
50926 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
50927 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
50928 return !Cst->isOpaque();
50929 return true;
50930 }
50931 return false;
50932 };
50933
50934 // X86 can't encode an immediate LHS of a sub. See if we can push the
50935 // negation into a preceding instruction. If the RHS of the sub is an XOR with
50936 // one use and a constant, invert the immediate, saving one register.
50937 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
50938 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
50939 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
50940 SDLoc DL(N);
50941 EVT VT = Op0.getValueType();
50942 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
50943 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
50944 SDValue NewAdd =
50945 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
50946 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
50947 }
50948
50949 // Try to synthesize horizontal subs from subs of shuffles.
50950 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
50951 return V;
50952
50953 return combineAddOrSubToADCOrSBB(N, DAG);
50954}
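// Illustrative sketch, not part of the original source: the two's-complement
// algebra behind the sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1) rewrite in
// combineSub above. Since ~(X ^ C2) == X ^ ~C2 and -V == ~V + 1, we get
// C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1). The helper name is hypothetical.
static uint64_t subOfXorRewritten(uint64_t C1, uint64_t X, uint64_t C2) {
  return (X ^ ~C2) + (C1 + 1); // equals C1 - (X ^ C2) modulo 2^64
}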
50955
50956static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
50957 const X86Subtarget &Subtarget) {
50958 MVT VT = N->getSimpleValueType(0);
50959 SDLoc DL(N);
50960
50961 if (N->getOperand(0) == N->getOperand(1)) {
50962 if (N->getOpcode() == X86ISD::PCMPEQ)
50963 return DAG.getConstant(-1, DL, VT);
50964 if (N->getOpcode() == X86ISD::PCMPGT)
50965 return DAG.getConstant(0, DL, VT);
50966 }
50967
50968 return SDValue();
50969}
50970
50971/// Helper that combines an array of subvector ops as if they were the operands
50972/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
50973/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
50974static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
50975 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
50976 TargetLowering::DAGCombinerInfo &DCI,
50977 const X86Subtarget &Subtarget) {
50978 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
50979 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50980
50981 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
50982 return DAG.getUNDEF(VT);
50983
50984 if (llvm::all_of(Ops, [](SDValue Op) {
50985 return ISD::isBuildVectorAllZeros(Op.getNode());
50986 }))
50987 return getZeroVector(VT, Subtarget, DAG, DL);
50988
50989 SDValue Op0 = Ops[0];
50990 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
50991
50992 // Repeated subvectors.
50993 if (IsSplat &&
50994 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
50995 // If this broadcast is inserted into both halves, use a larger broadcast.
50996 if (Op0.getOpcode() == X86ISD::VBROADCAST)
50997 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
50998
50999 // If this scalar/subvector broadcast_load is inserted into both halves, use
51000 // a larger broadcast_load. Update other uses to use an extracted subvector.
51001 if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
51002 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
51003 auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
51004 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
51005 SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
51006 SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
51007 MemIntr->getMemoryVT(),
51008 MemIntr->getMemOperand());
51009 DAG.ReplaceAllUsesOfValueWith(
51010 Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
51011 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
51012 return BcastLd;
51013 }
51014
51015 // If this is a simple subvector load repeated across multiple lanes, then
51016 // broadcast the load. Update other uses to use an extracted subvector.
51017 if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
51018 if (Ld->isSimple() && !Ld->isNonTemporal() &&
51019 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
51020 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
51021 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
51022 SDValue BcastLd =
51023 DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
51024 Ld->getMemoryVT(), Ld->getMemOperand());
51025 DAG.ReplaceAllUsesOfValueWith(
51026 Op0,
51027 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
51028 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
51029 return BcastLd;
51030 }
51031 }
51032
51033 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
51034 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
51035 (Subtarget.hasAVX2() || MayFoldLoadIntoBroadcastFromMem(
51036 Op0.getOperand(0), VT.getScalarType())))
51037 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
51038 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
51039 Op0.getOperand(0),
51040 DAG.getIntPtrConstant(0, DL)));
51041
51042 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
51043 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51044 (Subtarget.hasAVX2() ||
51045 (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
51046 Op0.getOperand(0).getValueType() == VT.getScalarType())
51047 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
51048
51049 // concat_vectors(extract_subvector(broadcast(x)),
51050 // extract_subvector(broadcast(x))) -> broadcast(x)
51051 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51052 Op0.getOperand(0).getValueType() == VT) {
51053 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
51054 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
51055 return Op0.getOperand(0);
51056 }
51057 }
51058
51059 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
51060 // Only handle concats of subvector high halves, which vperm2x128 is best at.
51061 // TODO: This should go in combineX86ShufflesRecursively eventually.
51062 if (VT.is256BitVector() && Ops.size() == 2) {
51063 SDValue Src0 = peekThroughBitcasts(Ops[0]);
51064 SDValue Src1 = peekThroughBitcasts(Ops[1]);
51065 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51066 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
51067 EVT SrcVT0 = Src0.getOperand(0).getValueType();
51068 EVT SrcVT1 = Src1.getOperand(0).getValueType();
51069 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
51070 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
51071 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
51072 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
51073 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
51074 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
51075 DAG.getBitcast(VT, Src0.getOperand(0)),
51076 DAG.getBitcast(VT, Src1.getOperand(0)),
51077 DAG.getTargetConstant(0x31, DL, MVT::i8));
51078 }
51079 }
51080 }
51081
51082 // Repeated opcode.
51083 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
51084 // but it currently struggles with different vector widths.
51085 if (llvm::all_of(Ops, [Op0](SDValue Op) {
51086 return Op.getOpcode() == Op0.getOpcode();
51087 })) {
51088 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
51089 SmallVector<SDValue> Subs;
51090 for (SDValue SubOp : SubOps)
51091 Subs.push_back(SubOp.getOperand(I));
51092 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
51093 };
51094
51095 unsigned NumOps = Ops.size();
51096 switch (Op0.getOpcode()) {
51097 case X86ISD::SHUFP: {
51098 // Add SHUFPD support if/when necessary.
51099 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
51100 llvm::all_of(Ops, [Op0](SDValue Op) {
51101 return Op.getOperand(2) == Op0.getOperand(2);
51102 })) {
51103 return DAG.getNode(Op0.getOpcode(), DL, VT,
51104 ConcatSubOperand(VT, Ops, 0),
51105 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
51106 }
51107 break;
51108 }
51109 case X86ISD::PSHUFHW:
51110 case X86ISD::PSHUFLW:
51111 case X86ISD::PSHUFD:
51112 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
51113 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
51114 return DAG.getNode(Op0.getOpcode(), DL, VT,
51115 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
51116 }
51117 LLVM_FALLTHROUGH;
51118 case X86ISD::VPERMILPI:
51119 // TODO - add support for vXf64/vXi64 shuffles.
51120 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
51121 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
51122 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
51123 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
51124 Op0.getOperand(1));
51125 return DAG.getBitcast(VT, Res);
51126 }
51127 break;
51128 case X86ISD::VPERMV3:
51129 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
51130 MVT OpVT = Op0.getSimpleValueType();
51131 int NumSrcElts = OpVT.getVectorNumElements();
51132 SmallVector<int, 64> ConcatMask;
51133 for (unsigned i = 0; i != NumOps; ++i) {
51134 SmallVector<int, 64> SubMask;
51135 SmallVector<SDValue, 2> SubOps;
51136 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
51137 SubMask))
51138 break;
51139 for (int M : SubMask) {
51140 if (0 <= M) {
51141 M += M < NumSrcElts ? 0 : NumSrcElts;
51142 M += i * NumSrcElts;
51143 }
51144 ConcatMask.push_back(M);
51145 }
51146 }
51147 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
51148 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
51149 Ops[1].getOperand(0), DAG, DL);
51150 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
51151 Ops[1].getOperand(2), DAG, DL);
51152 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
51153 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
51154 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
51155 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
51156 }
51157 }
51158 break;
51159 case X86ISD::VSHLI:
51160 case X86ISD::VSRLI:
51161 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
51162 // TODO: Move this to LowerScalarImmediateShift?
51163 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
51164 llvm::all_of(Ops, [](SDValue Op) {
51165 return Op.getConstantOperandAPInt(1) == 32;
51166 })) {
51167 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
51168 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
51169 if (Op0.getOpcode() == X86ISD::VSHLI) {
51170 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
51171 {8, 0, 8, 2, 8, 4, 8, 6});
51172 } else {
51173 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
51174 {1, 8, 3, 8, 5, 8, 7, 8});
51175 }
51176 return DAG.getBitcast(VT, Res);
51177 }
51178 LLVM_FALLTHROUGH;
51179 case X86ISD::VSRAI:
51180 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
51181 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
51182 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
51183 llvm::all_of(Ops, [Op0](SDValue Op) {
51184 return Op0.getOperand(1) == Op.getOperand(1);
51185 })) {
51186 return DAG.getNode(Op0.getOpcode(), DL, VT,
51187 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
51188 }
51189 break;
51190 case X86ISD::VPERMI:
51191 case X86ISD::VROTLI:
51192 case X86ISD::VROTRI:
51193 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
51194 llvm::all_of(Ops, [Op0](SDValue Op) {
51195 return Op0.getOperand(1) == Op.getOperand(1);
51196 })) {
51197 return DAG.getNode(Op0.getOpcode(), DL, VT,
51198 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
51199 }
51200 break;
51201 case ISD::AND:
51202 case ISD::OR:
51203 case ISD::XOR:
51204 case X86ISD::ANDNP:
51205 // TODO: Add 256-bit support.
51206 if (!IsSplat && VT.is512BitVector()) {
51207 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
51208 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
51209 NumOps * SrcVT.getVectorNumElements());
51210 return DAG.getNode(Op0.getOpcode(), DL, VT,
51211 ConcatSubOperand(SrcVT, Ops, 0),
51212 ConcatSubOperand(SrcVT, Ops, 1));
51213 }
51214 break;
51215 case X86ISD::HADD:
51216 case X86ISD::HSUB:
51217 case X86ISD::FHADD:
51218 case X86ISD::FHSUB:
51219 case X86ISD::PACKSS:
51220 case X86ISD::PACKUS:
51221 if (!IsSplat && VT.is256BitVector() &&
51222 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
51223 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
51224 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
51225 NumOps * SrcVT.getVectorNumElements());
51226 return DAG.getNode(Op0.getOpcode(), DL, VT,
51227 ConcatSubOperand(SrcVT, Ops, 0),
51228 ConcatSubOperand(SrcVT, Ops, 1));
51229 }
51230 break;
51231 case X86ISD::PALIGNR:
51232 if (!IsSplat &&
51233 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
51234 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
51235 llvm::all_of(Ops, [Op0](SDValue Op) {
51236 return Op0.getOperand(2) == Op.getOperand(2);
51237 })) {
51238 return DAG.getNode(Op0.getOpcode(), DL, VT,
51239 ConcatSubOperand(VT, Ops, 0),
51240 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
51241 }
51242 break;
51243 }
51244 }
51245
51246 // Fold subvector loads into one.
51247 // If needed, look through bitcasts to get to the load.
51248 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
51249 bool Fast;
51250 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
51251 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51252 *FirstLd->getMemOperand(), &Fast) &&
51253 Fast) {
51254 if (SDValue Ld =
51255 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
51256 return Ld;
51257 }
51258 }
51259
51260 return SDValue();
51261}
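// Illustrative sketch, not part of the original source: two of the folds above
// on concrete types, in DAG shorthand.
//   concat_vectors (v4f32 (X86ISD::VBROADCAST x)), (v4f32 (X86ISD::VBROADCAST x))
//     --> v8f32 (X86ISD::VBROADCAST x)
//   concat_vectors (X86ISD::VSHLI v2i64 A, 32), (X86ISD::VSHLI v2i64 B, 32)
//     on AVX1 (no AVX2) --> bitcast of a v8i32 shuffle of concat(A,B) with zero
//     using mask {8,0,8,2,8,4,8,6}, i.e. {0, A0.lo, 0, A1.lo, 0, B0.lo, 0, B1.lo}.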
51262
51263static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
51264 TargetLowering::DAGCombinerInfo &DCI,
51265 const X86Subtarget &Subtarget) {
51266 EVT VT = N->getValueType(0);
51267 EVT SrcVT = N->getOperand(0).getValueType();
51268 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51269
51270 // Don't do anything for i1 vectors.
51271 if (VT.getVectorElementType() == MVT::i1)
51272 return SDValue();
51273
51274 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
51275 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
51276 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
51277 DCI, Subtarget))
51278 return R;
51279 }
51280
51281 return SDValue();
51282}
51283
51284static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
51285 TargetLowering::DAGCombinerInfo &DCI,
51286 const X86Subtarget &Subtarget) {
51287 if (DCI.isBeforeLegalizeOps())
51288 return SDValue();
51289
51290 MVT OpVT = N->getSimpleValueType(0);
51291
51292 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
51293
51294 SDLoc dl(N);
51295 SDValue Vec = N->getOperand(0);
51296 SDValue SubVec = N->getOperand(1);
51297
51298 uint64_t IdxVal = N->getConstantOperandVal(2);
51299 MVT SubVecVT = SubVec.getSimpleValueType();
51300
51301 if (Vec.isUndef() && SubVec.isUndef())
51302 return DAG.getUNDEF(OpVT);
51303
51304 // Inserting undefs/zeros into zeros/undefs is a zero vector.
51305 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
51306 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
51307 return getZeroVector(OpVT, Subtarget, DAG, dl);
51308
51309 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
51310 // If we're inserting into a zero vector and then into a larger zero vector,
51311 // just insert into the larger zero vector directly.
51312 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
51313 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
51314 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
51315 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
51316 getZeroVector(OpVT, Subtarget, DAG, dl),
51317 SubVec.getOperand(1),
51318 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
51319 }
51320
51321 // If we're inserting into a zero vector and our input was extracted from an
51322 // insert into a zero vector of the same type, and the extraction was at
51323 // least as large as the original insertion, just insert the original
51324 // subvector into a zero vector.
51325 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
51326 isNullConstant(SubVec.getOperand(1)) &&
51327 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
51328 SDValue Ins = SubVec.getOperand(0);
51329 if (isNullConstant(Ins.getOperand(2)) &&
51330 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
51331 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
51332 SubVecVT.getFixedSizeInBits())
51333 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
51334 getZeroVector(OpVT, Subtarget, DAG, dl),
51335 Ins.getOperand(1), N->getOperand(2));
51336 }
51337 }
51338
51339 // Stop here if this is an i1 vector.
51340 if (IsI1Vector)
51341 return SDValue();
51342
51343 // If this is an insert of an extract, combine to a shuffle. Don't do this
51344 // if the insert or extract can be represented with a subregister operation.
51345 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51346 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
51347 (IdxVal != 0 ||
51348 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
51349 int ExtIdxVal = SubVec.getConstantOperandVal(1);
51350 if (ExtIdxVal != 0) {
51351 int VecNumElts = OpVT.getVectorNumElements();
51352 int SubVecNumElts = SubVecVT.getVectorNumElements();
51353 SmallVector<int, 64> Mask(VecNumElts);
51354 // First create an identity shuffle mask.
51355 for (int i = 0; i != VecNumElts; ++i)
51356 Mask[i] = i;
51357 // Now insert the extracted portion.
51358 for (int i = 0; i != SubVecNumElts; ++i)
51359 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
51360
51361 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
51362 }
51363 }
51364
51365 // Match concat_vector style patterns.
51366 SmallVector<SDValue, 2> SubVectorOps;
51367 if (collectConcatOps(N, SubVectorOps)) {
51368 if (SDValue Fold =
51369 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
51370 return Fold;
51371
51372 // If we're inserting all zeros into the upper half, change this to
51373 // a concat with zero. We will match this to a move
51374 // with implicit upper bit zeroing during isel.
51375 // We do this here because we don't want combineConcatVectorOps to
51376 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
51377 if (SubVectorOps.size() == 2 &&
51378 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
51379 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
51380 getZeroVector(OpVT, Subtarget, DAG, dl),
51381 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
51382 }
51383
51384 // If this is a broadcast insert into an upper undef, use a larger broadcast.
51385 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
51386 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
51387
51388 // If this is a broadcast load inserted into an upper undef, use a larger
51389 // broadcast load.
51390 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
51391 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
51392 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
51393 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
51394 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
51395 SDValue BcastLd =
51396 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
51397 MemIntr->getMemoryVT(),
51398 MemIntr->getMemOperand());
51399 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
51400 return BcastLd;
51401 }
51402
51403 // If we're splatting the lower half subvector of a full vector load into the
51404 // upper half, attempt to create a subvector broadcast.
51405 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
51406 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
51407 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
51408 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
51409 if (VecLd && SubLd &&
51410 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
51411 SubVec.getValueSizeInBits() / 8, 0))
51412 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
51413 SubLd, 0, DAG);
51414 }
51415
51416 return SDValue();
51417}
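// Illustrative sketch, not part of the original source: the insert-of-extract
// to shuffle fold in combineInsertSubvector above, on a concrete case.
//   insert_subvector v8i32 A, (extract_subvector v8i32 B, 4), 4
// becomes vector_shuffle A, B with mask {0,1,2,3,12,13,14,15}: the identity
// mask for A's lanes, with the inserted lanes redirected to B's upper half
// (B's lanes are numbered 8..15 in a two-input shuffle).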
51418
51419/// If we are extracting a subvector of a vector select and the select condition
51420/// is composed of concatenated vectors, try to narrow the select width. This
51421/// is a common pattern for AVX1 integer code because 256-bit selects may be
51422/// legal, but there is almost no integer math/logic available for 256-bit.
51423/// This function should only be called with legal types (otherwise, the calls
51424/// to get simple value types will assert).
51425static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
51426 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
51427 SmallVector<SDValue, 4> CatOps;
51428 if (Sel.getOpcode() != ISD::VSELECT ||
51429 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
51430 return SDValue();
51431
51432 // Note: We assume simple value types because this should only be called with
51433 // legal operations/types.
51434 // TODO: This can be extended to handle extraction to 256-bits.
51435 MVT VT = Ext->getSimpleValueType(0);
51436 if (!VT.is128BitVector())
51437 return SDValue();
51438
51439 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
51440 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
51441 return SDValue();
51442
51443 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
51444 MVT SelVT = Sel.getSimpleValueType();
51445 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
51446 "Unexpected vector type with legal operations");
51447
51448 unsigned SelElts = SelVT.getVectorNumElements();
51449 unsigned CastedElts = WideVT.getVectorNumElements();
51450 unsigned ExtIdx = Ext->getConstantOperandVal(1);
51451 if (SelElts % CastedElts == 0) {
51452 // The select has the same or more (narrower) elements than the extract
51453 // operand. The extraction index gets scaled by that factor.
51454 ExtIdx *= (SelElts / CastedElts);
51455 } else if (CastedElts % SelElts == 0) {
51456 // The select has less (wider) elements than the extract operand. Make sure
51457 // that the extraction index can be divided evenly.
51458 unsigned IndexDivisor = CastedElts / SelElts;
51459 if (ExtIdx % IndexDivisor != 0)
51460 return SDValue();
51461 ExtIdx /= IndexDivisor;
51462 } else {
51463 llvm_unreachable("Element count of simple vector types are not divisible?");
51464 }
51465
51466 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
51467 unsigned NarrowElts = SelElts / NarrowingFactor;
51468 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
51469 SDLoc DL(Ext);
51470 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
51471 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
51472 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
51473 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
51474 return DAG.getBitcast(VT, NarrowSel);
51475}
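// Illustrative sketch, not part of the original source: the index scaling in
// narrowExtractedVectorSelect on a concrete case. Extracting the upper v4f32
// (ExtIdx = 4) of a v8f32 bitcast of a v16i16 vselect gives CastedElts = 8 and
// SelElts = 16, so ExtIdx is rescaled to 4 * (16 / 8) = 8 and the select is
// rebuilt as a v8i16 vselect on the upper 128-bit halves of its condition and
// value operands.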
51476
51477static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
51478 TargetLowering::DAGCombinerInfo &DCI,
51479 const X86Subtarget &Subtarget) {
51480 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
51481 // eventually get combined/lowered into ANDNP) with a concatenated operand,
51482 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
51483 // We let generic combining take over from there to simplify the
51484 // insert/extract and 'not'.
51485 // This pattern emerges during AVX1 legalization. We handle it before lowering
51486 // to avoid complications like splitting constant vector loads.
51487
51488 // Capture the original wide type in the likely case that we need to bitcast
51489 // back to this type.
51490 if (!N->getValueType(0).isSimple())
51491 return SDValue();
51492
51493 MVT VT = N->getSimpleValueType(0);
51494 SDValue InVec = N->getOperand(0);
51495 unsigned IdxVal = N->getConstantOperandVal(1);
51496 SDValue InVecBC = peekThroughBitcasts(InVec);
51497 EVT InVecVT = InVec.getValueType();
51498 unsigned SizeInBits = VT.getSizeInBits();
51499 unsigned InSizeInBits = InVecVT.getSizeInBits();
51500 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51501
51502 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
51503 TLI.isTypeLegal(InVecVT) &&
51504 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
51505 auto isConcatenatedNot = [](SDValue V) {
51506 V = peekThroughBitcasts(V);
51507 if (!isBitwiseNot(V))
51508 return false;
51509 SDValue NotOp = V->getOperand(0);
51510 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
51511 };
51512 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
51513 isConcatenatedNot(InVecBC.getOperand(1))) {
51514 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
51515 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
51516 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
51517 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
51518 }
51519 }
51520
51521 if (DCI.isBeforeLegalizeOps())
51522 return SDValue();
51523
51524 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
51525 return V;
51526
51527 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
51528 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
51529
51530 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
51531 if (VT.getScalarType() == MVT::i1)
51532 return DAG.getConstant(1, SDLoc(N), VT);
51533 return getOnesVector(VT, DAG, SDLoc(N));
51534 }
51535
51536 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
51537 return DAG.getBuildVector(
51538 VT, SDLoc(N),
51539 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
51540
51541 // If we are extracting from an insert into a zero vector, replace with a
51542 // smaller insert into zero as long as the extraction is at least as large as
51543 // the originally inserted subvector. Don't do this for i1 vectors.
51544 if (VT.getVectorElementType() != MVT::i1 &&
51545 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
51546 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
51547 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
51548 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
51549 SDLoc DL(N);
51550 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
51551 getZeroVector(VT, Subtarget, DAG, DL),
51552 InVec.getOperand(1), InVec.getOperand(2));
51553 }
51554
51555 // If we're extracting an upper subvector from a broadcast we should just
51556 // extract the lowest subvector instead which should allow
51557 // SimplifyDemandedVectorElts to do more simplifications.
51558 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
51559 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
51560 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
51561 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
51562
51563 // If we're extracting a broadcasted subvector, just use the lowest subvector.
51564 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51565 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
51566 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
51567
51568 // Attempt to extract from the source of a shuffle vector.
51569 if ((InSizeInBits % SizeInBits) == 0 &&
51570 (IdxVal % VT.getVectorNumElements()) == 0) {
51571 SmallVector<int, 32> ShuffleMask;
51572 SmallVector<int, 32> ScaledMask;
51573 SmallVector<SDValue, 2> ShuffleInputs;
51574 unsigned NumSubVecs = InSizeInBits / SizeInBits;
51575 // Decode the shuffle mask and scale it so it's shuffling subvectors.
51576 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
51577 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
51578 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
51579 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
51580 return DAG.getUNDEF(VT);
51581 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
51582 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
51583 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
51584 if (Src.getValueSizeInBits() == InSizeInBits) {
51585 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
51586 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
51587 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
51588 SDLoc(N), SizeInBits);
51589 }
51590 }
51591 }
51592
51593 // If we're extracting the lowest subvector and we're the only user,
51594 // we may be able to perform this with a smaller vector width.
51595 unsigned InOpcode = InVec.getOpcode();
51596 if (IdxVal == 0 && InVec.hasOneUse()) {
51597 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
51598 // v2f64 CVTDQ2PD(v4i32).
51599 if (InOpcode == ISD::SINT_TO_FP &&
51600 InVec.getOperand(0).getValueType() == MVT::v4i32) {
51601 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
51602 }
51603 // v2f64 CVTUDQ2PD(v4i32).
51604 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
51605 InVec.getOperand(0).getValueType() == MVT::v4i32) {
51606 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
51607 }
51608 // v2f64 CVTPS2PD(v4f32).
51609 if (InOpcode == ISD::FP_EXTEND &&
51610 InVec.getOperand(0).getValueType() == MVT::v4f32) {
51611 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
51612 }
51613 }
51614 if ((InOpcode == ISD::ANY_EXTEND ||
51615 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
51616 InOpcode == ISD::ZERO_EXTEND ||
51617 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
51618 InOpcode == ISD::SIGN_EXTEND ||
51619 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
51620 (SizeInBits == 128 || SizeInBits == 256) &&
51621 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
51622 SDLoc DL(N);
51623 SDValue Ext = InVec.getOperand(0);
51624 if (Ext.getValueSizeInBits() > SizeInBits)
51625 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
51626 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
51627 return DAG.getNode(ExtOp, DL, VT, Ext);
51628 }
51629 if (InOpcode == ISD::VSELECT &&
51630 InVec.getOperand(0).getValueType().is256BitVector() &&
51631 InVec.getOperand(1).getValueType().is256BitVector() &&
51632 InVec.getOperand(2).getValueType().is256BitVector()) {
51633 SDLoc DL(N);
51634 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
51635 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
51636 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
51637 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
51638 }
51639 if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
51640 (VT.is128BitVector() || VT.is256BitVector())) {
51641 SDLoc DL(N);
51642 SDValue InVecSrc = InVec.getOperand(0);
51643 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
51644 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
51645 return DAG.getNode(InOpcode, DL, VT, Ext);
51646 }
51647 }
51648
51649 // Always split vXi64 logical shifts where we're extracting the upper 32 bits
51650 // as this is very likely to fold into a shuffle/truncation.
51651 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
51652 InVecVT.getScalarSizeInBits() == 64 &&
51653 InVec.getConstantOperandAPInt(1) == 32) {
51654 SDLoc DL(N);
51655 SDValue Ext =
51656 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
51657 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
51658 }
51659
51660 return SDValue();
51661}
51662
51663static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
51664 EVT VT = N->getValueType(0);
51665 SDValue Src = N->getOperand(0);
51666 SDLoc DL(N);
51667
51668 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
51669 // This occurs frequently in our masked scalar intrinsic code and our
51670 // floating point select lowering with AVX512.
51671 // TODO: SimplifyDemandedBits instead?
51672 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
51673 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
51674 if (C->getAPIntValue().isOneValue())
51675 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
51676 Src.getOperand(0));
51677
51678 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
51679 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51680 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
51681 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
51682 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
51683 if (C->isNullValue())
51684 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
51685 Src.getOperand(1));
51686
51687 // Reduce v2i64 to v4i32 if we don't need the upper bits.
51688 // TODO: Move to DAGCombine/SimplifyDemandedBits?
51689 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
51690 auto IsAnyExt64 = [](SDValue Op) {
51691 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
51692 return SDValue();
51693 if (Op.getOpcode() == ISD::ANY_EXTEND &&
51694 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
51695 return Op.getOperand(0);
51696 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
51697 if (Ld->getExtensionType() == ISD::EXTLOAD &&
51698 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
51699 return Op;
51700 return SDValue();
51701 };
51702 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
51703 return DAG.getBitcast(
51704 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
51705 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
51706 }
51707
51708 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
51709 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
51710 Src.getOperand(0).getValueType() == MVT::x86mmx)
51711 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
51712
51713 // See if we're broadcasting the scalar value, in which case just reuse that.
51714 // Ensure the same SDValue from the SDNode use is being used.
51715 if (VT.getScalarType() == Src.getValueType())
51716 for (SDNode *User : Src->uses())
51717 if (User->getOpcode() == X86ISD::VBROADCAST &&
51718 Src == User->getOperand(0)) {
51719 unsigned SizeInBits = VT.getFixedSizeInBits();
51720 unsigned BroadcastSizeInBits =
51721 User->getValueSizeInBits(0).getFixedSize();
51722 if (BroadcastSizeInBits == SizeInBits)
51723 return SDValue(User, 0);
51724 if (BroadcastSizeInBits > SizeInBits)
51725 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
51726 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
51727 // coverage.
51728 }
51729
51730 return SDValue();
51731}
51732
51733// Simplify PMULDQ and PMULUDQ operations.
51734static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
51735 TargetLowering::DAGCombinerInfo &DCI,
51736 const X86Subtarget &Subtarget) {
51737 SDValue LHS = N->getOperand(0);
51738 SDValue RHS = N->getOperand(1);
51739
51740 // Canonicalize constant to RHS.
51741 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
51742 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
51743 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
51744
51745 // Multiply by zero.
51746 // Don't return RHS as it may contain UNDEFs.
51747 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
51748 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
51749
51750 // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
51751 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51752 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
51753 return SDValue(N, 0);
51754
51755 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
51756 // convert it to any_extend_invec, due to the LegalOperations check, do the
51757 // conversion directly to a vector shuffle manually. This exposes combine
51758 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
51759 // combineX86ShufflesRecursively on SSE4.1 targets.
51760 // FIXME: This is basically a hack around several other issues related to
51761 // ANY_EXTEND_VECTOR_INREG.
51762 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
51763 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
51764 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
51765 LHS.getOperand(0).getValueType() == MVT::v4i32) {
51766 SDLoc dl(N);
51767 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
51768 LHS.getOperand(0), { 0, -1, 1, -1 });
51769 LHS = DAG.getBitcast(MVT::v2i64, LHS);
51770 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
51771 }
51772 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
51773 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
51774 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
51775 RHS.getOperand(0).getValueType() == MVT::v4i32) {
51776 SDLoc dl(N);
51777 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
51778 RHS.getOperand(0), { 0, -1, 1, -1 });
51779 RHS = DAG.getBitcast(MVT::v2i64, RHS);
51780 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
51781 }
51782
51783 return SDValue();
51784}
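// Illustrative sketch, not part of the original source: the shuffle used in
// combinePMULDQ above for a zero/sign_extend_vector_inreg operand X (v4i32).
// The mask {0,-1,1,-1} builds the v4i32 {X0, undef, X1, undef}, which viewed
// as v2i64 has X0 and X1 in the low 32 bits of each element, the only bits
// PMULDQ/PMULUDQ read, so the explicit extension node can be dropped.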
51785
51786// Simplify VPMADDUBSW/VPMADDWD operations.
51787static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
51788 TargetLowering::DAGCombinerInfo &DCI) {
51789 SDValue LHS = N->getOperand(0);
51790 SDValue RHS = N->getOperand(1);
51791
51792 // Multiply by zero.
51793 // Don't return LHS/RHS as it may contain UNDEFs.
51794 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
51795 ISD::isBuildVectorAllZeros(RHS.getNode()))
51796 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
51797
51798 return SDValue();
51799}
51800
51801static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
51802 TargetLowering::DAGCombinerInfo &DCI,
51803 const X86Subtarget &Subtarget) {
51804 EVT VT = N->getValueType(0);
51805 SDValue In = N->getOperand(0);
51806 unsigned Opcode = N->getOpcode();
51807 unsigned InOpcode = In.getOpcode();
51808 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51809
51810 // Try to merge vector loads and extend_inreg to an extload.
51811 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
51812 In.hasOneUse()) {
51813 auto *Ld = cast<LoadSDNode>(In);
51814 if (Ld->isSimple()) {
51815 MVT SVT = In.getSimpleValueType().getVectorElementType();
51816 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
51817 ? ISD::SEXTLOAD
51818 : ISD::ZEXTLOAD;
51819 EVT MemVT = VT.changeVectorElementType(SVT);
51820 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
51821 SDValue Load =
51822 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
51823 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
51824 Ld->getMemOperand()->getFlags());
51825 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
51826 return Load;
51827 }
51828 }
51829 }
51830
51831 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
51832 if (Opcode == InOpcode)
51833 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
51834
51835 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
51836 // -> EXTEND_VECTOR_INREG(X).
51837 // TODO: Handle non-zero subvector indices.
51838 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
51839 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
51840 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
51841 In.getValueSizeInBits())
51842 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
51843
51844 // Attempt to combine as a shuffle.
51845 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
51846 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
51847 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
51848 SDValue Op(N, 0);
51849 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
51850 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51851 return Res;
51852 }
51853
51854 return SDValue();
51855}
51856
51857static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
51858 TargetLowering::DAGCombinerInfo &DCI) {
51859 EVT VT = N->getValueType(0);
51860
51861 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
51862 return DAG.getConstant(0, SDLoc(N), VT);
51863
51864 APInt KnownUndef, KnownZero;
51865 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51866 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
51867 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
51868 KnownZero, DCI))
51869 return SDValue(N, 0);
51870
51871 return SDValue();
51872}
51873
51874// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
51875// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
51876// extra instructions between the conversions due to going to scalar and back.
51877static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
51878 const X86Subtarget &Subtarget) {
51879 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
51880 return SDValue();
51881
51882 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
51883 return SDValue();
51884
51885 if (N->getValueType(0) != MVT::f32 ||
51886 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
51887 return SDValue();
51888
51889 SDLoc dl(N);
51890 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
51891 N->getOperand(0).getOperand(0));
51892 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
51893 DAG.getTargetConstant(4, dl, MVT::i32));
51894 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
51895 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
51896 DAG.getIntPtrConstant(0, dl));
51897}
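// Illustrative sketch, not part of the original source: the node sequence
// combineFP16_TO_FP builds for (f32 (fp16_to_fp (fp_to_fp16 x))), in DAG
// shorthand:
//   v4f32 t0 = scalar_to_vector x
//   v8i16 t1 = X86ISD::CVTPS2PH t0, 4
//   v4f32 t2 = X86ISD::CVTPH2PS t1
//   f32 result = extract_vector_elt t2, 0
// keeping the round trip in vector registers instead of going through the
// scalar conversion path.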
51898
51899static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
51900 const X86Subtarget &Subtarget) {
51901 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
51902 return SDValue();
51903
51904 if (Subtarget.hasFP16())
51905 return SDValue();
51906
51907 bool IsStrict = N->isStrictFPOpcode();
51908 EVT VT = N->getValueType(0);
51909 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
51910 EVT SrcVT = Src.getValueType();
51911
51912 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
51913 return SDValue();
51914
51915 if (VT.getVectorElementType() != MVT::f32 &&
51916 VT.getVectorElementType() != MVT::f64)
51917 return SDValue();
51918
51919 unsigned NumElts = VT.getVectorNumElements();
51920 if (NumElts == 1 || !isPowerOf2_32(NumElts))
51921 return SDValue();
51922
51923 SDLoc dl(N);
51924
51925 // Convert the input to vXi16.
51926 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
51927 Src = DAG.getBitcast(IntVT, Src);
51928
51929 // Widen to at least 8 input elements.
51930 if (NumElts < 8) {
51931 unsigned NumConcats = 8 / NumElts;
51932 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
51933 : DAG.getConstant(0, dl, IntVT);
51934 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
51935 Ops[0] = Src;
51936 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
51937 }
51938
51939 // Destination is vXf32 with at least 4 elements.
51940 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
51941 std::max(4U, NumElts));
51942 SDValue Cvt, Chain;
51943 if (IsStrict) {
51944 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
51945 {N->getOperand(0), Src});
51946 Chain = Cvt.getValue(1);
51947 } else {
51948 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
51949 }
51950
51951 if (NumElts < 4) {
51952 assert(NumElts == 2 && "Unexpected size");
51953 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
51954 DAG.getIntPtrConstant(0, dl));
51955 }
51956
51957 if (IsStrict) {
51958 // Extend to the original VT if necessary.
51959 if (Cvt.getValueType() != VT) {
51960 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
51961 {Chain, Cvt});
51962 Chain = Cvt.getValue(1);
51963 }
51964 return DAG.getMergeValues({Cvt, Chain}, dl);
51965 }
51966
51967 // Extend to the original VT if necessary.
51968 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
51969}
51970
51971// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
51972// from. Limit this to cases where the loads have the same input chain and the
51973// output chains are unused. This avoids any memory ordering issues.
51974static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
51975 TargetLowering::DAGCombinerInfo &DCI) {
51976 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51977 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
51978 "Unknown broadcast load type");
51979
51980 // Only do this if the chain result is unused.
51981 if (N->hasAnyUseOfValue(1))
51982 return SDValue();
51983
51984 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
51985
51986 SDValue Ptr = MemIntrin->getBasePtr();
51987 SDValue Chain = MemIntrin->getChain();
51988 EVT VT = N->getSimpleValueType(0);
51989 EVT MemVT = MemIntrin->getMemoryVT();
51990
51991 // Look at other users of our base pointer and try to find a wider broadcast.
51992 // The input chain and the size of the memory VT must match.
51993 for (SDNode *User : Ptr->uses())
51994 if (User != N && User->getOpcode() == N->getOpcode() &&
51995 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51996 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51997 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51998 MemVT.getSizeInBits() &&
51999 !User->hasAnyUseOfValue(1) &&
52000 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
52001 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
52002 VT.getSizeInBits());
52003 Extract = DAG.getBitcast(VT, Extract);
52004 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52005 }
52006
52007 return SDValue();
52008}
52009
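// [Editor's note, illustrative only and not from the original source: a
//  before/after sketch of the combine above in SelectionDAG notation; value
//  numbers are invented.]
//
//   Before:  t3: v4f32,ch = X86ISD::VBROADCAST_LOAD<(load 4)> t0, t2    <-- N
//            t5: v8f32,ch = X86ISD::VBROADCAST_LOAD<(load 4)> t0, t2    <-- wider user of the same pointer
//   After:   N is replaced by (extract_subvector t5, 0), bitcast to N's type.
//            Both chain results must be unused, so memory ordering is unchanged.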
52010static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
52011 const X86Subtarget &Subtarget) {
52012 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
52013 return SDValue();
52014
52015 if (Subtarget.hasFP16())
52016 return SDValue();
52017
52018 EVT VT = N->getValueType(0);
52019 SDValue Src = N->getOperand(0);
52020 EVT SrcVT = Src.getValueType();
52021
52022 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
52023 SrcVT.getVectorElementType() != MVT::f32)
52024 return SDValue();
52025
52026 unsigned NumElts = VT.getVectorNumElements();
52027 if (NumElts == 1 || !isPowerOf2_32(NumElts))
52028 return SDValue();
52029
52030 SDLoc dl(N);
52031
52032 // Widen to at least 4 input elements.
52033 if (NumElts < 4)
52034 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
52035 DAG.getConstantFP(0.0, dl, SrcVT));
52036
52037 // Destination is v8i16 with at least 8 elements.
52038 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52039 std::max(8U, NumElts));
52040 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
52041 DAG.getTargetConstant(4, dl, MVT::i32));
52042
52043 // Extract down to real number of elements.
52044 if (NumElts < 8) {
52045 EVT IntVT = VT.changeVectorElementTypeToInteger();
52046 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
52047 DAG.getIntPtrConstant(0, dl));
52048 }
52049
52050 return DAG.getBitcast(VT, Cvt);
52051}
52052
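// [Editor's illustrative sketch -- not part of the original file. The combine
//  above emits X86ISD::CVTPS2PH with immediate 4, which appears to match
//  _MM_FROUND_CUR_DIRECTION (0x04), i.e. "round under the current MXCSR mode".
//  Helper name and the F16C-capable target are the editor's assumptions.]
#include <immintrin.h>

// Narrow four floats to four IEEE half values; the 16-bit results land in the
// low 64 bits of the returned XMM register (vcvtps2ph).
static inline __m128i narrowFloatToHalf(__m128 V) {
  return _mm_cvtps_ph(V, _MM_FROUND_CUR_DIRECTION);
}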
52053static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
52054 SDValue Src = N->getOperand(0);
52055
52056 // Turn MOVDQ2Q+simple_load into an mmx load.
52057 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52058 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
52059
52060 if (LN->isSimple()) {
52061 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
52062 LN->getBasePtr(),
52063 LN->getPointerInfo(),
52064 LN->getOriginalAlign(),
52065 LN->getMemOperand()->getFlags());
52066 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
52067 return NewLd;
52068 }
52069 }
52070
52071 return SDValue();
52072}
52073
52074static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
52075 TargetLowering::DAGCombinerInfo &DCI) {
52076 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
52077 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52078 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
52079 APInt::getAllOnesValue(NumBits), DCI))
52080 return SDValue(N, 0);
52081
52082 return SDValue();
52083}
52084
52085SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
52086 DAGCombinerInfo &DCI) const {
52087 SelectionDAG &DAG = DCI.DAG;
52088 switch (N->getOpcode()) {
52089 default: break;
52090 case ISD::SCALAR_TO_VECTOR:
52091 return combineScalarToVector(N, DAG);
52092 case ISD::EXTRACT_VECTOR_ELT:
52093 case X86ISD::PEXTRW:
52094 case X86ISD::PEXTRB:
52095 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
52096 case ISD::CONCAT_VECTORS:
52097 return combineConcatVectors(N, DAG, DCI, Subtarget);
52098 case ISD::INSERT_SUBVECTOR:
52099 return combineInsertSubvector(N, DAG, DCI, Subtarget);
52100 case ISD::EXTRACT_SUBVECTOR:
52101 return combineExtractSubvector(N, DAG, DCI, Subtarget);
52102 case ISD::VSELECT:
52103 case ISD::SELECT:
52104 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
52105 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
52106 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
52107 case X86ISD::CMP: return combineCMP(N, DAG);
52108 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
52109 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
52110 case X86ISD::ADD:
52111 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
52112 case X86ISD::SBB: return combineSBB(N, DAG);
52113 case X86ISD::ADC: return combineADC(N, DAG, DCI);
52114 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
52115 case ISD::SHL: return combineShiftLeft(N, DAG);
52116 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
52117 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
52118 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
52119 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
52120 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
52121 case X86ISD::BEXTR:
52122 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
52123 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
52124 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
52125 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
52126 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
52127 case X86ISD::VEXTRACT_STORE:
52128 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
52129 case ISD::SINT_TO_FP:
52130 case ISD::STRICT_SINT_TO_FP:
52131 return combineSIntToFP(N, DAG, DCI, Subtarget);
52132 case ISD::UINT_TO_FP:
52133 case ISD::STRICT_UINT_TO_FP:
52134 return combineUIntToFP(N, DAG, Subtarget);
52135 case ISD::FADD:
52136 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
52137 case X86ISD::VFCMULC:
52138 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
52139 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
52140 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
52141 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
52142 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
52143 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
52144 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
52145 case X86ISD::FXOR:
52146 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
52147 case X86ISD::FMIN:
52148 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
52149 case ISD::FMINNUM:
52150 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
52151 case X86ISD::CVTSI2P:
52152 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
52153 case X86ISD::CVTP2SI:
52154 case X86ISD::CVTP2UI:
52155 case X86ISD::STRICT_CVTTP2SI:
52156 case X86ISD::CVTTP2SI:
52157 case X86ISD::STRICT_CVTTP2UI:
52158 case X86ISD::CVTTP2UI:
52159 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
52160 case X86ISD::STRICT_CVTPH2PS:
52161 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
52162 case X86ISD::BT: return combineBT(N, DAG, DCI);
52163 case ISD::ANY_EXTEND:
52164 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
52165 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
52166 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
52167 case ISD::ANY_EXTEND_VECTOR_INREG:
52168 case ISD::SIGN_EXTEND_VECTOR_INREG:
52169 case ISD::ZERO_EXTEND_VECTOR_INREG:
52170 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
52171 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
52172 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
52173 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
52174 case X86ISD::PACKSS:
52175 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
52176 case X86ISD::HADD:
52177 case X86ISD::HSUB:
52178 case X86ISD::FHADD:
52179 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
52180 case X86ISD::VSHL:
52181 case X86ISD::VSRA:
52182 case X86ISD::VSRL:
52183 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
52184 case X86ISD::VSHLI:
52185 case X86ISD::VSRAI:
52186 case X86ISD::VSRLI:
52187 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
52188 case ISD::INSERT_VECTOR_ELT:
52189 case X86ISD::PINSRB:
52190 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
52191 case X86ISD::SHUFP: // Handle all target specific shuffles
52192 case X86ISD::INSERTPS:
52193 case X86ISD::EXTRQI:
52194 case X86ISD::INSERTQI:
52195 case X86ISD::VALIGN:
52196 case X86ISD::PALIGNR:
52197 case X86ISD::VSHLDQ:
52198 case X86ISD::VSRLDQ:
52199 case X86ISD::BLENDI:
52200 case X86ISD::UNPCKH:
52201 case X86ISD::UNPCKL:
52202 case X86ISD::MOVHLPS:
52203 case X86ISD::MOVLHPS:
52204 case X86ISD::PSHUFB:
52205 case X86ISD::PSHUFD:
52206 case X86ISD::PSHUFHW:
52207 case X86ISD::PSHUFLW:
52208 case X86ISD::MOVSHDUP:
52209 case X86ISD::MOVSLDUP:
52210 case X86ISD::MOVDDUP:
52211 case X86ISD::MOVSS:
52212 case X86ISD::MOVSD:
52213 case X86ISD::MOVSH:
52214 case X86ISD::VBROADCAST:
52215 case X86ISD::VPPERM:
52216 case X86ISD::VPERMI:
52217 case X86ISD::VPERMV:
52218 case X86ISD::VPERMV3:
52219 case X86ISD::VPERMIL2:
52220 case X86ISD::VPERMILPI:
52221 case X86ISD::VPERMILPV:
52222 case X86ISD::VPERM2X128:
52223 case X86ISD::SHUF128:
52224 case X86ISD::VZEXT_MOVL:
52225 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
52226 case X86ISD::FMADD_RND:
52227 case X86ISD::FMSUB:
52228 case X86ISD::STRICT_FMSUB:
52229 case X86ISD::FMSUB_RND:
52230 case X86ISD::FNMADD:
52231 case X86ISD::STRICT_FNMADD:
52232 case X86ISD::FNMADD_RND:
52233 case X86ISD::FNMSUB:
52234 case X86ISD::STRICT_FNMSUB:
52235 case X86ISD::FNMSUB_RND:
52236 case ISD::FMA:
52237 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
52238 case X86ISD::FMADDSUB_RND:
52239 case X86ISD::FMSUBADD_RND:
52240 case X86ISD::FMADDSUB:
52241 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
52242 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
52243 case X86ISD::MGATHER:
52244 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
52245 case ISD::MGATHER:
52246 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
52247 case X86ISD::PCMPEQ:
52248 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
52249 case X86ISD::PMULDQ:
52250 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
52251 case X86ISD::VPMADDUBSW:
52252 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
52253 case X86ISD::KSHIFTL:
52254 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
52255 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
52256 case ISD::STRICT_FP_EXTEND:
52257 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
52258 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
52259 case X86ISD::VBROADCAST_LOAD:
52260 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
52261 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
52262 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
52263 }
52264
52265 return SDValue();
52266}
52267
52268bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
52269 if (!isTypeLegal(VT))
52270 return false;
52271
52272 // There are no vXi8 shifts.
52273 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
52274 return false;
52275
52276 // TODO: Almost no 8-bit ops are desirable because they have no actual
52277 // size/speed advantages vs. 32-bit ops, but they do have a major
52278 // potential disadvantage by causing partial register stalls.
52279 //
52280 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
52281 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
52282 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
52283 // check for a constant operand to the multiply.
52284 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
52285 return false;
52286
52287 // i16 instruction encodings are longer and some i16 instructions are slow,
52288 // so those are not desirable.
52289 if (VT == MVT::i16) {
52290 switch (Opc) {
52291 default:
52292 break;
52293 case ISD::LOAD:
52294 case ISD::SIGN_EXTEND:
52295 case ISD::ZERO_EXTEND:
52296 case ISD::ANY_EXTEND:
52297 case ISD::SHL:
52298 case ISD::SRA:
52299 case ISD::SRL:
52300 case ISD::SUB:
52301 case ISD::ADD:
52302 case ISD::MUL:
52303 case ISD::AND:
52304 case ISD::OR:
52305 case ISD::XOR:
52306 return false;
52307 }
52308 }
52309
52310 // Any legal type not explicitly accounted for above here is desirable.
52311 return true;
52312}
52313
52314SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
52315 SDValue Value, SDValue Addr,
52316 SelectionDAG &DAG) const {
52317 const Module *M = DAG.getMachineFunction().getMMI().getModule();
52318 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
52319 if (IsCFProtectionSupported) {
52320 // If control-flow branch protection is enabled, we need to add the
52321 // notrack prefix to the indirect branch.
52322 // To do that we create an NT_BRIND SDNode.
52323 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
52324 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
52325 }
52326
52327 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
52328}
52329
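// [Editor's hedged sketch, not from the original file: the NT_BRIND path above
//  is exercised when a module is built with -fcf-protection=branch, which sets
//  the "cf-protection-branch" module flag. A switch that the backend chooses to
//  lower through a jump table, such as the invented example below, may then be
//  emitted as a "notrack jmp" through the table.]
static int dispatchExample(int K) {
  switch (K) {
  case 0: return 10;
  case 1: return 11;
  case 2: return 12;
  case 3: return 13;
  case 4: return 14;
  case 5: return 15;
  default: return -1;
  }
}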
52330bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
52331 EVT VT = Op.getValueType();
52332 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
52333 isa<ConstantSDNode>(Op.getOperand(1));
52334
52335 // i16 is legal, but undesirable since i16 instruction encodings are longer
52336 // and some i16 instructions are slow.
52337 // 8-bit multiply-by-constant can usually be expanded to something cheaper
52338 // using LEA and/or other ALU ops.
52339 if (VT != MVT::i16 && !Is8BitMulByConstant)
52340 return false;
52341
52342 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
52343 if (!Op.hasOneUse())
52344 return false;
52345 SDNode *User = *Op->use_begin();
52346 if (!ISD::isNormalStore(User))
52347 return false;
52348 auto *Ld = cast<LoadSDNode>(Load);
52349 auto *St = cast<StoreSDNode>(User);
52350 return Ld->getBasePtr() == St->getBasePtr();
52351 };
52352
52353 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
52354 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
52355 return false;
52356 if (!Op.hasOneUse())
52357 return false;
52358 SDNode *User = *Op->use_begin();
52359 if (User->getOpcode() != ISD::ATOMIC_STORE)
52360 return false;
52361 auto *Ld = cast<AtomicSDNode>(Load);
52362 auto *St = cast<AtomicSDNode>(User);
52363 return Ld->getBasePtr() == St->getBasePtr();
52364 };
52365
52366 bool Commute = false;
52367 switch (Op.getOpcode()) {
52368 default: return false;
52369 case ISD::SIGN_EXTEND:
52370 case ISD::ZERO_EXTEND:
52371 case ISD::ANY_EXTEND:
52372 break;
52373 case ISD::SHL:
52374 case ISD::SRA:
52375 case ISD::SRL: {
52376 SDValue N0 = Op.getOperand(0);
52377 // Look out for (store (shl (load), x)).
52378 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
52379 return false;
52380 break;
52381 }
52382 case ISD::ADD:
52383 case ISD::MUL:
52384 case ISD::AND:
52385 case ISD::OR:
52386 case ISD::XOR:
52387 Commute = true;
52388 LLVM_FALLTHROUGH;
52389 case ISD::SUB: {
52390 SDValue N0 = Op.getOperand(0);
52391 SDValue N1 = Op.getOperand(1);
52392 // Avoid disabling potential load folding opportunities.
52393 if (MayFoldLoad(N1) &&
52394 (!Commute || !isa<ConstantSDNode>(N0) ||
52395 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
52396 return false;
52397 if (MayFoldLoad(N0) &&
52398 ((Commute && !isa<ConstantSDNode>(N1)) ||
52399 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
52400 return false;
52401 if (IsFoldableAtomicRMW(N0, Op) ||
52402 (Commute && IsFoldableAtomicRMW(N1, Op)))
52403 return false;
52404 }
52405 }
52406
52407 PVT = MVT::i32;
52408 return true;
52409}
52410
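// [Editor's hedged sketch, not part of the original file: with the hook above
//  returning PVT = MVT::i32, narrow i16 logic that survives the middle end, as
//  in the invented function below, is commonly selected as 32-bit instructions
//  with only the low 16 bits used, avoiding the longer 66h-prefixed encodings
//  the comments above describe.]
static unsigned short mask16(unsigned short A, unsigned short B) {
  return (unsigned short)((A & B) ^ 0x00FFu); // usually 32-bit and/xor under the hood
}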
52411//===----------------------------------------------------------------------===//
52412// X86 Inline Assembly Support
52413//===----------------------------------------------------------------------===//
52414
52415 // Helper to match an asm string against a sequence of pieces, allowing any
52415 // whitespace between them.
52416static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
52417 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
52418
52419 for (StringRef Piece : Pieces) {
52420 if (!S.startswith(Piece)) // Check if the piece matches.
52421 return false;
52422
52423 S = S.substr(Piece.size());
52424 StringRef::size_type Pos = S.find_first_not_of(" \t");
52425 if (Pos == 0) // We matched a prefix.
52426 return false;
52427
52428 S = S.substr(Pos);
52429 }
52430
52431 return S.empty();
52432}
52433
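// [Editor's note, illustrative only -- expected behavior of the helper above:]
//   matchAsm("  bswap   $0 ",  {"bswap", "$0"})  --> true   (whitespace is elastic)
//   matchAsm("bswapl $0",      {"bswap", "$0"})  --> false  (a piece must end at a
//                                                            whitespace boundary)
//   matchAsm("bswap $0 extra", {"bswap", "$0"})  --> false  (trailing text remains)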
52434static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
52435
52436 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
52437 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
52438 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
52439 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
52440
52441 if (AsmPieces.size() == 3)
52442 return true;
52443 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
52444 return true;
52445 }
52446 }
52447 return false;
52448}
52449
52450bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
52451 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
52452
52453 const std::string &AsmStr = IA->getAsmString();
52454
52455 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
52456 if (!Ty || Ty->getBitWidth() % 16 != 0)
52457 return false;
52458
52459 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
52460 SmallVector<StringRef, 4> AsmPieces;
52461 SplitString(AsmStr, AsmPieces, ";\n");
52462
52463 switch (AsmPieces.size()) {
52464 default: return false;
52465 case 1:
52466 // FIXME: this should verify that we are targeting a 486 or better. If not,
52467 // we will turn this bswap into something that will be lowered to logical
52468 // ops instead of emitting the bswap asm. For now, we don't support 486 or
52469 // lower so don't worry about this.
52470 // bswap $0
52471 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
52472 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
52473 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
52474 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
52475 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
52476 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
52477 // No need to check constraints, nothing other than the equivalent of
52478 // "=r,0" would be valid here.
52479 return IntrinsicLowering::LowerToByteSwap(CI);
52480 }
52481
52482 // rorw $$8, ${0:w} --> llvm.bswap.i16
52483 if (CI->getType()->isIntegerTy(16) &&
52484 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
52485 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
52486 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
52487 AsmPieces.clear();
52488 StringRef ConstraintsStr = IA->getConstraintString();
52489 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
52490 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
52491 if (clobbersFlagRegisters(AsmPieces))
52492 return IntrinsicLowering::LowerToByteSwap(CI);
52493 }
52494 break;
52495 case 3:
52496 if (CI->getType()->isIntegerTy(32) &&
52497 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
52498 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
52499 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
52500 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
52501 AsmPieces.clear();
52502 StringRef ConstraintsStr = IA->getConstraintString();
52503 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
52504 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
52505 if (clobbersFlagRegisters(AsmPieces))
52506 return IntrinsicLowering::LowerToByteSwap(CI);
52507 }
52508
52509 if (CI->getType()->isIntegerTy(64)) {
52510 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
52511 if (Constraints.size() >= 2 &&
52512 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
52513 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
52514 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
52515 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
52516 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
52517 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
52518 return IntrinsicLowering::LowerToByteSwap(CI);
52519 }
52520 }
52521 break;
52522 }
52523 return false;
52524}
52525
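// [Editor's hedged sketch, not from the original file: source-level x86 inline
//  asm of the shape recognized above. With Clang's usual lowering of "+r" to
//  the "=r,0" constraint pair, the asm is replaced by a call to @llvm.bswap.i32
//  instead of being emitted verbatim. The function name is invented.]
static unsigned byteswap32(unsigned X) {
  asm("bswap %0" : "+r"(X)); // recognized as "bswap $0" with constraints "=r,0"
  return X;
}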
52526static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
52527 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
52528 .Case("{@cca}", X86::COND_A)
52529 .Case("{@ccae}", X86::COND_AE)
52530 .Case("{@ccb}", X86::COND_B)
52531 .Case("{@ccbe}", X86::COND_BE)
52532 .Case("{@ccc}", X86::COND_B)
52533 .Case("{@cce}", X86::COND_E)
52534 .Case("{@ccz}", X86::COND_E)
52535 .Case("{@ccg}", X86::COND_G)
52536 .Case("{@ccge}", X86::COND_GE)
52537 .Case("{@ccl}", X86::COND_L)
52538 .Case("{@ccle}", X86::COND_LE)
52539 .Case("{@ccna}", X86::COND_BE)
52540 .Case("{@ccnae}", X86::COND_B)
52541 .Case("{@ccnb}", X86::COND_AE)
52542 .Case("{@ccnbe}", X86::COND_A)
52543 .Case("{@ccnc}", X86::COND_AE)
52544 .Case("{@ccne}", X86::COND_NE)
52545 .Case("{@ccnz}", X86::COND_NE)
52546 .Case("{@ccng}", X86::COND_LE)
52547 .Case("{@ccnge}", X86::COND_L)
52548 .Case("{@ccnl}", X86::COND_GE)
52549 .Case("{@ccnle}", X86::COND_G)
52550 .Case("{@ccno}", X86::COND_NO)
52551 .Case("{@ccnp}", X86::COND_NP)
52552 .Case("{@ccns}", X86::COND_NS)
52553 .Case("{@cco}", X86::COND_O)
52554 .Case("{@ccp}", X86::COND_P)
52555 .Case("{@ccs}", X86::COND_S)
52556 .Default(X86::COND_INVALID);
52557 return Cond;
52558}
52559
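// [Editor's hedged sketch, not from the original file: a GCC/Clang flag-output
//  operand such as "=@ccz" reaches this parser in its brace-wrapped form
//  ("{@ccz}") and maps to X86::COND_E; LowerAsmOutputForConstraint below then
//  materializes it with a SETcc. The function name is invented.]
static bool subIsZero(unsigned A, unsigned B) {
  bool Z;
  asm("subl %2, %1" : "=@ccz"(Z), "+r"(A) : "r"(B)); // Z = ((A - B) == 0)
  return Z;
}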
52560/// Given a constraint letter, return the type of constraint for this target.
52561X86TargetLowering::ConstraintType
52562X86TargetLowering::getConstraintType(StringRef Constraint) const {
52563 if (Constraint.size() == 1) {
52564 switch (Constraint[0]) {
52565 case 'R':
52566 case 'q':
52567 case 'Q':
52568 case 'f':
52569 case 't':
52570 case 'u':
52571 case 'y':
52572 case 'x':
52573 case 'v':
52574 case 'l':
52575 case 'k': // AVX512 masking registers.
52576 return C_RegisterClass;
52577 case 'a':
52578 case 'b':
52579 case 'c':
52580 case 'd':
52581 case 'S':
52582 case 'D':
52583 case 'A':
52584 return C_Register;
52585 case 'I':
52586 case 'J':
52587 case 'K':
52588 case 'N':
52589 case 'G':
52590 case 'L':
52591 case 'M':
52592 return C_Immediate;
52593 case 'C':
52594 case 'e':
52595 case 'Z':
52596 return C_Other;
52597 default:
52598 break;
52599 }
52600 }
52601 else if (Constraint.size() == 2) {
52602 switch (Constraint[0]) {
52603 default:
52604 break;
52605 case 'Y':
52606 switch (Constraint[1]) {
52607 default:
52608 break;
52609 case 'z':
52610 return C_Register;
52611 case 'i':
52612 case 'm':
52613 case 'k':
52614 case 't':
52615 case '2':
52616 return C_RegisterClass;
52617 }
52618 }
52619 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
52620 return C_Other;
52621 return TargetLowering::getConstraintType(Constraint);
52622}
52623
52624/// Examine constraint type and operand type and determine a weight value.
52625/// This object must already have been set up with the operand type
52626/// and the current alternative constraint selected.
52627TargetLowering::ConstraintWeight
52628 X86TargetLowering::getSingleConstraintMatchWeight(
52629 AsmOperandInfo &info, const char *constraint) const {
52630 ConstraintWeight weight = CW_Invalid;
52631 Value *CallOperandVal = info.CallOperandVal;
52632 // If we don't have a value, we can't do a match,
52633 // but allow it at the lowest weight.
52634 if (!CallOperandVal)
52635 return CW_Default;
52636 Type *type = CallOperandVal->getType();
52637 // Look at the constraint type.
52638 switch (*constraint) {
52639 default:
52640 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
52641 LLVM_FALLTHROUGH;
52642 case 'R':
52643 case 'q':
52644 case 'Q':
52645 case 'a':
52646 case 'b':
52647 case 'c':
52648 case 'd':
52649 case 'S':
52650 case 'D':
52651 case 'A':
52652 if (CallOperandVal->getType()->isIntegerTy())
52653 weight = CW_SpecificReg;
52654 break;
52655 case 'f':
52656 case 't':
52657 case 'u':
52658 if (type->isFloatingPointTy())
52659 weight = CW_SpecificReg;
52660 break;
52661 case 'y':
52662 if (type->isX86_MMXTy() && Subtarget.hasMMX())
52663 weight = CW_SpecificReg;
52664 break;
52665 case 'Y':
52666 if (StringRef(constraint).size() != 2)
52667 break;
52668 switch (constraint[1]) {
52669 default:
52670 return CW_Invalid;
52671 // XMM0
52672 case 'z':
52673 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
52674 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
52675 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
52676 return CW_SpecificReg;
52677 return CW_Invalid;
52678 // Conditional OpMask regs (AVX512)
52679 case 'k':
52680 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
52681 return CW_Register;
52682 return CW_Invalid;
52683 // Any MMX reg
52684 case 'm':
52685 if (type->isX86_MMXTy() && Subtarget.hasMMX())
52686 return weight;
52687 return CW_Invalid;
52688 // Any SSE reg when ISA >= SSE2, same as 'x'
52689 case 'i':
52690 case 't':
52691 case '2':
52692 if (!Subtarget.hasSSE2())
52693 return CW_Invalid;
52694 break;
52695 }
52696 break;
52697 case 'v':
52698 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
52699 weight = CW_Register;
52700 LLVM_FALLTHROUGH;
52701 case 'x':
52702 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
52703 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
52704 weight = CW_Register;
52705 break;
52706 case 'k':
52707 // Enable conditional vector operations using %k<#> registers.
52708 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
52709 weight = CW_Register;
52710 break;
52711 case 'I':
52712 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
52713 if (C->getZExtValue() <= 31)
52714 weight = CW_Constant;
52715 }
52716 break;
52717 case 'J':
52718 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52719 if (C->getZExtValue() <= 63)
52720 weight = CW_Constant;
52721 }
52722 break;
52723 case 'K':
52724 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52725 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
52726 weight = CW_Constant;
52727 }
52728 break;
52729 case 'L':
52730 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52731 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
52732 weight = CW_Constant;
52733 }
52734 break;
52735 case 'M':
52736 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52737 if (C->getZExtValue() <= 3)
52738 weight = CW_Constant;
52739 }
52740 break;
52741 case 'N':
52742 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52743 if (C->getZExtValue() <= 0xff)
52744 weight = CW_Constant;
52745 }
52746 break;
52747 case 'G':
52748 case 'C':
52749 if (isa<ConstantFP>(CallOperandVal)) {
52750 weight = CW_Constant;
52751 }
52752 break;
52753 case 'e':
52754 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52755 if ((C->getSExtValue() >= -0x80000000LL) &&
52756 (C->getSExtValue() <= 0x7fffffffLL))
52757 weight = CW_Constant;
52758 }
52759 break;
52760 case 'Z':
52761 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52762 if (C->getZExtValue() <= 0xffffffff)
52763 weight = CW_Constant;
52764 }
52765 break;
52766 }
52767 return weight;
52768}
52769
52770/// Try to replace an X constraint, which matches anything, with another that
52771/// has more specific requirements based on the type of the corresponding
52772/// operand.
52773const char *X86TargetLowering::
52774LowerXConstraint(EVT ConstraintVT) const {
52775 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
52776 // 'f' like normal targets.
52777 if (ConstraintVT.isFloatingPoint()) {
52778 if (Subtarget.hasSSE1())
52779 return "x";
52780 }
52781
52782 return TargetLowering::LowerXConstraint(ConstraintVT);
52783}
52784
52785// Lower @cc targets via setcc.
52786SDValue X86TargetLowering::LowerAsmOutputForConstraint(
52787 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
52788 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
52789 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
52790 if (Cond == X86::COND_INVALID)
52791 return SDValue();
52792 // Check that return type is valid.
52793 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
52794 OpInfo.ConstraintVT.getSizeInBits() < 8)
52795 report_fatal_error("Flag output operand is of invalid type");
52796
52797 // Get EFLAGS register. Only update chain when copyfrom is glued.
52798 if (Flag.getNode()) {
52799 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
52800 Chain = Flag.getValue(1);
52801 } else
52802 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
52803 // Extract CC code.
52804 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
52805 // Extend to 32-bits
52806 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
52807
52808 return Result;
52809}
52810
52811/// Lower the specified operand into the Ops vector.
52812/// If it is invalid, don't add anything to Ops.
52813void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
52814 std::string &Constraint,
52815 std::vector<SDValue>&Ops,
52816 SelectionDAG &DAG) const {
52817 SDValue Result;
52818
52819 // Only support length 1 constraints for now.
52820 if (Constraint.length() > 1) return;
52821
52822 char ConstraintLetter = Constraint[0];
52823 switch (ConstraintLetter) {
52824 default: break;
52825 case 'I':
52826 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52827 if (C->getZExtValue() <= 31) {
52828 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52829 Op.getValueType());
52830 break;
52831 }
52832 }
52833 return;
52834 case 'J':
52835 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52836 if (C->getZExtValue() <= 63) {
52837 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52838 Op.getValueType());
52839 break;
52840 }
52841 }
52842 return;
52843 case 'K':
52844 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52845 if (isInt<8>(C->getSExtValue())) {
52846 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52847 Op.getValueType());
52848 break;
52849 }
52850 }
52851 return;
52852 case 'L':
52853 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52854 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
52855 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
52856 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
52857 Op.getValueType());
52858 break;
52859 }
52860 }
52861 return;
52862 case 'M':
52863 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52864 if (C->getZExtValue() <= 3) {
52865 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52866 Op.getValueType());
52867 break;
52868 }
52869 }
52870 return;
52871 case 'N':
52872 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52873 if (C->getZExtValue() <= 255) {
52874 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52875 Op.getValueType());
52876 break;
52877 }
52878 }
52879 return;
52880 case 'O':
52881 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52882 if (C->getZExtValue() <= 127) {
52883 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52884 Op.getValueType());
52885 break;
52886 }
52887 }
52888 return;
52889 case 'e': {
52890 // 32-bit signed value
52891 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52892 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
52893 C->getSExtValue())) {
52894 // Widen to 64 bits here to get it sign extended.
52895 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
52896 break;
52897 }
52898 // FIXME gcc accepts some relocatable values here too, but only in certain
52899 // memory models; it's complicated.
52900 }
52901 return;
52902 }
52903 case 'Z': {
52904 // 32-bit unsigned value
52905 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52906 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
52907 C->getZExtValue())) {
52908 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52909 Op.getValueType());
52910 break;
52911 }
52912 }
52913 // FIXME gcc accepts some relocatable values here too, but only in certain
52914 // memory models; it's complicated.
52915 return;
52916 }
52917 case 'i': {
52918 // Literal immediates are always ok.
52919 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
52920 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
52921 BooleanContent BCont = getBooleanContents(MVT::i64);
52922 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
52923 : ISD::SIGN_EXTEND;
52924 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
52925 : CST->getSExtValue();
52926 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
52927 break;
52928 }
52929
52930 // In any sort of PIC mode addresses need to be computed at runtime by
52931 // adding in a register or some sort of table lookup. These can't
52932 // be used as immediates.
52933 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
52934 return;
52935
52936 // If we are in non-pic codegen mode, we allow the address of a global (with
52937 // an optional displacement) to be used with 'i'.
52938 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
52939 // If we require an extra load to get this address, as in PIC mode, we
52940 // can't accept it.
52941 if (isGlobalStubReference(
52942 Subtarget.classifyGlobalReference(GA->getGlobal())))
52943 return;
52944 break;
52945 }
52946 }
52947
52948 if (Result.getNode()) {
52949 Ops.push_back(Result);
52950 return;
52951 }
52952 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
52953}
52954
52955/// Check if \p RC is a general purpose register class.
52956/// I.e., GR* or one of their variant.
52957static bool isGRClass(const TargetRegisterClass &RC) {
52958 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
52959 RC.hasSuperClassEq(&X86::GR16RegClass) ||
52960 RC.hasSuperClassEq(&X86::GR32RegClass) ||
52961 RC.hasSuperClassEq(&X86::GR64RegClass) ||
52962 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
52963}
52964
52965/// Check if \p RC is a vector register class.
52966/// I.e., FR* / VR* or one of their variant.
52967static bool isFRClass(const TargetRegisterClass &RC) {
52968 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
52969 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
52970 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
52971 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
52972 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
52973 RC.hasSuperClassEq(&X86::VR512RegClass);
52974}
52975
52976/// Check if \p RC is a mask register class.
52977/// I.e., VK* or one of their variant.
52978static bool isVKClass(const TargetRegisterClass &RC) {
52979 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
52980 RC.hasSuperClassEq(&X86::VK2RegClass) ||
52981 RC.hasSuperClassEq(&X86::VK4RegClass) ||
52982 RC.hasSuperClassEq(&X86::VK8RegClass) ||
52983 RC.hasSuperClassEq(&X86::VK16RegClass) ||
52984 RC.hasSuperClassEq(&X86::VK32RegClass) ||
52985 RC.hasSuperClassEq(&X86::VK64RegClass);
52986}
52987
52988std::pair<unsigned, const TargetRegisterClass *>
52989X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
52990 StringRef Constraint,
52991 MVT VT) const {
52992 // First, see if this is a constraint that directly corresponds to an LLVM
52993 // register class.
52994 if (Constraint.size() == 1) {
52995 // GCC Constraint Letters
52996 switch (Constraint[0]) {
52997 default: break;
52998 // 'A' means [ER]AX + [ER]DX.
52999 case 'A':
53000 if (Subtarget.is64Bit())
53001 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
53002 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
53003 "Expecting 64, 32 or 16 bit subtarget");
53004 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
53005
53006 // TODO: Slight differences here in allocation order and leaving
53007 // RIP in the class. Do they matter any more here than they do
53008 // in the normal allocation?
53009 case 'k':
53010 if (Subtarget.hasAVX512()) {
53011 if (VT == MVT::i1)
53012 return std::make_pair(0U, &X86::VK1RegClass);
53013 if (VT == MVT::i8)
53014 return std::make_pair(0U, &X86::VK8RegClass);
53015 if (VT == MVT::i16)
53016 return std::make_pair(0U, &X86::VK16RegClass);
53017 }
53018 if (Subtarget.hasBWI()) {
53019 if (VT == MVT::i32)
53020 return std::make_pair(0U, &X86::VK32RegClass);
53021 if (VT == MVT::i64)
53022 return std::make_pair(0U, &X86::VK64RegClass);
53023 }
53024 break;
53025 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
53026 if (Subtarget.is64Bit()) {
53027 if (VT == MVT::i8 || VT == MVT::i1)
53028 return std::make_pair(0U, &X86::GR8RegClass);
53029 if (VT == MVT::i16)
53030 return std::make_pair(0U, &X86::GR16RegClass);
53031 if (VT == MVT::i32 || VT == MVT::f32)
53032 return std::make_pair(0U, &X86::GR32RegClass);
53033 if (VT != MVT::f80 && !VT.isVector())
53034 return std::make_pair(0U, &X86::GR64RegClass);
53035 break;
53036 }
53037 LLVM_FALLTHROUGH;
53038 // 32-bit fallthrough
53039 case 'Q': // Q_REGS
53040 if (VT == MVT::i8 || VT == MVT::i1)
53041 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
53042 if (VT == MVT::i16)
53043 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
53044 if (VT == MVT::i32 || VT == MVT::f32 ||
53045 (!VT.isVector() && !Subtarget.is64Bit()))
53046 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
53047 if (VT != MVT::f80 && !VT.isVector())
53048 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
53049 break;
53050 case 'r': // GENERAL_REGS
53051 case 'l': // INDEX_REGS
53052 if (VT == MVT::i8 || VT == MVT::i1)
53053 return std::make_pair(0U, &X86::GR8RegClass);
53054 if (VT == MVT::i16)
53055 return std::make_pair(0U, &X86::GR16RegClass);
53056 if (VT == MVT::i32 || VT == MVT::f32 ||
53057 (!VT.isVector() && !Subtarget.is64Bit()))
53058 return std::make_pair(0U, &X86::GR32RegClass);
53059 if (VT != MVT::f80 && !VT.isVector())
53060 return std::make_pair(0U, &X86::GR64RegClass);
53061 break;
53062 case 'R': // LEGACY_REGS
53063 if (VT == MVT::i8 || VT == MVT::i1)
53064 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
53065 if (VT == MVT::i16)
53066 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
53067 if (VT == MVT::i32 || VT == MVT::f32 ||
53068 (!VT.isVector() && !Subtarget.is64Bit()))
53069 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
53070 if (VT != MVT::f80 && !VT.isVector())
53071 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
53072 break;
53073 case 'f': // FP Stack registers.
53074 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
53075 // value to the correct fpstack register class.
53076 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
53077 return std::make_pair(0U, &X86::RFP32RegClass);
53078 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
53079 return std::make_pair(0U, &X86::RFP64RegClass);
53080 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
53081 return std::make_pair(0U, &X86::RFP80RegClass);
53082 break;
53083 case 'y': // MMX_REGS if MMX allowed.
53084 if (!Subtarget.hasMMX()) break;
53085 return std::make_pair(0U, &X86::VR64RegClass);
53086 case 'v':
53087 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
53088 if (!Subtarget.hasSSE1()) break;
53089 bool VConstraint = (Constraint[0] == 'v');
53090
53091 switch (VT.SimpleTy) {
53092 default: break;
53093 // Scalar SSE types.
53094 case MVT::f16:
53095 if (VConstraint && Subtarget.hasFP16())
53096 return std::make_pair(0U, &X86::FR16XRegClass);
53097 break;
53098 case MVT::f32:
53099 case MVT::i32:
53100 if (VConstraint && Subtarget.hasVLX())
53101 return std::make_pair(0U, &X86::FR32XRegClass);
53102 return std::make_pair(0U, &X86::FR32RegClass);
53103 case MVT::f64:
53104 case MVT::i64:
53105 if (VConstraint && Subtarget.hasVLX())
53106 return std::make_pair(0U, &X86::FR64XRegClass);
53107 return std::make_pair(0U, &X86::FR64RegClass);
53108 case MVT::i128:
53109 if (Subtarget.is64Bit()) {
53110 if (VConstraint && Subtarget.hasVLX())
53111 return std::make_pair(0U, &X86::VR128XRegClass);
53112 return std::make_pair(0U, &X86::VR128RegClass);
53113 }
53114 break;
53115 // Vector types and fp128.
53116 case MVT::v8f16:
53117 if (!Subtarget.hasFP16())
53118 break;
53119 LLVM_FALLTHROUGH;
53120 case MVT::f128:
53121 case MVT::v16i8:
53122 case MVT::v8i16:
53123 case MVT::v4i32:
53124 case MVT::v2i64:
53125 case MVT::v4f32:
53126 case MVT::v2f64:
53127 if (VConstraint && Subtarget.hasVLX())
53128 return std::make_pair(0U, &X86::VR128XRegClass);
53129 return std::make_pair(0U, &X86::VR128RegClass);
53130 // AVX types.
53131 case MVT::v16f16:
53132 if (!Subtarget.hasFP16())
53133 break;
53134 LLVM_FALLTHROUGH;
53135 case MVT::v32i8:
53136 case MVT::v16i16:
53137 case MVT::v8i32:
53138 case MVT::v4i64:
53139 case MVT::v8f32:
53140 case MVT::v4f64:
53141 if (VConstraint && Subtarget.hasVLX())
53142 return std::make_pair(0U, &X86::VR256XRegClass);
53143 if (Subtarget.hasAVX())
53144 return std::make_pair(0U, &X86::VR256RegClass);
53145 break;
53146 case MVT::v32f16:
53147 if (!Subtarget.hasFP16())
53148 break;
53149 LLVM_FALLTHROUGH;
53150 case MVT::v64i8:
53151 case MVT::v32i16:
53152 case MVT::v8f64:
53153 case MVT::v16f32:
53154 case MVT::v16i32:
53155 case MVT::v8i64:
53156 if (!Subtarget.hasAVX512()) break;
53157 if (VConstraint)
53158 return std::make_pair(0U, &X86::VR512RegClass);
53159 return std::make_pair(0U, &X86::VR512_0_15RegClass);
53160 }
53161 break;
53162 }
53163 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
53164 switch (Constraint[1]) {
53165 default:
53166 break;
53167 case 'i':
53168 case 't':
53169 case '2':
53170 return getRegForInlineAsmConstraint(TRI, "x", VT);
53171 case 'm':
53172 if (!Subtarget.hasMMX()) break;
53173 return std::make_pair(0U, &X86::VR64RegClass);
53174 case 'z':
53175 if (!Subtarget.hasSSE1()) break;
53176 switch (VT.SimpleTy) {
53177 default: break;
53178 // Scalar SSE types.
53179 case MVT::f16:
53180 if (!Subtarget.hasFP16())
53181 break;
53182 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
53183 case MVT::f32:
53184 case MVT::i32:
53185 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
53186 case MVT::f64:
53187 case MVT::i64:
53188 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
53189 case MVT::v8f16:
53190 if (!Subtarget.hasFP16())
53191 break;
53192 LLVM_FALLTHROUGH;
53193 case MVT::f128:
53194 case MVT::v16i8:
53195 case MVT::v8i16:
53196 case MVT::v4i32:
53197 case MVT::v2i64:
53198 case MVT::v4f32:
53199 case MVT::v2f64:
53200 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
53201 // AVX types.
53202 case MVT::v16f16:
53203 if (!Subtarget.hasFP16())
53204 break;
53205 LLVM_FALLTHROUGH;
53206 case MVT::v32i8:
53207 case MVT::v16i16:
53208 case MVT::v8i32:
53209 case MVT::v4i64:
53210 case MVT::v8f32:
53211 case MVT::v4f64:
53212 if (Subtarget.hasAVX())
53213 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
53214 break;
53215 case MVT::v32f16:
53216 if (!Subtarget.hasFP16())
53217 break;
53218 LLVM_FALLTHROUGH;
53219 case MVT::v64i8:
53220 case MVT::v32i16:
53221 case MVT::v8f64:
53222 case MVT::v16f32:
53223 case MVT::v16i32:
53224 case MVT::v8i64:
53225 if (Subtarget.hasAVX512())
53226 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
53227 break;
53228 }
53229 break;
53230 case 'k':
53231 // This register class doesn't allocate k0 for masked vector operations.
53232 if (Subtarget.hasAVX512()) {
53233 if (VT == MVT::i1)
53234 return std::make_pair(0U, &X86::VK1WMRegClass);
53235 if (VT == MVT::i8)
53236 return std::make_pair(0U, &X86::VK8WMRegClass);
53237 if (VT == MVT::i16)
53238 return std::make_pair(0U, &X86::VK16WMRegClass);
53239 }
53240 if (Subtarget.hasBWI()) {
53241 if (VT == MVT::i32)
53242 return std::make_pair(0U, &X86::VK32WMRegClass);
53243 if (VT == MVT::i64)
53244 return std::make_pair(0U, &X86::VK64WMRegClass);
53245 }
53246 break;
53247 }
53248 }
53249
53250 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
53251 return std::make_pair(0U, &X86::GR32RegClass);
53252
53253 // Use the default implementation in TargetLowering to convert the register
53254 // constraint into a member of a register class.
53255 std::pair<Register, const TargetRegisterClass*> Res;
53256 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
53257
53258 // Not found as a standard register?
53259 if (!Res.second) {
53260 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
53261 // to/from f80.
53262 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
53263 // Map st(0) -> st(7) -> ST0
53264 if (Constraint.size() == 7 && Constraint[0] == '{' &&
53265 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
53266 Constraint[3] == '(' &&
53267 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
53268 Constraint[5] == ')' && Constraint[6] == '}') {
53269 // st(7) is not allocatable and thus not a member of RFP80. Return
53270 // singleton class in cases where we have a reference to it.
53271 if (Constraint[4] == '7')
53272 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
53273 return std::make_pair(X86::FP0 + Constraint[4] - '0',
53274 &X86::RFP80RegClass);
53275 }
53276
53277 // GCC allows "st(0)" to be called just plain "st".
53278 if (StringRef("{st}").equals_insensitive(Constraint))
53279 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
53280 }
53281
53282 // flags -> EFLAGS
53283 if (StringRef("{flags}").equals_insensitive(Constraint))
53284 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
53285
53286 // dirflag -> DF
53287 // Only allow for clobber.
53288 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
53289 VT == MVT::Other)
53290 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
53291
53292 // fpsr -> FPSW
53293 if (StringRef("{fpsr}").equals_insensitive(Constraint))
53294 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
53295
53296 return Res;
53297 }
53298
53299 // Make sure it isn't a register that requires 64-bit mode.
53300 if (!Subtarget.is64Bit() &&
53301 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
53302 TRI->getEncodingValue(Res.first) >= 8) {
53303 // Register requires REX prefix, but we're in 32-bit mode.
53304 return std::make_pair(0, nullptr);
53305 }
53306
53307 // Make sure it isn't a register that requires AVX512.
53308 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
53309 TRI->getEncodingValue(Res.first) & 0x10) {
53310 // Register requires EVEX prefix.
53311 return std::make_pair(0, nullptr);
53312 }
53313
53314 // Otherwise, check to see if this is a register class of the wrong value
53315 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
53316 // turn into {ax},{dx}.
53317 // MVT::Other is used to specify clobber names.
53318 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
53319 return Res; // Correct type already, nothing to do.
53320
53321 // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
53322 // return "eax". This should even work for things like getting 64-bit integer
53323 // registers when given an f64 type.
53324 const TargetRegisterClass *Class = Res.second;
53325 // The generic code will match the first register class that contains the
53326 // given register. Thus, based on the ordering of the tablegened file,
53327 // the "plain" GR classes might not come first.
53328 // Therefore, use a helper method.
53329 if (isGRClass(*Class)) {
53330 unsigned Size = VT.getSizeInBits();
53331 if (Size == 1) Size = 8;
53332 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
53333 if (DestReg > 0) {
53334 bool is64Bit = Subtarget.is64Bit();
53335 const TargetRegisterClass *RC =
53336 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
53337 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
53338 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
53339 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
53340 : nullptr;
53341 if (Size == 64 && !is64Bit) {
53342 // Model GCC's behavior here and select a fixed pair of 32-bit
53343 // registers.
53344 switch (DestReg) {
53345 case X86::RAX:
53346 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
53347 case X86::RDX:
53348 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
53349 case X86::RCX:
53350 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
53351 case X86::RBX:
53352 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
53353 case X86::RSI:
53354 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
53355 case X86::RDI:
53356 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
53357 case X86::RBP:
53358 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
53359 default:
53360 return std::make_pair(0, nullptr);
53361 }
53362 }
53363 if (RC && RC->contains(DestReg))
53364 return std::make_pair(DestReg, RC);
53365 return Res;
53366 }
53367 // No register found/type mismatch.
53368 return std::make_pair(0, nullptr);
53369 } else if (isFRClass(*Class)) {
53370 // Handle references to XMM physical registers that got mapped into the
53371 // wrong class. This can happen with constraints like {xmm0} where the
53372 // target independent register mapper will just pick the first match it can
53373 // find, ignoring the required type.
53374
53375 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
53376 if (VT == MVT::f16)
53377 Res.second = &X86::FR16XRegClass;
53378 else if (VT == MVT::f32 || VT == MVT::i32)
53379 Res.second = &X86::FR32XRegClass;
53380 else if (VT == MVT::f64 || VT == MVT::i64)
53381 Res.second = &X86::FR64XRegClass;
53382 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
53383 Res.second = &X86::VR128XRegClass;
53384 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
53385 Res.second = &X86::VR256XRegClass;
53386 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
53387 Res.second = &X86::VR512RegClass;
53388 else {
53389 // Type mismatch and not a clobber: return an error.
53390 Res.first = 0;
53391 Res.second = nullptr;
53392 }
53393 } else if (isVKClass(*Class)) {
53394 if (VT == MVT::i1)
53395 Res.second = &X86::VK1RegClass;
53396 else if (VT == MVT::i8)
53397 Res.second = &X86::VK8RegClass;
53398 else if (VT == MVT::i16)
53399 Res.second = &X86::VK16RegClass;
53400 else if (VT == MVT::i32)
53401 Res.second = &X86::VK32RegClass;
53402 else if (VT == MVT::i64)
53403 Res.second = &X86::VK64RegClass;
53404 else {
53405 // Type mismatch and not a clobber: return an error.
53406 Res.first = 0;
53407 Res.second = nullptr;
53408 }
53409 }
53410
53411 return Res;
53412}
53413
53414InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
53415 const AddrMode &AM,
53416 Type *Ty,
53417 unsigned AS) const {
53418 // Scaling factors are not free at all.
53419 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
53420 // will take 2 allocations in the out of order engine instead of 1
53421 // for plain addressing mode, i.e. inst (reg1).
53422 // E.g.,
53423 // vaddps (%rsi,%rdx), %ymm0, %ymm1
53424 // Requires two allocations (one for the load, one for the computation)
53425 // whereas:
53426 // vaddps (%rsi), %ymm0, %ymm1
53427 // Requires just 1 allocation, i.e., freeing allocations for other operations
53428 // and having fewer micro-operations to execute.
53429 //
53430 // For some X86 architectures, this is even worse because for instance for
53431 // stores, the complex addressing mode forces the instruction to use the
53432 // "load" ports instead of the dedicated "store" port.
53433 // E.g., on Haswell:
53434 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
53435 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
53436 if (isLegalAddressingMode(DL, AM, Ty, AS))
53437 // Scale represents reg2 * scale, thus account for 1
53438 // as soon as we use a second register.
53439 return AM.Scale != 0;
53440 return -1;
53441}
53442
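// [Editor's illustration, not part of the original file: the two invented
//  functions below typically fold their loads as "vaddss (%rdi), ..." and
//  "vaddss (%rdi,%rsi,4), ..." respectively; only the second uses a scaled
//  index, which the model above charges one extra unit for.]
static float addPlain(const float *P, float X) { return X + P[0]; }
static float addIndexed(const float *P, long I, float X) { return X + P[I]; }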
53443bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
53444 // Integer division on x86 is expensive. However, when aggressively optimizing
53445 // for code size, we prefer to use a div instruction, as it is usually smaller
53446 // than the alternative sequence.
53447 // The exception to this is vector division. Since x86 doesn't have vector
53448 // integer division, leaving the division as-is is a loss even in terms of
53449 // size, because it will have to be scalarized, while the alternative code
53450 // sequence can be performed in vector form.
53451 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
53452 return OptSize && !VT.isVector();
53453}
53454
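// [Editor's hedged sketch, not from the original file: division by a constant
//  is normally rewritten as a multiply by a "magic" reciprocal, but under
//  minsize the hook above keeps the single, smaller div instruction for the
//  scalar case. Names are invented; Clang's minsize attribute is assumed.]
static unsigned divByTen(unsigned A) { return A / 10u; }        // likely magic multiply
__attribute__((minsize)) static unsigned divByTenSmall(unsigned A) {
  return A / 10u;                                               // likely a plain div
}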
53455void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
53456 if (!Subtarget.is64Bit())
53457 return;
53458
53459 // Update IsSplitCSR in X86MachineFunctionInfo.
53460 X86MachineFunctionInfo *AFI =
53461 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
53462 AFI->setIsSplitCSR(true);
53463}
53464
53465void X86TargetLowering::insertCopiesSplitCSR(
53466 MachineBasicBlock *Entry,
53467 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
53468 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
53469 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
53470 if (!IStart)
53471 return;
53472
53473 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
53474 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
53475 MachineBasicBlock::iterator MBBI = Entry->begin();
53476 for (const MCPhysReg *I = IStart; *I; ++I) {
53477 const TargetRegisterClass *RC = nullptr;
53478 if (X86::GR64RegClass.contains(*I))
53479 RC = &X86::GR64RegClass;
53480 else
53481 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
53482
53483 Register NewVR = MRI->createVirtualRegister(RC);
53484 // Create copy from CSR to a virtual register.
53485 // FIXME: this currently does not emit CFI pseudo-instructions, it works
53486 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
53487 // nounwind. If we want to generalize this later, we may need to emit
53488 // CFI pseudo-instructions.
53489 assert(
53490 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
53491 "Function should be nounwind in insertCopiesSplitCSR!");
53492 Entry->addLiveIn(*I);
53493 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
53494 .addReg(*I);
53495
53496 // Insert the copy-back instructions right before the terminator.
53497 for (auto *Exit : Exits)
53498 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
53499 TII->get(TargetOpcode::COPY), *I)
53500 .addReg(NewVR);
53501 }
53502}
53503
53504bool X86TargetLowering::supportSwiftError() const {
53505 return Subtarget.is64Bit();
53506}
53507
53508/// Returns true if stack probing through a function call is requested.
53509bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
53510 return !getStackProbeSymbolName(MF).empty();
53511}
53512
53513/// Returns true if stack probing through inline assembly is requested.
53514bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
53515
53516 // No inline stack probe for Windows, they have their own mechanism.
53517 if (Subtarget.isOSWindows() ||
53518 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
53519 return false;
53520
53521 // If the function specifically requests inline stack probes, emit them.
53522 if (MF.getFunction().hasFnAttribute("probe-stack"))
53523 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
53524 "inline-asm";
53525
53526 return false;
53527}
53528
53529/// Returns the name of the symbol used to emit stack probes or the empty
53530/// string if not applicable.
53531StringRef
53532X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
53533 // Inline stack probes disable the stack probe call.
53534 if (hasInlineStackProbe(MF))
53535 return "";
53536
53537 // If the function specifically requests stack probes, emit them.
53538 if (MF.getFunction().hasFnAttribute("probe-stack"))
53539 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
53540
53541 // Generally, if we aren't on Windows, the platform ABI does not include
53542 // support for stack probes, so don't emit them.
53543 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
53544 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
53545 return "";
53546
53547 // We need a stack probe to conform to the Windows ABI. Choose the right
53548 // symbol.
53549 if (Subtarget.is64Bit())
53550 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
53551 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
53552}
53553
53554unsigned
53555X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
53556 // The default stack probe size is 4096 if the function has no
53557 // "stack-probe-size" attribute.
53558 unsigned StackProbeSize = 4096;
53559 const Function &Fn = MF.getFunction();
53560 if (Fn.hasFnAttribute("stack-probe-size"))
53561 Fn.getFnAttribute("stack-probe-size")
53562 .getValueAsString()
53563 .getAsInteger(0, StackProbeSize);
53564 return StackProbeSize;
53565}
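Both queries above read plain string function attributes. A minimal sketch of how a frontend or pass might attach them with the LLVM C++ API (requestInlineProbes is a hypothetical helper, not code from this file):

  #include "llvm/IR/Function.h"

  void requestInlineProbes(llvm::Function &F) {
    F.addFnAttr("probe-stack", "inline-asm"); // consulted by hasInlineStackProbe()
    F.addFnAttr("stack-probe-size", "8192");  // consulted by getStackProbeSize()
  }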
53566
53567Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
53568 if (ML->isInnermost() &&
53569 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
53570 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
53571 return TargetLowering::getPrefLoopAlignment();
53572}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/ADT/SmallBitVector.h

1//===- llvm/ADT/SmallBitVector.h - 'Normally small' bit vectors -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SmallBitVector class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_ADT_SMALLBITVECTOR_H
14#define LLVM_ADT_SMALLBITVECTOR_H
15
16#include "llvm/ADT/BitVector.h"
17#include "llvm/ADT/iterator_range.h"
18#include "llvm/Support/MathExtras.h"
19#include <algorithm>
20#include <cassert>
21#include <climits>
22#include <cstddef>
23#include <cstdint>
24#include <limits>
25#include <utility>
26
27namespace llvm {
28
29/// This is a 'bitvector' (really, a variable-sized bit array), optimized for
30/// the case when the array is small. It contains one pointer-sized field, which
31/// is directly used as a plain collection of bits when possible, or as a
32/// pointer to a larger heap-allocated array when necessary. This allows normal
33/// "small" cases to be fast without losing generality for large inputs.
34class SmallBitVector {
35 // TODO: In "large" mode, a pointer to a BitVector is used, leading to an
36 // unnecessary level of indirection. It would be more efficient to use a
37 // pointer to memory containing size, allocation size, and the array of bits.
38 uintptr_t X = 1;
39
40 enum {
41 // The number of bits in this class.
42 NumBaseBits = sizeof(uintptr_t) * CHAR_BIT,
43
44 // One bit is used to discriminate between small and large mode. The
45 // remaining bits are used for the small-mode representation.
46 SmallNumRawBits = NumBaseBits - 1,
47
48 // A few more bits are used to store the size of the bit set in small mode.
49 // Theoretically this is a ceil-log2. These bits are encoded in the most
50 // significant bits of the raw bits.
51 SmallNumSizeBits = (NumBaseBits == 32 ? 5 :
52 NumBaseBits == 64 ? 6 :
53 SmallNumRawBits),
54
55 // The remaining bits are used to store the actual set in small mode.
56 SmallNumDataBits = SmallNumRawBits - SmallNumSizeBits
57 };
58
59 static_assert(NumBaseBits == 64 || NumBaseBits == 32,
60 "Unsupported word size");
61
62public:
63 using size_type = uintptr_t;
64
65 // Encapsulation of a single bit.
66 class reference {
67 SmallBitVector &TheVector;
68 unsigned BitPos;
69
70 public:
71 reference(SmallBitVector &b, unsigned Idx) : TheVector(b), BitPos(Idx) {}
72
73 reference(const reference&) = default;
74
75 reference& operator=(reference t) {
76 *this = bool(t);
77 return *this;
78 }
79
80 reference& operator=(bool t) {
81 if (t)
25.1
't' is true
26
Taking true branch
82 TheVector.set(BitPos);
27
Calling 'SmallBitVector::set'
83 else
84 TheVector.reset(BitPos);
85 return *this;
86 }
87
88 operator bool() const {
89 return const_cast<const SmallBitVector &>(TheVector).operator[](BitPos);
90 }
91 };
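The reported path enters through this proxy: writing through operator[] forwards to set()/reset(), which is steps 25-27 above. A minimal usage sketch (setBit is a hypothetical helper):

  #include "llvm/ADT/SmallBitVector.h"

  void setBit(llvm::SmallBitVector &BV, unsigned I) {
    BV[I] = true; // reference::operator=(bool) -> SmallBitVector::set(I)
  }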
92
93private:
94 BitVector *getPointer() const {
95 assert(!isSmall());
96 return reinterpret_cast<BitVector *>(X);
97 }
98
99 void switchToSmall(uintptr_t NewSmallBits, size_type NewSize) {
100 X = 1;
101 setSmallSize(NewSize);
102 setSmallBits(NewSmallBits);
103 }
104
105 void switchToLarge(BitVector *BV) {
106 X = reinterpret_cast<uintptr_t>(BV);
107 assert(!isSmall() && "Tried to use an unaligned pointer");
108 }
109
110 // Return all the bits used for the "small" representation; this includes
111 // bits for the size as well as the element bits.
112 uintptr_t getSmallRawBits() const {
113 assert(isSmall());
114 return X >> 1;
115 }
116
117 void setSmallRawBits(uintptr_t NewRawBits) {
118 assert(isSmall());
119 X = (NewRawBits << 1) | uintptr_t(1);
120 }
32
Potential memory leak
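Reading the trace: the BitVector allocated at step 17 (line 339 in resize()) is reachable only through X, so overwriting X here would leak it. That path needs isSmall() to be true at step 28 even though switchToLarge() has just stored the heap pointer in X; the analyzer cannot exclude that case because it does not assume operator new returns an even (low-bit-clear) pointer, and the assert in switchToLarge() documenting that invariant is a no-op in an NDEBUG build. So this looks very much like a false positive rather than a real leak. A minimal sketch of the general pattern being flagged (Holder, adoptLarge, and storeSmall are hypothetical, merely mirroring the encoding used here):

  #include <cstdint>

  struct Holder {
    uintptr_t X = 1;                                                  // low bit set means "small" mode
    void adoptLarge(int *P) { X = reinterpret_cast<uintptr_t>(P); }   // store a heap pointer in X
    void storeSmall(uintptr_t Raw) { X = (Raw << 1) | uintptr_t(1); } // overwrites X; if X held the only
                                                                      // reference, that heap object leaks
  };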
121
122 // Return the size.
123 size_type getSmallSize() const {
124 return getSmallRawBits() >> SmallNumDataBits;
125 }
126
127 void setSmallSize(size_type Size) {
128 setSmallRawBits(getSmallBits() | (Size << SmallNumDataBits));
129 }
130
131 // Return the element bits.
132 uintptr_t getSmallBits() const {
133 return getSmallRawBits() & ~(~uintptr_t(0) << getSmallSize());
134 }
135
136 void setSmallBits(uintptr_t NewBits) {
137 setSmallRawBits((NewBits & ~(~uintptr_t(0) << getSmallSize())) |
31
Calling 'SmallBitVector::setSmallRawBits'
138 (getSmallSize() << SmallNumDataBits));
139 }
140
141public:
142 /// Creates an empty bitvector.
143 SmallBitVector() = default;
144
145 /// Creates a bitvector of specified number of bits. All bits are initialized
146 /// to the specified value.
147 explicit SmallBitVector(unsigned s, bool t = false) {
148 if (s <= SmallNumDataBits)
149 switchToSmall(t ? ~uintptr_t(0) : 0, s);
150 else
151 switchToLarge(new BitVector(s, t));
152 }
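A usage sketch of the two representations this constructor chooses between (demo() is illustrative only; the 57-bit small capacity assumes the 64-bit layout worked out earlier):

  #include "llvm/ADT/SmallBitVector.h"

  void demo() {
    llvm::SmallBitVector InWord(32, true); // fits in SmallNumDataBits: no allocation
    llvm::SmallBitVector OnHeap(128);      // too large: backed by a new'ed BitVector
    (void)InWord;
    (void)OnHeap;
  }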
153
154 /// SmallBitVector copy ctor.
155 SmallBitVector(const SmallBitVector &RHS) {
156 if (RHS.isSmall())
157 X = RHS.X;
158 else
159 switchToLarge(new BitVector(*RHS.getPointer()));
160 }
161
162 SmallBitVector(SmallBitVector &&RHS) : X(RHS.X) {
163 RHS.X = 1;
164 }
165
166 ~SmallBitVector() {
167 if (!isSmall())
168 delete getPointer();
169 }
170
171 using const_set_bits_iterator = const_set_bits_iterator_impl<SmallBitVector>;
172 using set_iterator = const_set_bits_iterator;
173
174 const_set_bits_iterator set_bits_begin() const {
175 return const_set_bits_iterator(*this);
176 }
177
178 const_set_bits_iterator set_bits_end() const {
179 return const_set_bits_iterator(*this, -1);
180 }
181
182 iterator_range<const_set_bits_iterator> set_bits() const {
183 return make_range(set_bits_begin(), set_bits_end());
184 }
185
186 bool isSmall() const { return X & uintptr_t(1); }
187
188 /// Tests whether there are no bits in this bitvector.
189 bool empty() const {
190 return isSmall() ? getSmallSize() == 0 : getPointer()->empty();
191 }
192
193 /// Returns the number of bits in this bitvector.
194 size_type size() const {
195 return isSmall() ? getSmallSize() : getPointer()->size();
196 }
197
198 /// Returns the number of bits which are set.
199 size_type count() const {
200 if (isSmall()) {
201 uintptr_t Bits = getSmallBits();
202 return countPopulation(Bits);
203 }
204 return getPointer()->count();
205 }
206
207 /// Returns true if any bit is set.
208 bool any() const {
209 if (isSmall())
210 return getSmallBits() != 0;
211 return getPointer()->any();
212 }
213
214 /// Returns true if all bits are set.
215 bool all() const {
216 if (isSmall())
217 return getSmallBits() == (uintptr_t(1) << getSmallSize()) - 1;
218 return getPointer()->all();
219 }
220
221 /// Returns true if none of the bits are set.
222 bool none() const {
223 if (isSmall())
224 return getSmallBits() == 0;
225 return getPointer()->none();
226 }
227
228 /// Returns the index of the first set bit, -1 if none of the bits are set.
229 int find_first() const {
230 if (isSmall()) {
231 uintptr_t Bits = getSmallBits();
232 if (Bits == 0)
233 return -1;
234 return countTrailingZeros(Bits);
235 }
236 return getPointer()->find_first();
237 }
238
239 int find_last() const {
240 if (isSmall()) {
241 uintptr_t Bits = getSmallBits();
242 if (Bits == 0)
243 return -1;
244 return NumBaseBits - countLeadingZeros(Bits) - 1;
245 }
246 return getPointer()->find_last();
247 }
248
249 /// Returns the index of the first unset bit, -1 if all of the bits are set.
250 int find_first_unset() const {
251 if (isSmall()) {
252 if (count() == getSmallSize())
253 return -1;
254
255 uintptr_t Bits = getSmallBits();
256 return countTrailingOnes(Bits);
257 }
258 return getPointer()->find_first_unset();
259 }
260
261 int find_last_unset() const {
262 if (isSmall()) {
263 if (count() == getSmallSize())
264 return -1;
265
266 uintptr_t Bits = getSmallBits();
267 // Set unused bits.
268 Bits |= ~uintptr_t(0) << getSmallSize();
269 return NumBaseBits - countLeadingOnes(Bits) - 1;
270 }
271 return getPointer()->find_last_unset();
272 }
273
274 /// Returns the index of the next set bit following the "Prev" bit.
275 /// Returns -1 if the next set bit is not found.
276 int find_next(unsigned Prev) const {
277 if (isSmall()) {
278 uintptr_t Bits = getSmallBits();
279 // Mask off previous bits.
280 Bits &= ~uintptr_t(0) << (Prev + 1);
281 if (Bits == 0 || Prev + 1 >= getSmallSize())
282 return -1;
283 return countTrailingZeros(Bits);
284 }
285 return getPointer()->find_next(Prev);
286 }
287
288 /// Returns the index of the next unset bit following the "Prev" bit.
289 /// Returns -1 if the next unset bit is not found.
290 int find_next_unset(unsigned Prev) const {
291 if (isSmall()) {
292 uintptr_t Bits = getSmallBits();
293 // Mask in previous bits.
294 Bits |= (uintptr_t(1) << (Prev + 1)) - 1;
295 // Mask in unused bits.
296 Bits |= ~uintptr_t(0) << getSmallSize();
297
298 if (Bits == ~uintptr_t(0) || Prev + 1 >= getSmallSize())
299 return -1;
300 return countTrailingOnes(Bits);
301 }
302 return getPointer()->find_next_unset(Prev);
303 }
304
305 /// find_prev - Returns the index of the first set bit that precedes
306 /// the bit at \p PriorTo. Returns -1 if all previous bits are unset.
307 int find_prev(unsigned PriorTo) const {
308 if (isSmall()) {
309 if (PriorTo == 0)
310 return -1;
311
312 --PriorTo;
313 uintptr_t Bits = getSmallBits();
314 Bits &= maskTrailingOnes<uintptr_t>(PriorTo + 1);
315 if (Bits == 0)
316 return -1;
317
318 return NumBaseBits - countLeadingZeros(Bits) - 1;
319 }
320 return getPointer()->find_prev(PriorTo);
321 }
322
323 /// Clear all bits.
324 void clear() {
325 if (!isSmall())
326 delete getPointer();
327 switchToSmall(0, 0);
328 }
329
330 /// Grow or shrink the bitvector.
331 void resize(unsigned N, bool t = false) {
332 if (!isSmall()) {
14
Assuming the condition is false
15
Taking false branch
333 getPointer()->resize(N, t);
334 } else if (SmallNumDataBits >= N) {
15.1
'N' is > SmallNumDataBits
16
Taking false branch
335 uintptr_t NewBits = t ? ~uintptr_t(0) << getSmallSize() : 0;
336 setSmallSize(N);
337 setSmallBits(NewBits | getSmallBits());
338 } else {
339 BitVector *BV = new BitVector(N, t);
17
Memory is allocated
340 uintptr_t OldBits = getSmallBits();
341 for (size_type I = 0, E = getSmallSize(); I != E; ++I)
18
Assuming 'I' is equal to 'E'
19
Loop condition is false. Execution continues on line 343
342 (*BV)[I] = (OldBits >> I) & 1;
343 switchToLarge(BV);
344 }
345 }
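The allocation reported at step 17 comes from the last branch above: growing past the in-word capacity copies the small bits into a freshly allocated BitVector and switches modes. A short sketch (grow() is illustrative):

  #include "llvm/ADT/SmallBitVector.h"

  void grow() {
    llvm::SmallBitVector BV(8); // small mode
    BV.set(3);
    BV.resize(128);             // > SmallNumDataBits: new BitVector, then switchToLarge()
  }                             // destructor deletes the heap-backed BitVector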
346
347 void reserve(unsigned N) {
348 if (isSmall()) {
349 if (N > SmallNumDataBits) {
350 uintptr_t OldBits = getSmallRawBits();
351 size_type SmallSize = getSmallSize();
352 BitVector *BV = new BitVector(SmallSize);
353 for (size_type I = 0; I < SmallSize; ++I)
354 if ((OldBits >> I) & 1)
355 BV->set(I);
356 BV->reserve(N);
357 switchToLarge(BV);
358 }
359 } else {
360 getPointer()->reserve(N);
361 }
362 }
363
364 // Set, reset, flip
365 SmallBitVector &set() {
366 if (isSmall())
367 setSmallBits(~uintptr_t(0));
368 else
369 getPointer()->set();
370 return *this;
371 }
372
373 SmallBitVector &set(unsigned Idx) {
374 if (isSmall()) {
28
Assuming the condition is true
29
Taking true branch
375 assert(Idx <= static_cast<unsigned>(
376 std::numeric_limits<uintptr_t>::digits) &&
377 "undefined behavior");
378 setSmallBits(getSmallBits() | (uintptr_t(1) << Idx));
30
Calling 'SmallBitVector::setSmallBits'
379 }
380 else
381 getPointer()->set(Idx);
382 return *this;
383 }
384
385 /// Efficiently set a range of bits in [I, E)
386 SmallBitVector &set(unsigned I, unsigned E) {
387 assert(I <= E && "Attempted to set backwards range!");
388 assert(E <= size() && "Attempted to set out-of-bounds range!");
389 if (I == E) return *this;
390 if (isSmall()) {
391 uintptr_t EMask = ((uintptr_t)1) << E;
392 uintptr_t IMask = ((uintptr_t)1) << I;
393 uintptr_t Mask = EMask - IMask;
394 setSmallBits(getSmallBits() | Mask);
395 } else
396 getPointer()->set(I, E);
397 return *this;
398 }
399
400 SmallBitVector &reset() {
401 if (isSmall())
402 setSmallBits(0);
403 else
404 getPointer()->reset();
405 return *this;
406 }
407
408 SmallBitVector &reset(unsigned Idx) {
409 if (isSmall())
410 setSmallBits(getSmallBits() & ~(uintptr_t(1) << Idx));
411 else
412 getPointer()->reset(Idx);
413 return *this;
414 }
415
416 /// Efficiently reset a range of bits in [I, E)
417 SmallBitVector &reset(unsigned I, unsigned E) {
418 assert(I <= E && "Attempted to reset backwards range!");
419 assert(E <= size() && "Attempted to reset out-of-bounds range!");
420 if (I == E) return *this;
421 if (isSmall()) {
422 uintptr_t EMask = ((uintptr_t)1) << E;
423 uintptr_t IMask = ((uintptr_t)1) << I;
424 uintptr_t Mask = EMask - IMask;
425 setSmallBits(getSmallBits() & ~Mask);
426 } else
427 getPointer()->reset(I, E);
428 return *this;
429 }
430
431 SmallBitVector &flip() {
432 if (isSmall())
433 setSmallBits(~getSmallBits());
434 else
435 getPointer()->flip();
436 return *this;
437 }
438
439 SmallBitVector &flip(unsigned Idx) {
440 if (isSmall())
441 setSmallBits(getSmallBits() ^ (uintptr_t(1) << Idx));
442 else
443 getPointer()->flip(Idx);
444 return *this;
445 }
446
447 // No argument flip.
448 SmallBitVector operator~() const {
449 return SmallBitVector(*this).flip();
450 }
451
452 // Indexing.
453 reference operator[](unsigned Idx) {
454 assert(Idx < size() && "Out-of-bounds Bit access.");
455 return reference(*this, Idx);
456 }
457
458 bool operator[](unsigned Idx) const {
459 assert(Idx < size() && "Out-of-bounds Bit access.");
460 if (isSmall())
461 return ((getSmallBits() >> Idx) & 1) != 0;
462 return getPointer()->operator[](Idx);
463 }
464
465 bool test(unsigned Idx) const {
466 return (*this)[Idx];
467 }
468
469 // Push single bit to end of vector.
470 void push_back(bool Val) {
471 resize(size() + 1, Val);
472 }
473
474 /// Test if any common bits are set.
475 bool anyCommon(const SmallBitVector &RHS) const {
476 if (isSmall() && RHS.isSmall())
477 return (getSmallBits() & RHS.getSmallBits()) != 0;
478 if (!isSmall() && !RHS.isSmall())
479 return getPointer()->anyCommon(*RHS.getPointer());
480
481 for (unsigned i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
482 if (test(i) && RHS.test(i))
483 return true;
484 return false;
485 }
486
487 // Comparison operators.
488 bool operator==(const SmallBitVector &RHS) const {
489 if (size() != RHS.size())
490 return false;
491 if (isSmall() && RHS.isSmall())
492 return getSmallBits() == RHS.getSmallBits();
493 else if (!isSmall() && !RHS.isSmall())
494 return *getPointer() == *RHS.getPointer();
495 else {
496 for (size_type I = 0, E = size(); I != E; ++I) {
497 if ((*this)[I] != RHS[I])
498 return false;
499 }
500 return true;
501 }
502 }
503
504 bool operator!=(const SmallBitVector &RHS) const {
505 return !(*this == RHS);
506 }
507
508 // Intersection, union, disjoint union.
509 // FIXME BitVector::operator&= does not resize the LHS but this does
510 SmallBitVector &operator&=(const SmallBitVector &RHS) {
511 resize(std::max(size(), RHS.size()));
512 if (isSmall() && RHS.isSmall())
513 setSmallBits(getSmallBits() & RHS.getSmallBits());
514 else if (!isSmall() && !RHS.isSmall())
515 getPointer()->operator&=(*RHS.getPointer());
516 else {
517 size_type I, E;
518 for (I = 0, E = std::min(size(), RHS.size()); I != E; ++I)
519 (*this)[I] = test(I) && RHS.test(I);
520 for (E = size(); I != E; ++I)
521 reset(I);
522 }
523 return *this;
524 }
525
526 /// Reset bits that are set in RHS. Same as *this &= ~RHS.
527 SmallBitVector &reset(const SmallBitVector &RHS) {
528 if (isSmall() && RHS.isSmall())
529 setSmallBits(getSmallBits() & ~RHS.getSmallBits());
530 else if (!isSmall() && !RHS.isSmall())
531 getPointer()->reset(*RHS.getPointer());
532 else
533 for (unsigned i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
534 if (RHS.test(i))
535 reset(i);
536
537 return *this;
538 }
539
540 /// Check if (This - RHS) is nonzero. This is the same as reset(RHS) and any().
541 bool test(const SmallBitVector &RHS) const {
542 if (isSmall() && RHS.isSmall())
543 return (getSmallBits() & ~RHS.getSmallBits()) != 0;
544 if (!isSmall() && !RHS.isSmall())
545 return getPointer()->test(*RHS.getPointer());
546
547 unsigned i, e;
548 for (i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
549 if (test(i) && !RHS.test(i))
550 return true;
551
552 for (e = size(); i != e; ++i)
553 if (test(i))
554 return true;
555
556 return false;
557 }
558
559 SmallBitVector &operator|=(const SmallBitVector &RHS) {
560 resize(std::max(size(), RHS.size()));
561 if (isSmall() && RHS.isSmall())
562 setSmallBits(getSmallBits() | RHS.getSmallBits());
563 else if (!isSmall() && !RHS.isSmall())
564 getPointer()->operator|=(*RHS.getPointer());
565 else {
566 for (size_type I = 0, E = RHS.size(); I != E; ++I)
567 (*this)[I] = test(I) || RHS.test(I);
568 }
569 return *this;
570 }
571
572 SmallBitVector &operator^=(const SmallBitVector &RHS) {
573 resize(std::max(size(), RHS.size()));
574 if (isSmall() && RHS.isSmall())
575 setSmallBits(getSmallBits() ^ RHS.getSmallBits());
576 else if (!isSmall() && !RHS.isSmall())
577 getPointer()->operator^=(*RHS.getPointer());
578 else {
579 for (size_type I = 0, E = RHS.size(); I != E; ++I)
580 (*this)[I] = test(I) != RHS.test(I);
581 }
582 return *this;
583 }
584
585 SmallBitVector &operator<<=(unsigned N) {
586 if (isSmall())
587 setSmallBits(getSmallBits() << N);
588 else
589 getPointer()->operator<<=(N);
590 return *this;
591 }
592
593 SmallBitVector &operator>>=(unsigned N) {
594 if (isSmall())
595 setSmallBits(getSmallBits() >> N);
596 else
597 getPointer()->operator>>=(N);
598 return *this;
599 }
600
601 // Assignment operator.
602 const SmallBitVector &operator=(const SmallBitVector &RHS) {
603 if (isSmall()) {
604 if (RHS.isSmall())
605 X = RHS.X;
606 else
607 switchToLarge(new BitVector(*RHS.getPointer()));
608 } else {
609 if (!RHS.isSmall())
610 *getPointer() = *RHS.getPointer();
611 else {
612 delete getPointer();
613 X = RHS.X;
614 }
615 }
616 return *this;
617 }
618
619 const SmallBitVector &operator=(SmallBitVector &&RHS) {
620 if (this != &RHS) {
621 clear();
622 swap(RHS);
623 }
624 return *this;
625 }
626
627 void swap(SmallBitVector &RHS) {
628 std::swap(X, RHS.X);
629 }
630
631 /// Add '1' bits from Mask to this vector. Don't resize.
632 /// This computes "*this |= Mask".
633 void setBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
634 if (isSmall())
635 applyMask<true, false>(Mask, MaskWords);
636 else
637 getPointer()->setBitsInMask(Mask, MaskWords);
638 }
639
640 /// Clear any bits in this vector that are set in Mask. Don't resize.
641 /// This computes "*this &= ~Mask".
642 void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
643 if (isSmall())
644 applyMask<false, false>(Mask, MaskWords);
645 else
646 getPointer()->clearBitsInMask(Mask, MaskWords);
647 }
648
649 /// Add a bit to this vector for every '0' bit in Mask. Don't resize.
650 /// This computes "*this |= ~Mask".
651 void setBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
652 if (isSmall())
653 applyMask<true, true>(Mask, MaskWords);
654 else
655 getPointer()->setBitsNotInMask(Mask, MaskWords);
656 }
657
658 /// Clear a bit in this vector for every '0' bit in Mask. Don't resize.
659 /// This computes "*this &= Mask".
660 void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
661 if (isSmall())
662 applyMask<false, true>(Mask, MaskWords);
663 else
664 getPointer()->clearBitsNotInMask(Mask, MaskWords);
665 }
666
667 void invalid() {
668 assert(empty());
669 X = (uintptr_t)-1;
670 }
671 bool isInvalid() const { return X == (uintptr_t)-1; }
672
673 ArrayRef<uintptr_t> getData(uintptr_t &Store) const {
674 if (!isSmall())
675 return getPointer()->getData();
676 Store = getSmallBits();
677 return makeArrayRef(Store);
678 }
679
680private:
681 template <bool AddBits, bool InvertMask>
682 void applyMask(const uint32_t *Mask, unsigned MaskWords) {
683 assert(MaskWords <= sizeof(uintptr_t) && "Mask is larger than base!");
684 uintptr_t M = Mask[0];
685 if (NumBaseBits == 64)
686 M |= uint64_t(Mask[1]) << 32;
687 if (InvertMask)
688 M = ~M;
689 if (AddBits)
690 setSmallBits(getSmallBits() | M);
691 else
692 setSmallBits(getSmallBits() & ~M);
693 }
694};
695
696inline SmallBitVector
697operator&(const SmallBitVector &LHS, const SmallBitVector &RHS) {
698 SmallBitVector Result(LHS);
699 Result &= RHS;
700 return Result;
701}
702
703inline SmallBitVector
704operator|(const SmallBitVector &LHS, const SmallBitVector &RHS) {
705 SmallBitVector Result(LHS);
706 Result |= RHS;
707 return Result;
708}
709
710inline SmallBitVector
711operator^(const SmallBitVector &LHS, const SmallBitVector &RHS) {
712 SmallBitVector Result(LHS);
713 Result ^= RHS;
714 return Result;
715}
716
717template <> struct DenseMapInfo<SmallBitVector> {
718 static inline SmallBitVector getEmptyKey() { return SmallBitVector(); }
719 static inline SmallBitVector getTombstoneKey() {
720 SmallBitVector V;
721 V.invalid();
722 return V;
723 }
724 static unsigned getHashValue(const SmallBitVector &V) {
725 uintptr_t Store;
726 return DenseMapInfo<
727 std::pair<SmallBitVector::size_type, ArrayRef<uintptr_t>>>::
728 getHashValue(std::make_pair(V.size(), V.getData(Store)));
729 }
730 static bool isEqual(const SmallBitVector &LHS, const SmallBitVector &RHS) {
731 if (LHS.isInvalid() || RHS.isInvalid())
732 return LHS.isInvalid() == RHS.isInvalid();
733 return LHS == RHS;
734 }
735};
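With this specialization a SmallBitVector can key a DenseMap directly; the hash combines size() with the underlying words from getData(). One caveat: the default-constructed (empty) vector serves as the reserved empty key, so real keys should be non-empty. A usage sketch (countShape is a hypothetical helper):

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/ADT/SmallBitVector.h"

  unsigned countShape(llvm::DenseMap<llvm::SmallBitVector, unsigned> &Seen,
                      const llvm::SmallBitVector &Key) {
    return ++Seen[Key]; // Key must be non-empty and not the "invalid" tombstone
  }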
736} // end namespace llvm
737
738namespace std {
739
740/// Implement std::swap in terms of SmallBitVector swap.
741inline void
742swap(llvm::SmallBitVector &LHS, llvm::SmallBitVector &RHS) {
743 LHS.swap(RHS);
744}
745
746} // end namespace std
747
748#endif // LLVM_ADT_SMALLBITVECTOR_H