Bug Summary

File: build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 16986, column 31
Division by zero

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/X86 -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86 -I include -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-04-20-140412-16051-1 -x c++ /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
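// Aside (illustrative sketch added for this report, not part of the LLVM
// source): callers later in this file use the helper above to report a
// problem and then keep lowering with a best-effort value instead of
// aborting. Roughly, and with a hypothetical condition:
//
//   if (ReturnValueNeedsXMM && !Subtarget.hasSSE1())   // hypothetical check
//     errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
//   // ...lowering then continues rather than crashing.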
106
107X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
108 const X86Subtarget &STI)
109 : TargetLowering(TM), Subtarget(STI) {
110 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
111 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
112
113 // Set up the TargetLowering object.
114
115 // X86 is weird. It always uses i8 for shift amounts and setcc results.
116 setBooleanContents(ZeroOrOneBooleanContent);
117 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
118 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
119
120 // For 64-bit, since we have so many registers, use the ILP scheduler.
121 // For 32-bit, use the register pressure specific scheduling.
122 // For Atom, always use ILP scheduling.
123 if (Subtarget.isAtom())
124 setSchedulingPreference(Sched::ILP);
125 else if (Subtarget.is64Bit())
126 setSchedulingPreference(Sched::ILP);
127 else
128 setSchedulingPreference(Sched::RegPressure);
129 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
130 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
131
132 // Bypass expensive divides and use cheaper ones.
133 if (TM.getOptLevel() >= CodeGenOpt::Default) {
134 if (Subtarget.hasSlowDivide32())
135 addBypassSlowDiv(32, 8);
136 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
137 addBypassSlowDiv(64, 32);
138 }
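// Aside (illustrative note added for this report, not part of the LLVM
// source): addBypassSlowDiv(32, 8) asks the generic BypassSlowDivision
// transform to guard each 32-bit divide with a cheap width test, roughly:
//
//   if (((Dividend | Divisor) >> 8) == 0)          // both fit in 8 bits
//     Q = (uint8_t)Dividend / (uint8_t)Divisor;    // fast narrow divide
//   else
//     Q = Dividend / Divisor;                      // slow full-width divide
//
// which pays off on subtargets where the wide div/idiv is very slow.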
139
140 // Setup Windows compiler runtime calls.
141 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
142 static const struct {
143 const RTLIB::Libcall Op;
144 const char * const Name;
145 const CallingConv::ID CC;
146 } LibraryCalls[] = {
147 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
148 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
149 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
150 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
151 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
152 };
153
154 for (const auto &LC : LibraryCalls) {
155 setLibcallName(LC.Op, LC.Name);
156 setLibcallCallingConv(LC.Op, LC.CC);
157 }
158 }
159
160 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
161 // MSVCRT doesn't have powi; fall back to pow
162 setLibcallName(RTLIB::POWI_F32, nullptr);
163 setLibcallName(RTLIB::POWI_F64, nullptr);
164 }
165
 166 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
 167 // 32 bits so that AtomicExpandPass will expand it and we don't need cmpxchg8b.
168 // FIXME: Should we be limiting the atomic size on other configs? Default is
169 // 1024.
170 if (!Subtarget.canUseCMPXCHG8B())
171 setMaxAtomicSizeInBitsSupported(32);
172
173 // Set up the register classes.
174 addRegisterClass(MVT::i8, &X86::GR8RegClass);
175 addRegisterClass(MVT::i16, &X86::GR16RegClass);
176 addRegisterClass(MVT::i32, &X86::GR32RegClass);
177 if (Subtarget.is64Bit())
178 addRegisterClass(MVT::i64, &X86::GR64RegClass);
179
180 for (MVT VT : MVT::integer_valuetypes())
181 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
182
183 // We don't accept any truncstore of integer registers.
184 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
185 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
186 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
187 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
188 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
189 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
190
191 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
192
193 // SETOEQ and SETUNE require checking two conditions.
194 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
195 setCondCodeAction(ISD::SETOEQ, VT, Expand);
196 setCondCodeAction(ISD::SETUNE, VT, Expand);
197 }
198
199 // Integer absolute.
200 if (Subtarget.canUseCMOV()) {
201 setOperationAction(ISD::ABS , MVT::i16 , Custom);
202 setOperationAction(ISD::ABS , MVT::i32 , Custom);
203 if (Subtarget.is64Bit())
204 setOperationAction(ISD::ABS , MVT::i64 , Custom);
205 }
206
207 // Signed saturation subtraction.
208 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
209 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
210 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
211 if (Subtarget.is64Bit())
212 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
213
214 // Funnel shifts.
215 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
216 // For slow shld targets we only lower for code size.
217 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
218
219 setOperationAction(ShiftOp , MVT::i8 , Custom);
220 setOperationAction(ShiftOp , MVT::i16 , Custom);
221 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
222 if (Subtarget.is64Bit())
223 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
224 }
225
226 if (!Subtarget.useSoftFloat()) {
227 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
228 // operation.
229 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
230 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
231 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
232 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
233 // We have an algorithm for SSE2, and we turn this into a 64-bit
234 // FILD or VCVTUSI2SS/SD for other targets.
235 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
236 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
237 // We have an algorithm for SSE2->double, and we turn this into a
238 // 64-bit FILD followed by conditional FADD for other targets.
239 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
240 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
241
242 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
243 // this operation.
244 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
245 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
246 // SSE has no i16 to fp conversion, only i32. We promote in the handler
247 // to allow f80 to use i16 and f64 to use i16 with sse1 only
248 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
249 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
250 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
251 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
252 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
253 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
254 // are Legal, f80 is custom lowered.
255 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
256 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
257
258 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
259 // this operation.
260 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
261 // FIXME: This doesn't generate invalid exception when it should. PR44019.
262 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
263 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
264 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
265 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
266 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
267 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
268 // are Legal, f80 is custom lowered.
269 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
270 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
271
272 // Handle FP_TO_UINT by promoting the destination to a larger signed
273 // conversion.
274 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
275 // FIXME: This doesn't generate invalid exception when it should. PR44019.
276 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
277 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
278 // FIXME: This doesn't generate invalid exception when it should. PR44019.
279 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
280 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
281 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
282 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
283 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
284
285 setOperationAction(ISD::LRINT, MVT::f32, Custom);
286 setOperationAction(ISD::LRINT, MVT::f64, Custom);
287 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
288 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
289
290 if (!Subtarget.is64Bit()) {
291 setOperationAction(ISD::LRINT, MVT::i64, Custom);
292 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
293 }
294 }
295
296 if (Subtarget.hasSSE2()) {
297 // Custom lowering for saturating float to int conversions.
298 // We handle promotion to larger result types manually.
299 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
300 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
301 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
302 }
303 if (Subtarget.is64Bit()) {
304 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
305 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
306 }
307 }
308
309 // Handle address space casts between mixed sized pointers.
310 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
311 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
312
313 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
314 if (!Subtarget.hasSSE2()) {
315 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
316 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
317 if (Subtarget.is64Bit()) {
318 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
319 // Without SSE, i64->f64 goes through memory.
320 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
321 }
322 } else if (!Subtarget.is64Bit())
323 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
324
325 // Scalar integer divide and remainder are lowered to use operations that
326 // produce two results, to match the available instructions. This exposes
327 // the two-result form to trivial CSE, which is able to combine x/y and x%y
328 // into a single instruction.
329 //
330 // Scalar integer multiply-high is also lowered to use two-result
331 // operations, to match the available instructions. However, plain multiply
332 // (low) operations are left as Legal, as there are single-result
333 // instructions for this in x86. Using the two-result multiply instructions
334 // when both high and low results are needed must be arranged by dagcombine.
335 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
336 setOperationAction(ISD::MULHS, VT, Expand);
337 setOperationAction(ISD::MULHU, VT, Expand);
338 setOperationAction(ISD::SDIV, VT, Expand);
339 setOperationAction(ISD::UDIV, VT, Expand);
340 setOperationAction(ISD::SREM, VT, Expand);
341 setOperationAction(ISD::UREM, VT, Expand);
342 }
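// Aside (illustrative note added for this report, not part of the LLVM
// source): as the comment above explains, with SDIV/SREM set to Expand,
// source code that needs both results, e.g.
//
//   int Q = X / Y;
//   int R = X % Y;
//
// is legalized to a single two-result ISD::SDIVREM node, which matches the
// x86 idiv instruction (quotient in EAX, remainder in EDX), so the divide
// executes only once.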
343
344 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
345 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
346 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
347 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
348 setOperationAction(ISD::BR_CC, VT, Expand);
349 setOperationAction(ISD::SELECT_CC, VT, Expand);
350 }
351 if (Subtarget.is64Bit())
352 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
353 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
354 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
356
357 setOperationAction(ISD::FREM , MVT::f32 , Expand);
358 setOperationAction(ISD::FREM , MVT::f64 , Expand);
359 setOperationAction(ISD::FREM , MVT::f80 , Expand);
360 setOperationAction(ISD::FREM , MVT::f128 , Expand);
361
362 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
363 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
364 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
365 }
366
367 // Promote the i8 variants and force them on up to i32 which has a shorter
368 // encoding.
369 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
370 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
371
372 if (Subtarget.hasBMI()) {
373 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
374 // is enabled.
375 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
376 } else {
377 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
378 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
379 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
380 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
381 if (Subtarget.is64Bit()) {
382 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
383 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
384 }
385 }
386
387 if (Subtarget.hasLZCNT()) {
388 // When promoting the i8 variants, force them to i32 for a shorter
389 // encoding.
390 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
391 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
392 } else {
393 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
394 if (VT == MVT::i64 && !Subtarget.is64Bit())
395 continue;
396 setOperationAction(ISD::CTLZ , VT, Custom);
397 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
398 }
399 }
400
401 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
402 ISD::STRICT_FP_TO_FP16}) {
403 // Special handling for half-precision floating point conversions.
404 // If we don't have F16C support, then lower half float conversions
405 // into library calls.
406 setOperationAction(
407 Op, MVT::f32,
408 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
409 // There's never any support for operations beyond MVT::f32.
410 setOperationAction(Op, MVT::f64, Expand);
411 setOperationAction(Op, MVT::f80, Expand);
412 setOperationAction(Op, MVT::f128, Expand);
413 }
414
415 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
416 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
417 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
418 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
419 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
420 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
421 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
423
424 setOperationAction(ISD::PARITY, MVT::i8, Custom);
425 setOperationAction(ISD::PARITY, MVT::i16, Custom);
426 setOperationAction(ISD::PARITY, MVT::i32, Custom);
427 if (Subtarget.is64Bit())
428 setOperationAction(ISD::PARITY, MVT::i64, Custom);
429 if (Subtarget.hasPOPCNT()) {
430 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
431 // popcntw is longer to encode than popcntl and also has a false dependency
432 // on the dest that popcntl hasn't had since Cannon Lake.
433 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
434 } else {
435 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
436 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
437 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
438 if (Subtarget.is64Bit())
439 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
440 else
441 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
442 }
443
444 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
445
446 if (!Subtarget.hasMOVBE())
447 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
448
449 // X86 wants to expand cmov itself.
450 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451 setOperationAction(ISD::SELECT, VT, Custom);
452 setOperationAction(ISD::SETCC, VT, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455 }
456 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SELECT, VT, Custom);
460 setOperationAction(ISD::SETCC, VT, Custom);
461 }
462
463 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
464 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466
467 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
468 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475
476 // Darwin ABI issue.
477 for (auto VT : { MVT::i32, MVT::i64 }) {
478 if (VT == MVT::i64 && !Subtarget.is64Bit())
479 continue;
480 setOperationAction(ISD::ConstantPool , VT, Custom);
481 setOperationAction(ISD::JumpTable , VT, Custom);
482 setOperationAction(ISD::GlobalAddress , VT, Custom);
483 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484 setOperationAction(ISD::ExternalSymbol , VT, Custom);
485 setOperationAction(ISD::BlockAddress , VT, Custom);
486 }
487
488 // 64-bit shl, sra, srl (iff 32-bit x86)
489 for (auto VT : { MVT::i32, MVT::i64 }) {
490 if (VT == MVT::i64 && !Subtarget.is64Bit())
491 continue;
492 setOperationAction(ISD::SHL_PARTS, VT, Custom);
493 setOperationAction(ISD::SRA_PARTS, VT, Custom);
494 setOperationAction(ISD::SRL_PARTS, VT, Custom);
495 }
496
497 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
498 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
499
500 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
501
502 // Expand certain atomics
503 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511 }
512
513 if (!Subtarget.is64Bit())
514 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515
516 if (Subtarget.canUseCMPXCHG16B())
517 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518
519 // FIXME - use subtarget debug flags
520 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
521 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
522 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
523 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
524 }
525
526 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
527 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
528
529 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
530 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
531
532 setOperationAction(ISD::TRAP, MVT::Other, Legal);
533 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
534 if (Subtarget.getTargetTriple().isPS4())
535 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
536 else
537 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
538
539 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
540 setOperationAction(ISD::VASTART , MVT::Other, Custom);
541 setOperationAction(ISD::VAEND , MVT::Other, Expand);
542 bool Is64Bit = Subtarget.is64Bit();
543 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
544 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
545
546 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
547 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
548
549 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
550
551 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
552 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
553 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
554
555 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
556 // f32 and f64 use SSE.
557 // Set up the FP register classes.
558 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
559 : &X86::FR32RegClass);
560 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
561 : &X86::FR64RegClass);
562
563 // Disable f32->f64 extload as we can only generate this in one instruction
 564 // under optsize. So it's easier to pattern match (fpext (load)) for that
565 // case instead of needing to emit 2 instructions for extload in the
566 // non-optsize case.
567 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
568
569 for (auto VT : { MVT::f32, MVT::f64 }) {
570 // Use ANDPD to simulate FABS.
571 setOperationAction(ISD::FABS, VT, Custom);
572
573 // Use XORP to simulate FNEG.
574 setOperationAction(ISD::FNEG, VT, Custom);
575
576 // Use ANDPD and ORPD to simulate FCOPYSIGN.
577 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
578
579 // These might be better off as horizontal vector ops.
580 setOperationAction(ISD::FADD, VT, Custom);
581 setOperationAction(ISD::FSUB, VT, Custom);
582
583 // We don't support sin/cos/fmod
584 setOperationAction(ISD::FSIN , VT, Expand);
585 setOperationAction(ISD::FCOS , VT, Expand);
586 setOperationAction(ISD::FSINCOS, VT, Expand);
587 }
588
589 // Lower this to MOVMSK plus an AND.
590 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
591 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
592
593 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
594 (UseX87 || Is64Bit)) {
595 // Use SSE for f32, x87 for f64.
596 // Set up the FP register classes.
597 addRegisterClass(MVT::f32, &X86::FR32RegClass);
598 if (UseX87)
599 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
600
601 // Use ANDPS to simulate FABS.
602 setOperationAction(ISD::FABS , MVT::f32, Custom);
603
604 // Use XORP to simulate FNEG.
605 setOperationAction(ISD::FNEG , MVT::f32, Custom);
606
607 if (UseX87)
608 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
609
610 // Use ANDPS and ORPS to simulate FCOPYSIGN.
611 if (UseX87)
612 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
613 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
614
615 // We don't support sin/cos/fmod
616 setOperationAction(ISD::FSIN , MVT::f32, Expand);
617 setOperationAction(ISD::FCOS , MVT::f32, Expand);
618 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
619
620 if (UseX87) {
621 // Always expand sin/cos functions even though x87 has an instruction.
622 setOperationAction(ISD::FSIN, MVT::f64, Expand);
623 setOperationAction(ISD::FCOS, MVT::f64, Expand);
624 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
625 }
626 } else if (UseX87) {
627 // f32 and f64 in x87.
628 // Set up the FP register classes.
629 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
630 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
631
632 for (auto VT : { MVT::f32, MVT::f64 }) {
633 setOperationAction(ISD::UNDEF, VT, Expand);
634 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
635
636 // Always expand sin/cos functions even though x87 has an instruction.
637 setOperationAction(ISD::FSIN , VT, Expand);
638 setOperationAction(ISD::FCOS , VT, Expand);
639 setOperationAction(ISD::FSINCOS, VT, Expand);
640 }
641 }
642
643 // Expand FP32 immediates into loads from the stack, save special cases.
644 if (isTypeLegal(MVT::f32)) {
645 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
646 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
647 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
648 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
649 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
650 } else // SSE immediates.
651 addLegalFPImmediate(APFloat(+0.0f)); // xorps
652 }
653 // Expand FP64 immediates into loads from the stack, save special cases.
654 if (isTypeLegal(MVT::f64)) {
655 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
656 addLegalFPImmediate(APFloat(+0.0)); // FLD0
657 addLegalFPImmediate(APFloat(+1.0)); // FLD1
658 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
659 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
660 } else // SSE immediates.
661 addLegalFPImmediate(APFloat(+0.0)); // xorpd
662 }
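// Aside (illustrative note added for this report, not part of the LLVM
// source): only the constants registered above are "free". x87 can
// materialize +/-0.0 and +/-1.0 with FLD0/FLD1 plus FCHS, and SSE gets +0.0
// from xorps/xorpd; every other f32/f64 immediate is lowered to a load from
// the constant pool, as the surrounding comments describe.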
 663 // Handle constrained floating-point operations on scalars.
664 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
665 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
666 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
667 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
668 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
669 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
670 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
671 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
672 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
673 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
674 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
675 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
676 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
677
678 // We don't support FMA.
679 setOperationAction(ISD::FMA, MVT::f64, Expand);
680 setOperationAction(ISD::FMA, MVT::f32, Expand);
681
682 // f80 always uses X87.
683 if (UseX87) {
684 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
685 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
686 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
687 {
688 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
689 addLegalFPImmediate(TmpFlt); // FLD0
690 TmpFlt.changeSign();
691 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
692
693 bool ignored;
694 APFloat TmpFlt2(+1.0);
695 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
696 &ignored);
697 addLegalFPImmediate(TmpFlt2); // FLD1
698 TmpFlt2.changeSign();
699 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
700 }
701
702 // Always expand sin/cos functions even though x87 has an instruction.
703 setOperationAction(ISD::FSIN , MVT::f80, Expand);
704 setOperationAction(ISD::FCOS , MVT::f80, Expand);
705 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
706
707 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
708 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
709 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
710 setOperationAction(ISD::FRINT, MVT::f80, Expand);
711 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
712 setOperationAction(ISD::FMA, MVT::f80, Expand);
713 setOperationAction(ISD::LROUND, MVT::f80, Expand);
714 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
715 setOperationAction(ISD::LRINT, MVT::f80, Custom);
716 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
717
 718 // Handle constrained floating-point operations on scalars.
719 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
723 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
724 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
725 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
726 // as Custom.
727 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
728 }
729
730 // f128 uses xmm registers, but most operations require libcalls.
731 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
732 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
733 : &X86::VR128RegClass);
734
735 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
736
737 setOperationAction(ISD::FADD, MVT::f128, LibCall);
738 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
739 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
740 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
741 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
742 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
743 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
744 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
745 setOperationAction(ISD::FMA, MVT::f128, LibCall);
746 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
747
748 setOperationAction(ISD::FABS, MVT::f128, Custom);
749 setOperationAction(ISD::FNEG, MVT::f128, Custom);
750 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
751
752 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
753 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
754 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
755 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
756 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
757 // No STRICT_FSINCOS
758 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
759 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
760
761 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
762 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
763 // We need to custom handle any FP_ROUND with an f128 input, but
764 // LegalizeDAG uses the result type to know when to run a custom handler.
765 // So we have to list all legal floating point result types here.
766 if (isTypeLegal(MVT::f32)) {
767 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
768 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
769 }
770 if (isTypeLegal(MVT::f64)) {
771 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
772 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
773 }
774 if (isTypeLegal(MVT::f80)) {
775 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
776 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
777 }
778
779 setOperationAction(ISD::SETCC, MVT::f128, Custom);
780
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
782 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
783 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
785 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
786 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
787 }
788
789 // Always use a library call for pow.
790 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
792 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
793 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
794
795 setOperationAction(ISD::FLOG, MVT::f80, Expand);
796 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
797 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
798 setOperationAction(ISD::FEXP, MVT::f80, Expand);
799 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
800 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
801 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
802
803 // Some FP actions are always expanded for vector types.
804 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
805 MVT::v4f32, MVT::v8f32, MVT::v16f32,
806 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
807 setOperationAction(ISD::FSIN, VT, Expand);
808 setOperationAction(ISD::FSINCOS, VT, Expand);
809 setOperationAction(ISD::FCOS, VT, Expand);
810 setOperationAction(ISD::FREM, VT, Expand);
811 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
812 setOperationAction(ISD::FPOW, VT, Expand);
813 setOperationAction(ISD::FLOG, VT, Expand);
814 setOperationAction(ISD::FLOG2, VT, Expand);
815 setOperationAction(ISD::FLOG10, VT, Expand);
816 setOperationAction(ISD::FEXP, VT, Expand);
817 setOperationAction(ISD::FEXP2, VT, Expand);
818 }
819
820 // First set operation action for all vector types to either promote
821 // (for widening) or expand (for scalarization). Then we will selectively
822 // turn on ones that can be effectively codegen'd.
823 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
824 setOperationAction(ISD::SDIV, VT, Expand);
825 setOperationAction(ISD::UDIV, VT, Expand);
826 setOperationAction(ISD::SREM, VT, Expand);
827 setOperationAction(ISD::UREM, VT, Expand);
828 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
829 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
830 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
831 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
832 setOperationAction(ISD::FMA, VT, Expand);
833 setOperationAction(ISD::FFLOOR, VT, Expand);
834 setOperationAction(ISD::FCEIL, VT, Expand);
835 setOperationAction(ISD::FTRUNC, VT, Expand);
836 setOperationAction(ISD::FRINT, VT, Expand);
837 setOperationAction(ISD::FNEARBYINT, VT, Expand);
838 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
839 setOperationAction(ISD::MULHS, VT, Expand);
840 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
841 setOperationAction(ISD::MULHU, VT, Expand);
842 setOperationAction(ISD::SDIVREM, VT, Expand);
843 setOperationAction(ISD::UDIVREM, VT, Expand);
844 setOperationAction(ISD::CTPOP, VT, Expand);
845 setOperationAction(ISD::CTTZ, VT, Expand);
846 setOperationAction(ISD::CTLZ, VT, Expand);
847 setOperationAction(ISD::ROTL, VT, Expand);
848 setOperationAction(ISD::ROTR, VT, Expand);
849 setOperationAction(ISD::BSWAP, VT, Expand);
850 setOperationAction(ISD::SETCC, VT, Expand);
851 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
852 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
853 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
854 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
855 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
856 setOperationAction(ISD::TRUNCATE, VT, Expand);
857 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
858 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
859 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
860 setOperationAction(ISD::SELECT_CC, VT, Expand);
861 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
862 setTruncStoreAction(InnerVT, VT, Expand);
863
864 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
865 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
866
867 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 868 // types; we have to deal with them whether we ask for Expansion or not.
869 // Setting Expand causes its own optimisation problems though, so leave
870 // them legal.
871 if (VT.getVectorElementType() == MVT::i1)
872 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
873
874 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
875 // split/scalarized right now.
876 if (VT.getVectorElementType() == MVT::f16)
877 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
878 }
879 }
880
881 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
882 // with -msoft-float, disable use of MMX as well.
883 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
884 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
885 // No operations on x86mmx supported, everything uses intrinsics.
886 }
887
888 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
889 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
890 : &X86::VR128RegClass);
891
892 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
893 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
894 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
895 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
896 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
897 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
898 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
899 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
900
901 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
902 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
903
904 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
906 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
907 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
908 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
909 }
910
911 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
912 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
913 : &X86::VR128RegClass);
914
915 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
916 // registers cannot be used even for integer operations.
917 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
918 : &X86::VR128RegClass);
919 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
920 : &X86::VR128RegClass);
921 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
922 : &X86::VR128RegClass);
923 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
924 : &X86::VR128RegClass);
925
926 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
927 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
928 setOperationAction(ISD::SDIV, VT, Custom);
929 setOperationAction(ISD::SREM, VT, Custom);
930 setOperationAction(ISD::UDIV, VT, Custom);
931 setOperationAction(ISD::UREM, VT, Custom);
932 }
933
934 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
935 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
936 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
937
938 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
939 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
940 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
941 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
942 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
943 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
944 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
945 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
946 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
947 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
948 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
949 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
950
951 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
952 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
953
954 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
955 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
956 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
957
958 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
959 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
960 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
961 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
962 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
963 }
964
965 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
966 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
967 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
968 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
969 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
970 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
971 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
972 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
973 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
974 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
975
976 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
977 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
978 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
979 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
980
981 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
982 setOperationAction(ISD::SETCC, VT, Custom);
983 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
984 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
985 setOperationAction(ISD::CTPOP, VT, Custom);
986 setOperationAction(ISD::ABS, VT, Custom);
987
988 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
989 // setcc all the way to isel and prefer SETGT in some isel patterns.
990 setCondCodeAction(ISD::SETLT, VT, Custom);
991 setCondCodeAction(ISD::SETLE, VT, Custom);
992 }
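// Aside (illustrative note added for this report, not part of the LLVM
// source): SSE/AVX integer compares only provide PCMPEQ and PCMPGT, so the
// Custom condition-code handling above canonicalizes "less than" by
// swapping operands, conceptually
//
//   (setlt X, Y)  ->  (setgt Y, X)
//
// which lets the existing SETGT/PCMPGT patterns match during isel.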
993
994 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
995 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
996 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
997 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
998 setOperationAction(ISD::VSELECT, VT, Custom);
999 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1000 }
1001
1002 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
1003 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1004 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1005 setOperationAction(ISD::VSELECT, VT, Custom);
1006
1007 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1008 continue;
1009
1010 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1012 }
1013
1014 // Custom lower v2i64 and v2f64 selects.
1015 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1016 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1017 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1018 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1019 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1020
1021 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1022 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1023 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1024 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1025 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1026 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1027
1028 // Custom legalize these to avoid over promotion or custom promotion.
1029 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1030 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1031 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1032 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1033 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1034 }
1035
1036 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1037 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1038 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1039 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1040
1041 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1042 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1043
1044 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1045 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1046
1047 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1048 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1049 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1050 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1051 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1052
1053 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1054 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1055 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1056 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1057
1058 // We want to legalize this to an f64 load rather than an i64 load on
1059 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1060 // store.
1061 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1062 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1063 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1064 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1065 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1066 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1067
1068 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1069 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1070 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1071 if (!Subtarget.hasAVX512())
1072 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1073
1074 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1075 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1076 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1077
1078 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1079
1080 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1081 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1082 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1083 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1084 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1085 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1086
1087 // In the customized shift lowering, the legal v4i32/v2i64 cases
1088 // in AVX2 will be recognized.
1089 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1090 setOperationAction(ISD::SRL, VT, Custom);
1091 setOperationAction(ISD::SHL, VT, Custom);
1092 setOperationAction(ISD::SRA, VT, Custom);
1093 if (VT == MVT::v2i64) continue;
1094 setOperationAction(ISD::ROTL, VT, Custom);
1095 setOperationAction(ISD::ROTR, VT, Custom);
1096 setOperationAction(ISD::FSHL, VT, Custom);
1097 setOperationAction(ISD::FSHR, VT, Custom);
1098 }
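// Aside (illustrative note added for this report, not part of the LLVM
// source): plain SSE2 has no per-element variable shifts for these vector
// types, so the Custom handler synthesizes them; on AVX2 the variable
// v4i32/v2i64 logical shifts map directly onto VPSLLVD/VPSRLVD and
// VPSLLVQ/VPSRLVQ, which is the "legal v4i32/v2i64 cases in AVX2" situation
// the comment above refers to.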
1099
1100 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1103 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1104 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1105 }
1106
1107 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1108 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1109 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1110 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1111 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1114 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1115 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1116
1117 // These might be better off as horizontal vector ops.
1118 setOperationAction(ISD::ADD, MVT::i16, Custom);
1119 setOperationAction(ISD::ADD, MVT::i32, Custom);
1120 setOperationAction(ISD::SUB, MVT::i16, Custom);
1121 setOperationAction(ISD::SUB, MVT::i32, Custom);
1122 }
1123
1124 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1125 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1126 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1127 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1128 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1129 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1130 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1131 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1132 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1133 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1134 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1135 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1136 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1137 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1138
1139 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1140 }
1141
1142 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1143 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1144 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1145 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1146 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1147 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1148 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1149 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1150
1151 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1152 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1153 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1154
1155 // FIXME: Do we need to handle scalar-to-vector here?
1156 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1157
1158 // We directly match byte blends in the backend as they match the VSELECT
1159 // condition form.
1160 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1161
1162 // SSE41 brings specific instructions for doing vector sign extend even in
1163 // cases where we don't have SRA.
1164 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1165 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1166 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1167 }
1168
1169 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1170 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1171 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1173 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1174 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1175 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1176 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1177 }
1178
1179 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
 1180 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181 // do the pre and post work in the vector domain.
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185 // so that DAG combine doesn't try to turn it into uint_to_fp.
1186 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1187 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188 }
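     // Editor's note (illustrative sketch, not in the LLVM source): the Custom
     // action above routes the node to X86's lowering hook, which conceptually
     // does something like the following pseudocode,
     //   for (unsigned i = 0; i != 4; ++i)
     //     Elt[i] = scalar_convert(extract_element(Src, i)); // one cvtsi2ss each
     // while the surrounding extracts/inserts and the unsigned-range fixup stay
     // in the vector domain, as the comment above describes.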
1189 }
1190
1191 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1192 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1193 }
1194
1195 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1197 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1198 setOperationAction(ISD::ROTL, VT, Custom);
1199 setOperationAction(ISD::ROTR, VT, Custom);
1200 }
1201
1202 // XOP can efficiently perform BITREVERSE with VPPERM.
1203 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1204 setOperationAction(ISD::BITREVERSE, VT, Custom);
1205
1206 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1207 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1208 setOperationAction(ISD::BITREVERSE, VT, Custom);
1209 }
1210
1211 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1212 bool HasInt256 = Subtarget.hasInt256();
1213
1214 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215 : &X86::VR256RegClass);
1216 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1217 : &X86::VR256RegClass);
1218 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219 : &X86::VR256RegClass);
1220 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1221 : &X86::VR256RegClass);
1222 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1223 : &X86::VR256RegClass);
1224 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1225 : &X86::VR256RegClass);
1226
1227 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1228 setOperationAction(ISD::FFLOOR, VT, Legal);
1229 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1230 setOperationAction(ISD::FCEIL, VT, Legal);
1231 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1232 setOperationAction(ISD::FTRUNC, VT, Legal);
1233 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1234 setOperationAction(ISD::FRINT, VT, Legal);
1235 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1236 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1237 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1238 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1239 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1240
1241 setOperationAction(ISD::FROUND, VT, Custom);
1242
1243 setOperationAction(ISD::FNEG, VT, Custom);
1244 setOperationAction(ISD::FABS, VT, Custom);
1245 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1246 }
1247
1248 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1249 // even though v8i16 is a legal type.
1250 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1253 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1254 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1255 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1256 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1257
1258 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1259 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1260
1261 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1262 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1263 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1264 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1265 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1266 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1267 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1268 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1269 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1270 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1271 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1272 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1273
1274 if (!Subtarget.hasAVX512())
1275 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1276
1277 // In the customized shift lowering, the legal v8i32/v4i64 cases
1278 // in AVX2 will be recognized.
1279 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1280 setOperationAction(ISD::SRL, VT, Custom);
1281 setOperationAction(ISD::SHL, VT, Custom);
1282 setOperationAction(ISD::SRA, VT, Custom);
1283 if (VT == MVT::v4i64) continue;
1284 setOperationAction(ISD::ROTL, VT, Custom);
1285 setOperationAction(ISD::ROTR, VT, Custom);
1286 setOperationAction(ISD::FSHL, VT, Custom);
1287 setOperationAction(ISD::FSHR, VT, Custom);
1288 }
1289
1290 // These types need custom splitting if their input is a 128-bit vector.
1291 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1292 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1293 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1294 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1295
1296 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1297 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1298 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1299 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1300 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1302
1303 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1305 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1307 }
1308
1309 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1310 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1311 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1312 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1313
1314 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315 setOperationAction(ISD::SETCC, VT, Custom);
1316 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1317 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1318 setOperationAction(ISD::CTPOP, VT, Custom);
1319 setOperationAction(ISD::CTLZ, VT, Custom);
1320
1321 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1322 // setcc all the way to isel and prefer SETGT in some isel patterns.
1323 setCondCodeAction(ISD::SETLT, VT, Custom);
1324 setCondCodeAction(ISD::SETLE, VT, Custom);
1325 }
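     // Editor's note (illustrative, not in the LLVM source): marking SETLT/SETLE
     // as Custom lets the lowering canonicalize them onto the greater-than forms
     // that the PCMPGT-style patterns expect; e.g. a v8i32 (setcc A, B, setlt) is
     // typically rewritten as (setcc B, A, setgt) by swapping the operands.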
1326
1327 if (Subtarget.hasAnyFMA()) {
1328 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329 MVT::v2f64, MVT::v4f64 }) {
1330 setOperationAction(ISD::FMA, VT, Legal);
1331 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332 }
1333 }
1334
1335 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338 }
1339
1340 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1341 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1342 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1343 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1344
1345 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1346 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1347 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1348 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1349 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1350 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1351 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1352 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1353
1354 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1355 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1356
1357 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1358 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1359 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1360 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1361 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1362
1363 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1364 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1365 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1370 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1371 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1372 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1373 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1374 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1375
1376 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1377 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1378 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1379 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1380 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1381 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1382 }
1383
1384 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1385 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1386 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1387 }
1388
1389 if (HasInt256) {
1390 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1391 // when we have a 256-bit-wide blend with immediate.
1392 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1394
1395 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1396 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1397 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1398 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1399 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1403 }
1404 }
1405
1406 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1407 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1408 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1409 setOperationAction(ISD::MSTORE, VT, Legal);
1410 }
1411
1412 // Extract subvector is special because the value type
1413 // (result) is 128-bit but the source is 256-bit wide.
1414 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1415 MVT::v4f32, MVT::v2f64 }) {
1416 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1417 }
1418
1419 // Custom lower several nodes for 256-bit types.
1420 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1421 MVT::v8f32, MVT::v4f64 }) {
1422 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1423 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1424 setOperationAction(ISD::VSELECT, VT, Custom);
1425 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1426 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1427 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1428 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1429 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1430 setOperationAction(ISD::STORE, VT, Custom);
1431 }
1432
1433 if (HasInt256) {
1434 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1435
1436 // Custom legalize 2x32 to get a little better code.
1437 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1438 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1439
1440 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1441 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1442 setOperationAction(ISD::MGATHER, VT, Custom);
1443 }
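    // Editor's note (a sketch of the intent, not in the LLVM source): the Custom
    // action for the 2 x 32-bit gathers above presumably lets the lowering widen
    // them to a 4-element gather with the extra lanes masked off, which is the
    // "little better code" the comment refers to.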
1444 }
1445
1446 // This block controls legalization of the mask vector sizes that are
1447 // available with AVX512. 512-bit vectors are in a separate block controlled
1448 // by useAVX512Regs.
1449 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1450 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1451 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1452 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1453 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1454 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1455
1456 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1457 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1458 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1459
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1462 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1463 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1464 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1465 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1466 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1467 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1468 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1469 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1470 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1471 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1472
1473 // There is no byte sized k-register load or store without AVX512DQ.
1474 if (!Subtarget.hasDQI()) {
1475 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1476 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1477 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1478 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1479
1480 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1481 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1482 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1483 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1484 }
1485
1486 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1487 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1488 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1489 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1490 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1491 }
1492
1493 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1494 setOperationAction(ISD::VSELECT, VT, Expand);
1495
1496 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1497 setOperationAction(ISD::SETCC, VT, Custom);
1498 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1499 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1500 setOperationAction(ISD::SELECT, VT, Custom);
1501 setOperationAction(ISD::TRUNCATE, VT, Custom);
1502
1503 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1505 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1507 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1508 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1509 }
1510
1511 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1512 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1513 }
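   // Editor's note (illustrative, not in the LLVM source): the
   // setOperationPromotedToType calls above mean that, e.g., an fp_to_sint
   // producing v8i1 is first computed with a v8i32 result and then truncated
   // back down to the v8i1 mask type.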
1514
1515 // This block controls legalization for 512-bit operations with 32/64-bit
1516 // elements. 512-bit vectors can be disabled based on prefer-vector-width and
1517 // required-vector-width function attributes.
1518 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1519 bool HasBWI = Subtarget.hasBWI();
1520
1521 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1522 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1523 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1525 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1526 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1527
1528 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1529 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1530 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1531 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1532 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1533 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1534 if (HasBWI)
1535 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1536 }
1537
1538 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1539 setOperationAction(ISD::FNEG, VT, Custom);
1540 setOperationAction(ISD::FABS, VT, Custom);
1541 setOperationAction(ISD::FMA, VT, Legal);
1542 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1543 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1544 }
1545
1546 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1547 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1548 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1549 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1550 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1551 }
1552 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1553 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1554 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1555 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1556 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1557 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1558 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1559 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1560
1561 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1562 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1563 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1564 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1565 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1566 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1567 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1568 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1569 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1570 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1571 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1572 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1573
1574 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1575 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1576 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1577 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1578 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1579 if (HasBWI)
1580 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1581
1582 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1583 // to 512-bit rather than use the AVX2 instructions so that we can use
1584 // k-masks.
1585 if (!Subtarget.hasVLX()) {
1586 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1587 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1588 setOperationAction(ISD::MLOAD, VT, Custom);
1589 setOperationAction(ISD::MSTORE, VT, Custom);
1590 }
1591 }
1592
1593 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1594 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1595 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1596 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1597 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1598 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1599 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1600 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1601 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1602 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1603 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1604 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1605 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1606
1607 if (HasBWI) {
1608 // Extends from v64i1 masks to 512-bit vectors.
1609 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1610 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1611 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1612 }
1613
1614 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1615 setOperationAction(ISD::FFLOOR, VT, Legal);
1616 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1617 setOperationAction(ISD::FCEIL, VT, Legal);
1618 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1619 setOperationAction(ISD::FTRUNC, VT, Legal);
1620 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1621 setOperationAction(ISD::FRINT, VT, Legal);
1622 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1623 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1624 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1625 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1626 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1627
1628 setOperationAction(ISD::FROUND, VT, Custom);
1629 }
1630
1631 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1632 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1633 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1634 }
1635
1636 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1637 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1638 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1639 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1640
1641 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1642 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1643 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1644 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1645
1646 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1647 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1648 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1649 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1650 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1651 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1652 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1653 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1654
1655 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1656 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1657
1658 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1659
1660 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1661 setOperationAction(ISD::SRL, VT, Custom);
1662 setOperationAction(ISD::SHL, VT, Custom);
1663 setOperationAction(ISD::SRA, VT, Custom);
1664 setOperationAction(ISD::ROTL, VT, Custom);
1665 setOperationAction(ISD::ROTR, VT, Custom);
1666 setOperationAction(ISD::SETCC, VT, Custom);
1667
1668 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1669 // setcc all the way to isel and prefer SETGT in some isel patterns.
1670 setCondCodeAction(ISD::SETLT, VT, Custom);
1671 setCondCodeAction(ISD::SETLE, VT, Custom);
1672 }
1673 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1674 setOperationAction(ISD::SMAX, VT, Legal);
1675 setOperationAction(ISD::UMAX, VT, Legal);
1676 setOperationAction(ISD::SMIN, VT, Legal);
1677 setOperationAction(ISD::UMIN, VT, Legal);
1678 setOperationAction(ISD::ABS, VT, Legal);
1679 setOperationAction(ISD::CTPOP, VT, Custom);
1680 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1681 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1682 }
1683
1684 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1685 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1686 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1687 setOperationAction(ISD::CTLZ, VT, Custom);
1688 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1692 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1693 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1694 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1695 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1696 }
1697
1698 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1699 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1700 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1701 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1702 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1703 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1704
1705 if (Subtarget.hasDQI()) {
1706 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1707 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1708 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1709 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1710 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1711 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1712 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1713 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1714
1715 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1716 }
1717
1718 if (Subtarget.hasCDI()) {
1719 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1720 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1721 setOperationAction(ISD::CTLZ, VT, Legal);
1722 }
1723 } // Subtarget.hasCDI()
1724
1725 if (Subtarget.hasVPOPCNTDQ()) {
1726 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1727 setOperationAction(ISD::CTPOP, VT, Legal);
1728 }
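    // Editor's note (illustrative, not in the LLVM source): with AVX512VPOPCNTDQ
    // the Legal action above lets isel match ISD::CTPOP directly to the
    // vpopcntd / vpopcntq instructions instead of the longer bit-manipulation
    // sequences used otherwise.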
1729
1730 // Extract subvector is special because the value type
1731 // (result) is 256-bit but the source is 512-bit wide.
1732 // 128-bit was made Legal under AVX1.
1733 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1734 MVT::v8f32, MVT::v4f64 })
1735 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1736
1737 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1738 MVT::v16f32, MVT::v8f64 }) {
1739 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1740 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1741 setOperationAction(ISD::SELECT, VT, Custom);
1742 setOperationAction(ISD::VSELECT, VT, Custom);
1743 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1744 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1745 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1746 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1747 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1748 }
1749
1750 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1751 setOperationAction(ISD::MLOAD, VT, Legal);
1752 setOperationAction(ISD::MSTORE, VT, Legal);
1753 setOperationAction(ISD::MGATHER, VT, Custom);
1754 setOperationAction(ISD::MSCATTER, VT, Custom);
1755 }
1756 if (HasBWI) {
1757 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1758 setOperationAction(ISD::MLOAD, VT, Legal);
1759 setOperationAction(ISD::MSTORE, VT, Legal);
1760 }
1761 } else {
1762 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1763 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1764 }
1765
1766 if (Subtarget.hasVBMI2()) {
1767 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1768 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1769 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1770 setOperationAction(ISD::FSHL, VT, Custom);
1771 setOperationAction(ISD::FSHR, VT, Custom);
1772 }
1773
1774 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1775 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1776 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1777 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1778 }
1779 }// useAVX512Regs
1780
1781 // This block controls legalization for operations that don't have
1782 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1783 // narrower widths.
1784 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1785 // These operations are handled on non-VLX by artificially widening in
1786 // isel patterns.
1787
1788 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1789 Subtarget.hasVLX() ? Legal : Custom);
1790 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1791 Subtarget.hasVLX() ? Legal : Custom);
1792 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1793 Subtarget.hasVLX() ? Legal : Custom);
1794 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1795 Subtarget.hasVLX() ? Legal : Custom);
1796 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1797 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1798 Subtarget.hasVLX() ? Legal : Custom);
1799 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1800 Subtarget.hasVLX() ? Legal : Custom);
1801 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1802 Subtarget.hasVLX() ? Legal : Custom);
1803 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1804 Subtarget.hasVLX() ? Legal : Custom);
1805
1806 if (Subtarget.hasDQI()) {
1807 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1808 // v2f32 UINT_TO_FP is already custom under SSE2.
1809      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1810             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1811             "Unexpected operation action!");
1812 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1813 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1814 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1815 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1816 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1817 }
1818
1819 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1820 setOperationAction(ISD::SMAX, VT, Legal);
1821 setOperationAction(ISD::UMAX, VT, Legal);
1822 setOperationAction(ISD::SMIN, VT, Legal);
1823 setOperationAction(ISD::UMIN, VT, Legal);
1824 setOperationAction(ISD::ABS, VT, Legal);
1825 }
1826
1827 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1828 setOperationAction(ISD::ROTL, VT, Custom);
1829 setOperationAction(ISD::ROTR, VT, Custom);
1830 }
1831
1832 // Custom legalize 2x32 to get a little better code.
1833 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1834 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1835
1836 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1837 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1838 setOperationAction(ISD::MSCATTER, VT, Custom);
1839
1840 if (Subtarget.hasDQI()) {
1841 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1842 setOperationAction(ISD::SINT_TO_FP, VT,
1843 Subtarget.hasVLX() ? Legal : Custom);
1844 setOperationAction(ISD::UINT_TO_FP, VT,
1845 Subtarget.hasVLX() ? Legal : Custom);
1846 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1847 Subtarget.hasVLX() ? Legal : Custom);
1848 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1849 Subtarget.hasVLX() ? Legal : Custom);
1850 setOperationAction(ISD::FP_TO_SINT, VT,
1851 Subtarget.hasVLX() ? Legal : Custom);
1852 setOperationAction(ISD::FP_TO_UINT, VT,
1853 Subtarget.hasVLX() ? Legal : Custom);
1854 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1855 Subtarget.hasVLX() ? Legal : Custom);
1856 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1857 Subtarget.hasVLX() ? Legal : Custom);
1858 setOperationAction(ISD::MUL, VT, Legal);
1859 }
1860 }
1861
1862 if (Subtarget.hasCDI()) {
1863 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1864 setOperationAction(ISD::CTLZ, VT, Legal);
1865 }
1866 } // Subtarget.hasCDI()
1867
1868 if (Subtarget.hasVPOPCNTDQ()) {
1869 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1870 setOperationAction(ISD::CTPOP, VT, Legal);
1871 }
1872 }
1873
1874 // This block controls legalization of v32i1/v64i1, which are available with
1875 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1876 // useBWIRegs.
1877 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1878 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1879 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1880
1881 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1882 setOperationAction(ISD::VSELECT, VT, Expand);
1883 setOperationAction(ISD::TRUNCATE, VT, Custom);
1884 setOperationAction(ISD::SETCC, VT, Custom);
1885 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1886 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1887 setOperationAction(ISD::SELECT, VT, Custom);
1888 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1889 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1890 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1891 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1892 }
1893
1894 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1895 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1896
1897 // Extends from v32i1 masks to 256-bit vectors.
1898 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1899 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1900 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1901
1902 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1903 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1904 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1905 }
1906
1907 // These operations are handled on non-VLX by artificially widening in
1908 // isel patterns.
1909 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1910
1911 if (Subtarget.hasBITALG()) {
1912 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1913 setOperationAction(ISD::CTPOP, VT, Legal);
1914 }
1915 }
1916
1917 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
1918 auto setGroup = [&] (MVT VT) {
1919 setOperationAction(ISD::FADD, VT, Legal);
1920 setOperationAction(ISD::STRICT_FADD, VT, Legal);
1921 setOperationAction(ISD::FSUB, VT, Legal);
1922 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
1923 setOperationAction(ISD::FMUL, VT, Legal);
1924 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
1925 setOperationAction(ISD::FDIV, VT, Legal);
1926 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
1927 setOperationAction(ISD::FSQRT, VT, Legal);
1928 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
1929
1930 setOperationAction(ISD::FFLOOR, VT, Legal);
1931 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1932 setOperationAction(ISD::FCEIL, VT, Legal);
1933 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1934 setOperationAction(ISD::FTRUNC, VT, Legal);
1935 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1936 setOperationAction(ISD::FRINT, VT, Legal);
1937 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1938 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1939 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1940
1941 setOperationAction(ISD::LOAD, VT, Legal);
1942 setOperationAction(ISD::STORE, VT, Legal);
1943
1944 setOperationAction(ISD::FMA, VT, Legal);
1945 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1946 setOperationAction(ISD::VSELECT, VT, Legal);
1947 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1948 setOperationAction(ISD::SELECT, VT, Custom);
1949
1950 setOperationAction(ISD::FNEG, VT, Custom);
1951 setOperationAction(ISD::FABS, VT, Custom);
1952 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1953 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1954 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1955 };
1956
1957 // AVX512_FP16 scalar operations
1958 setGroup(MVT::f16);
1959 addRegisterClass(MVT::f16, &X86::FR16XRegClass);
1960 setOperationAction(ISD::FREM, MVT::f16, Promote);
1961 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
1962 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
1963 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
1964 setOperationAction(ISD::SETCC, MVT::f16, Custom);
1965 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
1966 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1967 setOperationAction(ISD::FROUND, MVT::f16, Custom);
1968 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
1969 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
1970 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
1971 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
1972 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
1973 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
1974 if (isTypeLegal(MVT::f80)) {
1975 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
1976 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
1977 }
1978
1979 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
1980 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
1981
1982 if (Subtarget.useAVX512Regs()) {
1983 setGroup(MVT::v32f16);
1984 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1985 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
1986 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
1987 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
1988 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
1989 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
1990 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
1991 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1992 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
1993
1994 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
1995 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
1996 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
1997 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
1998 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
1999 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2000 MVT::v32i16);
2001 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2002 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2003 MVT::v32i16);
2004 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2005 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2006 MVT::v32i16);
2007 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2008 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2009 MVT::v32i16);
2010
2011 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2012 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2013 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2014
2015 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2016 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2017
2018 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2019 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2020 }
2021
2022 if (Subtarget.hasVLX()) {
2023 addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
2024 addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
2025 setGroup(MVT::v8f16);
2026 setGroup(MVT::v16f16);
2027
2028 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2029 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2030 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2031 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2032 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2033 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2034 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2035 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2036 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2037 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2038
2039 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2040 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2041 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2042 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2043 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2044 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2045
2046 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2047 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2048 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2049
2050 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2051 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2052 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2053
2054 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2055 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2056 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2057 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2058
2059 // Need to custom widen these to prevent scalarization.
2060 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2061 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2062 }
2063
2064 // Support fp16 0 immediate
2065 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
2066 }
2067
2068 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2069 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2070 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2071 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2072 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2073 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2074
2075 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2076 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2077 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2078 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2079 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2080
2081 if (Subtarget.hasBWI()) {
2082 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2083 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2084 }
2085
2086 if (Subtarget.hasFP16()) {
2087 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2088 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2089 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2090 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2091 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2092 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2093 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2094 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2095 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2096 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2097 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2098 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2099 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2100 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2101 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2102 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2103 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2104 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2105 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2106 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2107 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2108 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2109 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2110 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2111 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2112 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2113 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2114 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2115 }
2116
2117 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2118 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2119 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2120 }
2121
2122 if (Subtarget.hasAMXTILE()) {
2123 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2124 }
2125
2126 // We want to custom lower some of our intrinsics.
2127 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2128 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2129 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2130 if (!Subtarget.is64Bit()) {
2131 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2132 }
2133
2134 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2135 // handle type legalization for these operations here.
2136 //
2137 // FIXME: We really should do custom legalization for addition and
2138 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2139 // than generic legalization for 64-bit multiplication-with-overflow, though.
2140 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2141 if (VT == MVT::i64 && !Subtarget.is64Bit())
2142 continue;
2143 // Add/Sub/Mul with overflow operations are custom lowered.
2144 setOperationAction(ISD::SADDO, VT, Custom);
2145 setOperationAction(ISD::UADDO, VT, Custom);
2146 setOperationAction(ISD::SSUBO, VT, Custom);
2147 setOperationAction(ISD::USUBO, VT, Custom);
2148 setOperationAction(ISD::SMULO, VT, Custom);
2149 setOperationAction(ISD::UMULO, VT, Custom);
2150
2151 // Support carry in as value rather than glue.
2152 setOperationAction(ISD::ADDCARRY, VT, Custom);
2153 setOperationAction(ISD::SUBCARRY, VT, Custom);
2154 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2155 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2156 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2157 }
2158
2159 if (!Subtarget.is64Bit()) {
2160 // These libcalls are not available in 32-bit.
2161 setLibcallName(RTLIB::SHL_I128, nullptr);
2162 setLibcallName(RTLIB::SRL_I128, nullptr);
2163 setLibcallName(RTLIB::SRA_I128, nullptr);
2164 setLibcallName(RTLIB::MUL_I128, nullptr);
2165 // The MULO libcall is not part of libgcc, only compiler-rt.
2166 setLibcallName(RTLIB::MULO_I64, nullptr);
2167 }
2168 // The MULO libcall is not part of libgcc, only compiler-rt.
2169 setLibcallName(RTLIB::MULO_I128, nullptr);
2170
2171 // Combine sin / cos into _sincos_stret if it is available.
2172 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2173 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2174 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2175 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2176 }
2177
2178 if (Subtarget.isTargetWin64()) {
2179 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2180 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2181 setOperationAction(ISD::SREM, MVT::i128, Custom);
2182 setOperationAction(ISD::UREM, MVT::i128, Custom);
2183 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2184 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2185 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2187 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2188 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2189 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2190 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2191 }
2192
2193 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2194 // is. We should promote the value to 64-bits to solve this.
2195 // This is what the CRT headers do - `fmodf` is an inline header
2196 // function casting to f64 and calling `fmod`.
2197 if (Subtarget.is32Bit() &&
2198 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2199 for (ISD::NodeType Op :
2200 {ISD::FCEIL, ISD::STRICT_FCEIL,
2201 ISD::FCOS, ISD::STRICT_FCOS,
2202 ISD::FEXP, ISD::STRICT_FEXP,
2203 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2204 ISD::FREM, ISD::STRICT_FREM,
2205 ISD::FLOG, ISD::STRICT_FLOG,
2206 ISD::FLOG10, ISD::STRICT_FLOG10,
2207 ISD::FPOW, ISD::STRICT_FPOW,
2208 ISD::FSIN, ISD::STRICT_FSIN})
2209 if (isOperationExpand(Op, MVT::f32))
2210 setOperationAction(Op, MVT::f32, Promote);
2211
2212 // We have target-specific dag combine patterns for the following nodes:
2213 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2214 ISD::SCALAR_TO_VECTOR,
2215 ISD::INSERT_VECTOR_ELT,
2216 ISD::EXTRACT_VECTOR_ELT,
2217 ISD::CONCAT_VECTORS,
2218 ISD::INSERT_SUBVECTOR,
2219 ISD::EXTRACT_SUBVECTOR,
2220 ISD::BITCAST,
2221 ISD::VSELECT,
2222 ISD::SELECT,
2223 ISD::SHL,
2224 ISD::SRA,
2225 ISD::SRL,
2226 ISD::OR,
2227 ISD::AND,
2228 ISD::ADD,
2229 ISD::FADD,
2230 ISD::FSUB,
2231 ISD::FNEG,
2232 ISD::FMA,
2233 ISD::STRICT_FMA,
2234 ISD::FMINNUM,
2235 ISD::FMAXNUM,
2236 ISD::SUB,
2237 ISD::LOAD,
2238 ISD::MLOAD,
2239 ISD::STORE,
2240 ISD::MSTORE,
2241 ISD::TRUNCATE,
2242 ISD::ZERO_EXTEND,
2243 ISD::ANY_EXTEND,
2244 ISD::SIGN_EXTEND,
2245 ISD::SIGN_EXTEND_INREG,
2246 ISD::ANY_EXTEND_VECTOR_INREG,
2247 ISD::SIGN_EXTEND_VECTOR_INREG,
2248 ISD::ZERO_EXTEND_VECTOR_INREG,
2249 ISD::SINT_TO_FP,
2250 ISD::UINT_TO_FP,
2251 ISD::STRICT_SINT_TO_FP,
2252 ISD::STRICT_UINT_TO_FP,
2253 ISD::SETCC,
2254 ISD::MUL,
2255 ISD::XOR,
2256 ISD::MSCATTER,
2257 ISD::MGATHER,
2258 ISD::FP16_TO_FP,
2259 ISD::FP_EXTEND,
2260 ISD::STRICT_FP_EXTEND,
2261 ISD::FP_ROUND});
2262
2263 computeRegisterProperties(Subtarget.getRegisterInfo());
2264
2265 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2266 MaxStoresPerMemsetOptSize = 8;
2267 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2268 MaxStoresPerMemcpyOptSize = 4;
2269 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2270 MaxStoresPerMemmoveOptSize = 4;
2271
2272 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2273 // that needs to be benchmarked and balanced with the potential use of vector
2274 // load/store types (PR33329, PR33914).
2275 MaxLoadsPerMemcmp = 2;
2276 MaxLoadsPerMemcmpOptSize = 2;
2277
2278 // Default loop alignment, which can be overridden by -align-loops.
2279 setPrefLoopAlignment(Align(16));
2280
2281 // An out-of-order CPU can speculatively execute past a predictable branch,
2282 // but a conditional move could be stalled by an expensive earlier operation.
2283 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2284 EnableExtLdPromotion = true;
2285 setPrefFunctionAlignment(Align(16));
2286
2287 verifyIntrinsicTables();
2288
2289 // Default to having -disable-strictnode-mutation on
2290 IsStrictFPEnabled = true;
2291}
2292
2293// This has so far only been implemented for 64-bit MachO.
2294bool X86TargetLowering::useLoadStackGuardNode() const {
2295 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2296}
2297
2298bool X86TargetLowering::useStackGuardXorFP() const {
2299 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2300 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2301}
2302
2303SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2304 const SDLoc &DL) const {
2305 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2306 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2307 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2308 return SDValue(Node, 0);
2309}
2310
2311TargetLoweringBase::LegalizeTypeAction
2312X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2313 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2314 !Subtarget.hasBWI())
2315 return TypeSplitVector;
2316
2317 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2318 VT.getVectorElementType() != MVT::i1)
2319 return TypeWidenVector;
2320
2321 return TargetLoweringBase::getPreferredVectorAction(VT);
2322}
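// Editor's note (worked example, not in the LLVM source): given the rules above,
// an illegal v3i32 is widened to v4i32 (TypeWidenVector), while v32i1 on an
// AVX512 target without BWI is split (TypeSplitVector); everything else falls
// back to the generic preference.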
2323
2324static std::pair<MVT, unsigned>
2325handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2326 const X86Subtarget &Subtarget) {
2327 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2328 // convention is one that uses k registers.
2329 if (NumElts == 2)
2330 return {MVT::v2i64, 1};
2331 if (NumElts == 4)
2332 return {MVT::v4i32, 1};
2333 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2334 CC != CallingConv::Intel_OCL_BI)
2335 return {MVT::v8i16, 1};
2336 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2337 CC != CallingConv::Intel_OCL_BI)
2338 return {MVT::v16i8, 1};
2339 // v32i1 passes in ymm unless we have BWI and the calling convention is
2340 // regcall.
2341 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2342 return {MVT::v32i8, 1};
2343 // Split v64i1 vectors if we don't have v64i8 available.
2344 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2345 if (Subtarget.useAVX512Regs())
2346 return {MVT::v64i8, 1};
2347 return {MVT::v32i8, 2};
2348 }
2349
2350 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2351 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2352 NumElts > 64)
2353 return {MVT::i8, NumElts};
2354
2355 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2356}
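// Editor's note (worked example, not in the LLVM source): under the default C
// calling convention on an AVX512 target without BWI, a v32i1 argument maps to
// {MVT::v32i8, 1} (a single YMM register), v64i1 falls through to {MVT::i8, 64}
// scalars, and a v128i1 argument is likewise broken into {MVT::i8, 128}.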
2357
2358MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2359 CallingConv::ID CC,
2360 EVT VT) const {
2361 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2362 Subtarget.hasAVX512()) {
2363 unsigned NumElts = VT.getVectorNumElements();
2364
2365 MVT RegisterVT;
2366 unsigned NumRegisters;
2367 std::tie(RegisterVT, NumRegisters) =
2368 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2369 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2370 return RegisterVT;
2371 }
2372
2373 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16.
2374 // So its default register type is f16; we override the type to v8f16 here.
2375 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2376 return MVT::v8f16;
2377
2378 // We use more GPRs for f64 and f80 on 32-bit targets when x87 is disabled.
2379 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2380 !Subtarget.hasX87())
2381 return MVT::i32;
2382
2383 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2384}
2385
2386unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2387 CallingConv::ID CC,
2388 EVT VT) const {
2389 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2390 Subtarget.hasAVX512()) {
2391 unsigned NumElts = VT.getVectorNumElements();
2392
2393 MVT RegisterVT;
2394 unsigned NumRegisters;
2395 std::tie(RegisterVT, NumRegisters) =
2396 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2397 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2398 return NumRegisters;
2399 }
2400
2401 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16.
2402 // So its default register count is 3; we override it to 1 here.
2403 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2404 return 1;
2405
2406 // We have to split f64 into 2 registers and f80 into 3 registers on 32-bit
2407 // targets if x87 is disabled.
2408 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2409 if (VT == MVT::f64)
2410 return 2;
2411 if (VT == MVT::f80)
2412 return 3;
2413 }
2414
2415 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2416}
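// Editor's note (worked example, not in the LLVM source): combined with
// getRegisterTypeForCallingConv above, an f64 argument on a 32-bit target with
// x87 disabled is passed as 2 x MVT::i32 and an f80 as 3 x MVT::i32.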
2417
2418unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2419 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2420 unsigned &NumIntermediates, MVT &RegisterVT) const {
2421 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2422 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2423 Subtarget.hasAVX512() &&
2424 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2425 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2426 VT.getVectorNumElements() > 64)) {
2427 RegisterVT = MVT::i8;
2428 IntermediateVT = MVT::i1;
2429 NumIntermediates = VT.getVectorNumElements();
2430 return NumIntermediates;
2431 }
2432
2433 // Split v64i1 vectors if we don't have v64i8 available.
2434 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2435 CC != CallingConv::X86_RegCall) {
2436 RegisterVT = MVT::v32i8;
2437 IntermediateVT = MVT::v32i1;
2438 NumIntermediates = 2;
2439 return 2;
2440 }
2441
2442 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2443 NumIntermediates, RegisterVT);
2444}
2445
2446EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2447 LLVMContext& Context,
2448 EVT VT) const {
2449 if (!VT.isVector())
2450 return MVT::i8;
2451
2452 if (Subtarget.hasAVX512()) {
2453 // Figure out what this type will be legalized to.
2454 EVT LegalVT = VT;
2455 while (getTypeAction(Context, LegalVT) != TypeLegal)
2456 LegalVT = getTypeToTransformTo(Context, LegalVT);
2457
2458 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2459 if (LegalVT.getSimpleVT().is512BitVector())
2460 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2461
2462 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2463 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2464 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2465 // vXi16/vXi8.
2466 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2467 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2468 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2469 }
2470 }
2471
2472 return VT.changeVectorElementTypeToInteger();
2473}
2474
2475/// Helper for getByValTypeAlignment to determine
2476/// the desired ByVal argument alignment.
2477static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2478 if (MaxAlign == 16)
2479 return;
2480 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2481 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2482 MaxAlign = Align(16);
2483 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2484 Align EltAlign;
2485 getMaxByValAlign(ATy->getElementType(), EltAlign);
2486 if (EltAlign > MaxAlign)
2487 MaxAlign = EltAlign;
2488 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2489 for (auto *EltTy : STy->elements()) {
2490 Align EltAlign;
2491 getMaxByValAlign(EltTy, EltAlign);
2492 if (EltAlign > MaxAlign)
2493 MaxAlign = EltAlign;
2494 if (MaxAlign == 16)
2495 break;
2496 }
2497 }
2498}
2499
2500/// Return the desired alignment for ByVal aggregate
2501/// function arguments in the caller parameter area. For X86, aggregates
2502/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2503/// are at 4-byte boundaries.
2504uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2505 const DataLayout &DL) const {
2506 if (Subtarget.is64Bit()) {
2507 // Max of 8 and alignment of type.
2508 Align TyAlign = DL.getABITypeAlign(Ty);
2509 if (TyAlign > 8)
2510 return TyAlign.value();
2511 return 8;
2512 }
2513
2514 Align Alignment(4);
2515 if (Subtarget.hasSSE1())
2516 getMaxByValAlign(Ty, Alignment);
2517 return Alignment.value();
2518}
2519
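A small illustration of the rule described above, using hypothetical IR struct types (assumed for the example; 32-bit subtarget with SSE1):

//   %struct.HasVec = type { <4 x float>, i32 }   // contains a 128-bit vector
//   %struct.Plain  = type { i32, i32 }
//
//   getByValTypeAlignment(%struct.HasVec, DL) -> 16   // SSE vector inside, 16-byte boundary
//   getByValTypeAlignment(%struct.Plain,  DL) -> 4    // default 4-byte boundary
//   // On 64-bit targets the result is max(8, ABI alignment of the type).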
2520/// It returns EVT::Other if the type should be determined using generic
2521/// target-independent logic.
2522/// For vector ops we check that the overall size isn't larger than our
2523/// preferred vector width.
2524EVT X86TargetLowering::getOptimalMemOpType(
2525 const MemOp &Op, const AttributeList &FuncAttributes) const {
2526 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2527 if (Op.size() >= 16 &&
2528 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2529 // FIXME: Check if unaligned 64-byte accesses are slow.
2530 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2531 (Subtarget.getPreferVectorWidth() >= 512)) {
2532 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2533 }
2534 // FIXME: Check if unaligned 32-byte accesses are slow.
2535 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2536 (Subtarget.getPreferVectorWidth() >= 256)) {
2537 // Although this isn't a well-supported type for AVX1, we'll let
2538 // legalization and shuffle lowering produce the optimal codegen. If we
2539 // choose an optimal type with a vector element larger than a byte,
2540 // getMemsetStores() may create an intermediate splat (using an integer
2541 // multiply) before we splat as a vector.
2542 return MVT::v32i8;
2543 }
2544 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2545 return MVT::v16i8;
2546 // TODO: Can SSE1 handle a byte vector?
2547 // If we have SSE1 registers we should be able to use them.
2548 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2549 (Subtarget.getPreferVectorWidth() >= 128))
2550 return MVT::v4f32;
2551 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2552 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2553 // Do not use f64 to lower memcpy if source is string constant. It's
2554 // better to use i32 to avoid the loads.
2555 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2556 // The gymnastics of splatting a byte value into an XMM register and then
2557 // only using 8-byte stores (because this is a CPU with slow unaligned
2558 // 16-byte accesses) makes that a loser.
2559 return MVT::f64;
2560 }
2561 }
2562 // This is a compromise. If we reach here, unaligned accesses may be slow on
2563 // this target. However, creating smaller, aligned accesses could be even
2564 // slower and would certainly be a lot more code.
2565 if (Subtarget.is64Bit() && Op.size() >= 8)
2566 return MVT::i64;
2567 return MVT::i32;
2568}
2569
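Summarizing the cascade above as a sketch (assuming the function is not marked NoImplicitFloat and unaligned 16-byte access is fast; derived from the branches, not from the report):

//   size >= 64, AVX-512, prefer-vector-width >= 512 -> v64i8 (with BWI) or v16i32
//   size >= 32, AVX,     prefer-vector-width >= 256 -> v32i8
//   size >= 16, SSE2,    prefer-vector-width >= 128 -> v16i8
//   size >= 16, SSE1 only (64-bit or x87 available) -> v4f32
//   32-bit + SSE2, size >= 8, zero-memset or non-string memcpy -> f64
//   otherwise: i64 on 64-bit targets when size >= 8, else i32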
2570bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2571 if (VT == MVT::f32)
2572 return Subtarget.hasSSE1();
2573 if (VT == MVT::f64)
2574 return Subtarget.hasSSE2();
2575 return true;
2576}
2577
2578bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2579 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2580 bool *Fast) const {
2581 if (Fast) {
2582 switch (VT.getSizeInBits()) {
2583 default:
2584 // 8-byte and under are always assumed to be fast.
2585 *Fast = true;
2586 break;
2587 case 128:
2588 *Fast = !Subtarget.isUnalignedMem16Slow();
2589 break;
2590 case 256:
2591 *Fast = !Subtarget.isUnalignedMem32Slow();
2592 break;
2593 // TODO: What about AVX-512 (512-bit) accesses?
2594 }
2595 }
2596 // NonTemporal vector memory ops must be aligned.
2597 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2598    // NT loads can only be vector aligned, so if it's less aligned than the
2599    // minimum vector size (which we can split the vector down to), we might as
2600    // well use a regular unaligned vector load.
2601 // We don't have any NT loads pre-SSE41.
2602 if (!!(Flags & MachineMemOperand::MOLoad))
2603 return (Alignment < 16 || !Subtarget.hasSSE41());
2604 return false;
2605 }
2606 // Misaligned accesses of any size are always allowed.
2607 return true;
2608}
2609
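For the non-temporal special case, a couple of hypothetical queries may help (SSE4.1 assumed available; NT stands for MachineMemOperand::MONonTemporal and the Fast out-parameter is omitted):

//   allowsMisalignedMemoryAccesses(MVT::v8i32, 0, Align(32), NT | MOLoad)  -> false
//     // the NT load is already vector aligned, so keep it non-temporal
//   allowsMisalignedMemoryAccesses(MVT::v8i32, 0, Align(8),  NT | MOLoad)  -> true
//     // under-aligned NT load: fall back to a regular unaligned vector load
//   allowsMisalignedMemoryAccesses(MVT::v8i32, 0, Align(8),  NT | MOStore) -> false
//     // NT vector stores must always be aligned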
2610/// Return the entry encoding for a jump table in the
2611/// current function. The returned value is a member of the
2612/// MachineJumpTableInfo::JTEntryKind enum.
2613unsigned X86TargetLowering::getJumpTableEncoding() const {
2614 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2615 // symbol.
2616 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2617 return MachineJumpTableInfo::EK_Custom32;
2618
2619 // Otherwise, use the normal jump table encoding heuristics.
2620 return TargetLowering::getJumpTableEncoding();
2621}
2622
2623bool X86TargetLowering::useSoftFloat() const {
2624 return Subtarget.useSoftFloat();
2625}
2626
2627void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2628 ArgListTy &Args) const {
2629
2630 // Only relabel X86-32 for C / Stdcall CCs.
2631 if (Subtarget.is64Bit())
2632 return;
2633 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2634 return;
2635 unsigned ParamRegs = 0;
2636 if (auto *M = MF->getFunction().getParent())
2637 ParamRegs = M->getNumberRegisterParameters();
2638
2639  // Mark the first N integer arguments as being passed in registers.
2640 for (auto &Arg : Args) {
2641 Type *T = Arg.Ty;
2642 if (T->isIntOrPtrTy())
2643 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2644 unsigned numRegs = 1;
2645 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2646 numRegs = 2;
2647 if (ParamRegs < numRegs)
2648 return;
2649 ParamRegs -= numRegs;
2650 Arg.IsInReg = true;
2651 }
2652 }
2653}
2654
2655const MCExpr *
2656X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2657 const MachineBasicBlock *MBB,
2658 unsigned uid,MCContext &Ctx) const{
2659  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2660 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2661 // entries.
2662 return MCSymbolRefExpr::create(MBB->getSymbol(),
2663 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2664}
2665
2666/// Returns relocation base for the given PIC jumptable.
2667SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2668 SelectionDAG &DAG) const {
2669 if (!Subtarget.is64Bit())
2670 // This doesn't have SDLoc associated with it, but is not really the
2671 // same as a Register.
2672 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2673 getPointerTy(DAG.getDataLayout()));
2674 return Table;
2675}
2676
2677/// This returns the relocation base for the given PIC jumptable,
2678/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2679const MCExpr *X86TargetLowering::
2680getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2681 MCContext &Ctx) const {
2682 // X86-64 uses RIP relative addressing based on the jump table label.
2683 if (Subtarget.isPICStyleRIPRel())
2684 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2685
2686 // Otherwise, the reference is relative to the PIC base.
2687 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2688}
2689
2690std::pair<const TargetRegisterClass *, uint8_t>
2691X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2692 MVT VT) const {
2693 const TargetRegisterClass *RRC = nullptr;
2694 uint8_t Cost = 1;
2695 switch (VT.SimpleTy) {
2696 default:
2697 return TargetLowering::findRepresentativeClass(TRI, VT);
2698 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2699 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2700 break;
2701 case MVT::x86mmx:
2702 RRC = &X86::VR64RegClass;
2703 break;
2704 case MVT::f32: case MVT::f64:
2705 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2706 case MVT::v4f32: case MVT::v2f64:
2707 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2708 case MVT::v8f32: case MVT::v4f64:
2709 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2710 case MVT::v16f32: case MVT::v8f64:
2711 RRC = &X86::VR128XRegClass;
2712 break;
2713 }
2714 return std::make_pair(RRC, Cost);
2715}
2716
2717unsigned X86TargetLowering::getAddressSpace() const {
2718 if (Subtarget.is64Bit())
2719 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2720 return 256;
2721}
2722
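For readers unfamiliar with the numbering: in the X86 backend, address space 256 conventionally maps to %gs and 257 to %fs (X86AS::GS / X86AS::FS), so the choice above amounts to:

//   64-bit, default code model -> 257 (%fs)
//   64-bit, Kernel code model  -> 256 (%gs)
//   32-bit                     -> 256 (%gs)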
2723static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2724 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2725 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2726}
2727
2728static Constant* SegmentOffset(IRBuilderBase &IRB,
2729 int Offset, unsigned AddressSpace) {
2730 return ConstantExpr::getIntToPtr(
2731 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2732 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2733}
2734
2735Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2736 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2737 // tcbhead_t; use it instead of the usual global variable (see
2738 // sysdeps/{i386,x86_64}/nptl/tls.h)
2739 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2740 if (Subtarget.isTargetFuchsia()) {
2741 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2742 return SegmentOffset(IRB, 0x10, getAddressSpace());
2743 } else {
2744 unsigned AddressSpace = getAddressSpace();
2745 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2746      // In particular, some users may customize the base register and offset.
2747 int Offset = M->getStackProtectorGuardOffset();
2748      // If -stack-protector-guard-offset was not set, the default is
2749      // %fs:0x28, unless we're using a Kernel code model, in which case
2750      // it's %gs:0x28; it's %gs:0x14 on i386.
2751      if (Offset == INT_MAX)
2752 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2753
2754 StringRef GuardReg = M->getStackProtectorGuardReg();
2755 if (GuardReg == "fs")
2756 AddressSpace = X86AS::FS;
2757 else if (GuardReg == "gs")
2758 AddressSpace = X86AS::GS;
2759 return SegmentOffset(IRB, Offset, AddressSpace);
2760 }
2761 }
2762 return TargetLowering::getIRStackGuard(IRB);
2763}
2764
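As a rough sketch of what the returned constant looks like in IR (an assumed example for a glibc x86-64 target with the default offset; not taken from the report):

//   // SegmentOffset(IRB, 0x28, X86AS::FS) produces roughly:
//   //   inttoptr (i32 40 to i8* addrspace(257)*)
//   // which the stack-protector code then reads through a load such as:
//   //   %StackGuard = load i8*, i8* addrspace(257)* inttoptr (i32 40 to i8* addrspace(257)*)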
2765void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2766 // MSVC CRT provides functionalities for stack protection.
2767 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2768 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2769 // MSVC CRT has a global variable holding security cookie.
2770 M.getOrInsertGlobal("__security_cookie",
2771 Type::getInt8PtrTy(M.getContext()));
2772
2773 // MSVC CRT has a function to validate security cookie.
2774 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2775 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2776 Type::getInt8PtrTy(M.getContext()));
2777 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2778 F->setCallingConv(CallingConv::X86_FastCall);
2779 F->addParamAttr(0, Attribute::AttrKind::InReg);
2780 }
2781 return;
2782 }
2783
2784 StringRef GuardMode = M.getStackProtectorGuard();
2785
2786 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2787 if ((GuardMode == "tls" || GuardMode.empty()) &&
2788 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2789 return;
2790 TargetLowering::insertSSPDeclarations(M);
2791}
2792
2793Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2794 // MSVC CRT has a global variable holding security cookie.
2795 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2796 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2797 return M.getGlobalVariable("__security_cookie");
2798 }
2799 return TargetLowering::getSDagStackGuard(M);
2800}
2801
2802Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2803 // MSVC CRT has a function to validate security cookie.
2804 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2805 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2806 return M.getFunction("__security_check_cookie");
2807 }
2808 return TargetLowering::getSSPStackGuardCheck(M);
2809}
2810
2811Value *
2812X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2813 if (Subtarget.getTargetTriple().isOSContiki())
2814 return getDefaultSafeStackPointerLocation(IRB, false);
2815
2816 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2817 // definition of TLS_SLOT_SAFESTACK in
2818 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2819 if (Subtarget.isTargetAndroid()) {
2820    // %fs:0x48, unless we're using a Kernel code model, in which case it's
2821    // %gs:0x48. It's %gs:0x24 on i386.
2822 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2823 return SegmentOffset(IRB, Offset, getAddressSpace());
2824 }
2825
2826 // Fuchsia is similar.
2827 if (Subtarget.isTargetFuchsia()) {
2828 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2829 return SegmentOffset(IRB, 0x18, getAddressSpace());
2830 }
2831
2832 return TargetLowering::getSafeStackPointerLocation(IRB);
2833}
2834
2835//===----------------------------------------------------------------------===//
2836// Return Value Calling Convention Implementation
2837//===----------------------------------------------------------------------===//
2838
2839bool X86TargetLowering::CanLowerReturn(
2840 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2841 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2842 SmallVector<CCValAssign, 16> RVLocs;
2843 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2844 return CCInfo.CheckReturn(Outs, RetCC_X86);
2845}
2846
2847const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2848 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2849 return ScratchRegs;
2850}
2851
2852/// Lowers mask values (v*i1) to the local register values.
2853/// \returns the DAG node after lowering to the register type.
2854static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2855 const SDLoc &Dl, SelectionDAG &DAG) {
2856 EVT ValVT = ValArg.getValueType();
2857
2858 if (ValVT == MVT::v1i1)
2859 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2860 DAG.getIntPtrConstant(0, Dl));
2861
2862 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2863 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2864 // Two stage lowering might be required
2865 // bitcast: v8i1 -> i8 / v16i1 -> i16
2866 // anyextend: i8 -> i32 / i16 -> i32
2867 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2868 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2869 if (ValLoc == MVT::i32)
2870 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2871 return ValToCopy;
2872 }
2873
2874 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2875 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2876 // One stage lowering is required
2877 // bitcast: v32i1 -> i32 / v64i1 -> i64
2878 return DAG.getBitcast(ValLoc, ValArg);
2879 }
2880
2881 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2882}
2883
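The two- and one-stage paths above can be summarized with a small sketch (value type on the left, location type on the right; inferred from the code, not from the report):

//   v1i1  -> ValLoc : extract_vector_elt of element 0
//   v8i1  -> i32    : bitcast v8i1 -> i8,   then any_extend i8 -> i32
//   v16i1 -> i32    : bitcast v16i1 -> i16, then any_extend i16 -> i32
//   v32i1 -> i32    : single bitcast
//   v64i1 -> i64    : single bitcast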
2884/// Breaks v64i1 value into two registers and adds the new node to the DAG
2885static void Passv64i1ArgInRegs(
2886 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2887 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2888 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2889  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2890  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2891  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2892  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2893         "The value should reside in two registers");
2894
2895 // Before splitting the value we cast it to i64
2896 Arg = DAG.getBitcast(MVT::i64, Arg);
2897
2898 // Splitting the value into two i32 types
2899 SDValue Lo, Hi;
2900 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2901 DAG.getConstant(0, Dl, MVT::i32));
2902 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2903 DAG.getConstant(1, Dl, MVT::i32));
2904
2905 // Attach the two i32 types into corresponding registers
2906 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2907 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2908}
2909
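A rough picture of the split performed above (a sketch only): on a 32-bit AVX512BW target, a v64i1 value that must live in two GPRs is decomposed as

//   %v : v64i1  --bitcast-->  %x : i64
//   %lo = extract_element %x, 0 : i32   -> RegsToPass[VA.getLocReg()]
//   %hi = extract_element %x, 1 : i32   -> RegsToPass[NextVA.getLocReg()]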
2910SDValue
2911X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2912 bool isVarArg,
2913 const SmallVectorImpl<ISD::OutputArg> &Outs,
2914 const SmallVectorImpl<SDValue> &OutVals,
2915 const SDLoc &dl, SelectionDAG &DAG) const {
2916 MachineFunction &MF = DAG.getMachineFunction();
2917 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2918
2919 // In some cases we need to disable registers from the default CSR list.
2920 // For example, when they are used for argument passing.
2921 bool ShouldDisableCalleeSavedRegister =
2922 CallConv == CallingConv::X86_RegCall ||
2923 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2924
2925 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2926 report_fatal_error("X86 interrupts may not return any value");
2927
2928 SmallVector<CCValAssign, 16> RVLocs;
2929 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2930 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2931
2932 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2933 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2934 ++I, ++OutsIndex) {
2935 CCValAssign &VA = RVLocs[I];
2936    assert(VA.isRegLoc() && "Can only return in registers!");
2937
2938 // Add the register to the CalleeSaveDisableRegs list.
2939 if (ShouldDisableCalleeSavedRegister)
2940 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2941
2942 SDValue ValToCopy = OutVals[OutsIndex];
2943 EVT ValVT = ValToCopy.getValueType();
2944
2945 // Promote values to the appropriate types.
2946 if (VA.getLocInfo() == CCValAssign::SExt)
2947 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2948 else if (VA.getLocInfo() == CCValAssign::ZExt)
2949 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2950 else if (VA.getLocInfo() == CCValAssign::AExt) {
2951 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2952 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2953 else
2954 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2955 }
2956 else if (VA.getLocInfo() == CCValAssign::BCvt)
2957 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2958
2959    assert(VA.getLocInfo() != CCValAssign::FPExt &&
2960           "Unexpected FP-extend for return value.");
2961
2962 // Report an error if we have attempted to return a value via an XMM
2963 // register and SSE was disabled.
2964 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2965 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2966 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2967 } else if (!Subtarget.hasSSE2() &&
2968 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2969 ValVT == MVT::f64) {
2970 // When returning a double via an XMM register, report an error if SSE2 is
2971 // not enabled.
2972 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2973 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2974 }
2975
2976 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2977 // the RET instruction and handled by the FP Stackifier.
2978 if (VA.getLocReg() == X86::FP0 ||
2979 VA.getLocReg() == X86::FP1) {
2980 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2981 // change the value to the FP stack register class.
2982 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2983 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2984 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2985 // Don't emit a copytoreg.
2986 continue;
2987 }
2988
2989 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2990 // which is returned in RAX / RDX.
2991 if (Subtarget.is64Bit()) {
2992 if (ValVT == MVT::x86mmx) {
2993 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2994 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2995 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2996 ValToCopy);
2997 // If we don't have SSE2 available, convert to v4f32 so the generated
2998 // register is legal.
2999 if (!Subtarget.hasSSE2())
3000 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3001 }
3002 }
3003 }
3004
3005 if (VA.needsCustom()) {
3006      assert(VA.getValVT() == MVT::v64i1 &&
3007             "Currently the only custom case is when we split v64i1 to 2 regs");
3008
3009 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3010 Subtarget);
3011
3012 // Add the second register to the CalleeSaveDisableRegs list.
3013 if (ShouldDisableCalleeSavedRegister)
3014 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3015 } else {
3016 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3017 }
3018 }
3019
3020 SDValue Flag;
3021 SmallVector<SDValue, 6> RetOps;
3022 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3023 // Operand #1 = Bytes To Pop
3024 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3025 MVT::i32));
3026
3027 // Copy the result values into the output registers.
3028 for (auto &RetVal : RetVals) {
3029 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3030 RetOps.push_back(RetVal.second);
3031 continue; // Don't emit a copytoreg.
3032 }
3033
3034 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
3035 Flag = Chain.getValue(1);
3036 RetOps.push_back(
3037 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3038 }
3039
3040 // Swift calling convention does not require we copy the sret argument
3041 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3042
3043 // All x86 ABIs require that for returning structs by value we copy
3044 // the sret argument into %rax/%eax (depending on ABI) for the return.
3045 // We saved the argument into a virtual register in the entry block,
3046 // so now we copy the value out and into %rax/%eax.
3047 //
3048 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3049 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3050 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3051 // either case FuncInfo->setSRetReturnReg() will have been called.
3052 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3053 // When we have both sret and another return value, we should use the
3054 // original Chain stored in RetOps[0], instead of the current Chain updated
3055 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3056
3057 // For the case of sret and another return value, we have
3058 // Chain_0 at the function entry
3059 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3060 // If we use Chain_1 in getCopyFromReg, we will have
3061 // Val = getCopyFromReg(Chain_1)
3062 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3063
3064 // getCopyToReg(Chain_0) will be glued together with
3065 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3066 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3067 // Data dependency from Unit B to Unit A due to usage of Val in
3068 // getCopyToReg(Chain_1, Val)
3069 // Chain dependency from Unit A to Unit B
3070
3071 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3072 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3073 getPointerTy(MF.getDataLayout()));
3074
3075 Register RetValReg
3076 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3077 X86::RAX : X86::EAX;
3078 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3079 Flag = Chain.getValue(1);
3080
3081 // RAX/EAX now acts like a return value.
3082 RetOps.push_back(
3083 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3084
3085 // Add the returned register to the CalleeSaveDisableRegs list.
3086 if (ShouldDisableCalleeSavedRegister)
3087 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3088 }
3089
3090 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3091 const MCPhysReg *I =
3092 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3093 if (I) {
3094 for (; *I; ++I) {
3095 if (X86::GR64RegClass.contains(*I))
3096 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3097 else
3098        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3099 }
3100 }
3101
3102 RetOps[0] = Chain; // Update chain.
3103
3104 // Add the flag if we have it.
3105 if (Flag.getNode())
3106 RetOps.push_back(Flag);
3107
3108 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3109 if (CallConv == CallingConv::X86_INTR)
3110 opcode = X86ISD::IRET;
3111 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3112}
3113
3114bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3115 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3116 return false;
3117
3118 SDValue TCChain = Chain;
3119 SDNode *Copy = *N->use_begin();
3120 if (Copy->getOpcode() == ISD::CopyToReg) {
3121 // If the copy has a glue operand, we conservatively assume it isn't safe to
3122 // perform a tail call.
3123 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3124 return false;
3125 TCChain = Copy->getOperand(0);
3126 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3127 return false;
3128
3129 bool HasRet = false;
3130 for (const SDNode *U : Copy->uses()) {
3131 if (U->getOpcode() != X86ISD::RET_FLAG)
3132 return false;
3133    // If we are returning more than one value, we can definitely
3134    // not make a tail call; see PR19530.
3135 if (U->getNumOperands() > 4)
3136 return false;
3137 if (U->getNumOperands() == 4 &&
3138 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3139 return false;
3140 HasRet = true;
3141 }
3142
3143 if (!HasRet)
3144 return false;
3145
3146 Chain = TCChain;
3147 return true;
3148}
3149
3150EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3151 ISD::NodeType ExtendKind) const {
3152 MVT ReturnMVT = MVT::i32;
3153
3154 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3155 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3156 // The ABI does not require i1, i8 or i16 to be extended.
3157 //
3158 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3159 // always extending i8/i16 return values, so keep doing that for now.
3160 // (PR26665).
3161 ReturnMVT = MVT::i8;
3162 }
3163
3164 EVT MinVT = getRegisterType(Context, ReturnMVT);
3165 return VT.bitsLT(MinVT) ? MinVT : VT;
3166}
3167
3168/// Reads two 32-bit registers and creates a 64-bit mask value.
3169/// \param VA The current 32-bit value that needs to be assigned.
3170/// \param NextVA The next 32-bit value that needs to be assigned.
3171/// \param Root The parent DAG node.
3172/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
3173///                        for glue purposes. If the DAG is already using a
3174///                        physical register instead of a virtual one, we
3175///                        should glue our new SDValue to the InFlag SDValue.
3176/// \return a new 64-bit SDValue.
3177static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3178 SDValue &Root, SelectionDAG &DAG,
3179 const SDLoc &Dl, const X86Subtarget &Subtarget,
3180 SDValue *InFlag = nullptr) {
3181  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3182  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3183  assert(VA.getValVT() == MVT::v64i1 &&
3184         "Expecting first location of 64 bit width type");
3185  assert(NextVA.getValVT() == VA.getValVT() &&
3186         "The locations should have the same type");
3187  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3188         "The values should reside in two registers");
3189
3190 SDValue Lo, Hi;
3191 SDValue ArgValueLo, ArgValueHi;
3192
3193 MachineFunction &MF = DAG.getMachineFunction();
3194 const TargetRegisterClass *RC = &X86::GR32RegClass;
3195
3196 // Read a 32 bit value from the registers.
3197 if (nullptr == InFlag) {
3198 // When no physical register is present,
3199 // create an intermediate virtual register.
3200 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3201 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3202 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3203 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3204 } else {
3205 // When a physical register is available read the value from it and glue
3206 // the reads together.
3207 ArgValueLo =
3208 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3209 *InFlag = ArgValueLo.getValue(2);
3210 ArgValueHi =
3211 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3212 *InFlag = ArgValueHi.getValue(2);
3213 }
3214
3215 // Convert the i32 type into v32i1 type.
3216 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3217
3218 // Convert the i32 type into v32i1 type.
3219 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3220
3221 // Concatenate the two values together.
3222 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3223}
3224
3225/// The function will lower a register of various sizes (8/16/32/64)
3226/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
3227/// \returns a DAG node containing the operand after lowering to the mask type.
3228static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3229 const EVT &ValLoc, const SDLoc &Dl,
3230 SelectionDAG &DAG) {
3231 SDValue ValReturned = ValArg;
3232
3233 if (ValVT == MVT::v1i1)
3234 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3235
3236 if (ValVT == MVT::v64i1) {
3237    // On 32-bit machines, this case is handled by getv64i1Argument.
3238    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3239    // On 64-bit machines, there is no need to truncate the value, only bitcast it.
3240 } else {
3241 MVT maskLen;
3242 switch (ValVT.getSimpleVT().SimpleTy) {
3243 case MVT::v8i1:
3244 maskLen = MVT::i8;
3245 break;
3246 case MVT::v16i1:
3247 maskLen = MVT::i16;
3248 break;
3249 case MVT::v32i1:
3250 maskLen = MVT::i32;
3251 break;
3252 default:
3253      llvm_unreachable("Expecting a vector of i1 types");
3254 }
3255
3256 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3257 }
3258 return DAG.getBitcast(ValVT, ValReturned);
3259}
3260
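The inverse direction handled here can be sketched the same way (assumed examples, read off the switch above):

//   i32 location, ValVT = v16i1 : truncate i32 -> i16, then bitcast i16 -> v16i1
//   i32 location, ValVT = v32i1 : truncate is a no-op, then bitcast i32 -> v32i1
//   i64 location, ValVT = v64i1 : single bitcast (64-bit targets only; 32-bit
//                                 targets go through getv64i1Argument instead)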
3261/// Lower the result values of a call into the
3262/// appropriate copies out of appropriate physical registers.
3263///
3264SDValue X86TargetLowering::LowerCallResult(
3265 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3266 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3267 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3268 uint32_t *RegMask) const {
3269
3270 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3271 // Assign locations to each value returned by this call.
3272 SmallVector<CCValAssign, 16> RVLocs;
3273 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3274 *DAG.getContext());
3275 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3276
3277 // Copy all of the result registers out of their specified physreg.
3278 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3279 ++I, ++InsIndex) {
3280 CCValAssign &VA = RVLocs[I];
3281 EVT CopyVT = VA.getLocVT();
3282
3283 // In some calling conventions we need to remove the used registers
3284 // from the register mask.
3285 if (RegMask) {
3286 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3287 SubRegs.isValid(); ++SubRegs)
3288 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3289 }
3290
3291 // Report an error if there was an attempt to return FP values via XMM
3292 // registers.
3293 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3294 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3295 if (VA.getLocReg() == X86::XMM1)
3296 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3297 else
3298 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3299 } else if (!Subtarget.hasSSE2() &&
3300 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3301 CopyVT == MVT::f64) {
3302 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3303 if (VA.getLocReg() == X86::XMM1)
3304 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3305 else
3306 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3307 }
3308
3309 // If we prefer to use the value in xmm registers, copy it out as f80 and
3310 // use a truncate to move it from fp stack reg to xmm reg.
3311 bool RoundAfterCopy = false;
3312 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3313 isScalarFPTypeInSSEReg(VA.getValVT())) {
3314 if (!Subtarget.hasX87())
3315 report_fatal_error("X87 register return with X87 disabled");
3316 CopyVT = MVT::f80;
3317 RoundAfterCopy = (CopyVT != VA.getLocVT());
3318 }
3319
3320 SDValue Val;
3321 if (VA.needsCustom()) {
3322      assert(VA.getValVT() == MVT::v64i1 &&
3323             "Currently the only custom case is when we split v64i1 to 2 regs");
3324 Val =
3325 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3326 } else {
3327 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3328 .getValue(1);
3329 Val = Chain.getValue(0);
3330 InFlag = Chain.getValue(2);
3331 }
3332
3333 if (RoundAfterCopy)
3334 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3335 // This truncation won't change the value.
3336 DAG.getIntPtrConstant(1, dl));
3337
3338 if (VA.isExtInLoc()) {
3339 if (VA.getValVT().isVector() &&
3340 VA.getValVT().getScalarType() == MVT::i1 &&
3341 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3342 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3343 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3344 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3345 } else
3346 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3347 }
3348
3349 if (VA.getLocInfo() == CCValAssign::BCvt)
3350 Val = DAG.getBitcast(VA.getValVT(), Val);
3351
3352 InVals.push_back(Val);
3353 }
3354
3355 return Chain;
3356}
3357
3358//===----------------------------------------------------------------------===//
3359// C & StdCall & Fast Calling Convention implementation
3360//===----------------------------------------------------------------------===//
3361// The StdCall calling convention is the standard for many Windows API
3362// routines. It differs from the C calling convention only a little: the
3363// callee cleans up the stack instead of the caller, and symbols are
3364// decorated in a particular way. It doesn't support any vector arguments.
3365// For info on the fast calling convention, see the Fast Calling Convention
3366// (tail call) implementation in LowerX86_32FastCCCallTo.
3367
3368/// Determines whether Args, either a set of outgoing arguments to a call, or a
3369/// set of incoming args of a call, contains an sret pointer that the callee
3370/// pops
3371template <typename T>
3372static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3373 const X86Subtarget &Subtarget) {
3374 // Not C++20 (yet), so no concepts available.
3375 static_assert(std::is_same<T, ISD::OutputArg>::value ||
3376 std::is_same<T, ISD::InputArg>::value,
3377 "requires ISD::OutputArg or ISD::InputArg");
3378
3379 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3380 // for most compilations.
3381 if (!Subtarget.is32Bit())
3382 return false;
3383
3384 if (Args.empty())
3385 return false;
3386
3387 // Most calls do not have an sret argument, check the arg next.
3388 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3389 if (!Flags.isSRet() || Flags.isInReg())
3390 return false;
3391
3392  // The MSVC ABI does not pop the sret.
3393 if (Subtarget.getTargetTriple().isOSMSVCRT())
3394 return false;
3395
3396 // MCUs don't pop the sret
3397 if (Subtarget.isTargetMCU())
3398 return false;
3399
3400 // Callee pops argument
3401 return true;
3402}
3403
3404/// Make a copy of an aggregate at address specified by "Src" to address
3405/// "Dst" with size and alignment information specified by the specific
3406/// parameter attribute. The copy will be passed as a byval function parameter.
3407static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3408 SDValue Chain, ISD::ArgFlagsTy Flags,
3409 SelectionDAG &DAG, const SDLoc &dl) {
3410 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3411
3412 return DAG.getMemcpy(
3413 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3414 /*isVolatile*/ false, /*AlwaysInline=*/true,
3415 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3416}
3417
3418/// Return true if the calling convention is one that we can guarantee TCO for.
3419static bool canGuaranteeTCO(CallingConv::ID CC) {
3420 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3421 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3422 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3423 CC == CallingConv::SwiftTail);
3424}
3425
3426/// Return true if we might ever do TCO for calls with this calling convention.
3427static bool mayTailCallThisCC(CallingConv::ID CC) {
3428 switch (CC) {
3429 // C calling conventions:
3430 case CallingConv::C:
3431 case CallingConv::Win64:
3432 case CallingConv::X86_64_SysV:
3433 // Callee pop conventions:
3434 case CallingConv::X86_ThisCall:
3435 case CallingConv::X86_StdCall:
3436 case CallingConv::X86_VectorCall:
3437 case CallingConv::X86_FastCall:
3438 // Swift:
3439 case CallingConv::Swift:
3440 return true;
3441 default:
3442 return canGuaranteeTCO(CC);
3443 }
3444}
3445
3446/// Return true if the function is being made into a tailcall target by
3447/// changing its ABI.
3448static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3449 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3450 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3451}
3452
3453bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3454 if (!CI->isTailCall())
3455 return false;
3456
3457 CallingConv::ID CalleeCC = CI->getCallingConv();
3458 if (!mayTailCallThisCC(CalleeCC))
3459 return false;
3460
3461 return true;
3462}
3463
3464SDValue
3465X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3466 const SmallVectorImpl<ISD::InputArg> &Ins,
3467 const SDLoc &dl, SelectionDAG &DAG,
3468 const CCValAssign &VA,
3469 MachineFrameInfo &MFI, unsigned i) const {
3470 // Create the nodes corresponding to a load from this parameter slot.
3471 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3472 bool AlwaysUseMutable = shouldGuaranteeTCO(
3473 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3474 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3475 EVT ValVT;
3476 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3477
3478  // If the value is passed by pointer, we have the address passed instead of
3479  // the value itself. No need to extend if the mask value and location share
3480  // the same absolute size.
3481 bool ExtendedInMem =
3482 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3483 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3484
3485 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3486 ValVT = VA.getLocVT();
3487 else
3488 ValVT = VA.getValVT();
3489
3490  // FIXME: For now, all byval parameter objects are marked mutable. This can be
3491  // changed with more analysis.
3492  // In case of tail call optimization, mark all arguments mutable, since they
3493  // could be overwritten by the lowering of arguments in case of a tail call.
3494 if (Flags.isByVal()) {
3495 unsigned Bytes = Flags.getByValSize();
3496 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3497
3498 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3499 // can be improved with deeper analysis.
3500 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3501 /*isAliased=*/true);
3502 return DAG.getFrameIndex(FI, PtrVT);
3503 }
3504
3505 EVT ArgVT = Ins[i].ArgVT;
3506
3507 // If this is a vector that has been split into multiple parts, and the
3508  // scalar size of the parts doesn't match the vector element size, then we can't
3509 // elide the copy. The parts will have padding between them instead of being
3510 // packed like a vector.
3511 bool ScalarizedAndExtendedVector =
3512 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3513 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3514
3515 // This is an argument in memory. We might be able to perform copy elision.
3516 // If the argument is passed directly in memory without any extension, then we
3517 // can perform copy elision. Large vector types, for example, may be passed
3518 // indirectly by pointer.
3519 if (Flags.isCopyElisionCandidate() &&
3520 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3521 !ScalarizedAndExtendedVector) {
3522 SDValue PartAddr;
3523 if (Ins[i].PartOffset == 0) {
3524 // If this is a one-part value or the first part of a multi-part value,
3525 // create a stack object for the entire argument value type and return a
3526 // load from our portion of it. This assumes that if the first part of an
3527 // argument is in memory, the rest will also be in memory.
3528 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3529 /*IsImmutable=*/false);
3530 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3531 return DAG.getLoad(
3532 ValVT, dl, Chain, PartAddr,
3533 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3534 } else {
3535 // This is not the first piece of an argument in memory. See if there is
3536 // already a fixed stack object including this offset. If so, assume it
3537 // was created by the PartOffset == 0 branch above and create a load from
3538 // the appropriate offset into it.
3539 int64_t PartBegin = VA.getLocMemOffset();
3540 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3541 int FI = MFI.getObjectIndexBegin();
3542 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3543 int64_t ObjBegin = MFI.getObjectOffset(FI);
3544 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3545 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3546 break;
3547 }
3548 if (MFI.isFixedObjectIndex(FI)) {
3549 SDValue Addr =
3550 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3551 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3552 return DAG.getLoad(
3553 ValVT, dl, Chain, Addr,
3554 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3555 Ins[i].PartOffset));
3556 }
3557 }
3558 }
3559
3560 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3561 VA.getLocMemOffset(), isImmutable);
3562
3563 // Set SExt or ZExt flag.
3564 if (VA.getLocInfo() == CCValAssign::ZExt) {
3565 MFI.setObjectZExt(FI, true);
3566 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3567 MFI.setObjectSExt(FI, true);
3568 }
3569
3570 MaybeAlign Alignment;
3571 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3572 ValVT != MVT::f80)
3573 Alignment = MaybeAlign(4);
3574 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3575 SDValue Val = DAG.getLoad(
3576 ValVT, dl, Chain, FIN,
3577 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3578 Alignment);
3579 return ExtendedInMem
3580 ? (VA.getValVT().isVector()
3581 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3582 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3583 : Val;
3584}
3585
3586// FIXME: Get this from tablegen.
3587static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3588 const X86Subtarget &Subtarget) {
3589  assert(Subtarget.is64Bit());
3590
3591 if (Subtarget.isCallingConvWin64(CallConv)) {
3592 static const MCPhysReg GPR64ArgRegsWin64[] = {
3593 X86::RCX, X86::RDX, X86::R8, X86::R9
3594 };
3595 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3596 }
3597
3598 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3599 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3600 };
3601 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3602}
3603
3604// FIXME: Get this from tablegen.
3605static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3606 CallingConv::ID CallConv,
3607 const X86Subtarget &Subtarget) {
3608  assert(Subtarget.is64Bit());
3609 if (Subtarget.isCallingConvWin64(CallConv)) {
3610 // The XMM registers which might contain var arg parameters are shadowed
3611 // in their paired GPR. So we only need to save the GPR to their home
3612 // slots.
3613 // TODO: __vectorcall will change this.
3614 return None;
3615 }
3616
3617 bool isSoftFloat = Subtarget.useSoftFloat();
3618 if (isSoftFloat || !Subtarget.hasSSE1())
3619 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3620 // registers.
3621 return None;
3622
3623 static const MCPhysReg XMMArgRegs64Bit[] = {
3624 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3625 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3626 };
3627 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3628}
3629
3630#ifndef NDEBUG
3631static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3632 return llvm::is_sorted(
3633 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3634 return A.getValNo() < B.getValNo();
3635 });
3636}
3637#endif
3638
3639namespace {
3640/// This is a helper class for lowering variable arguments parameters.
3641class VarArgsLoweringHelper {
3642public:
3643 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3644 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3645 CallingConv::ID CallConv, CCState &CCInfo)
3646 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3647 TheMachineFunction(DAG.getMachineFunction()),
3648 TheFunction(TheMachineFunction.getFunction()),
3649 FrameInfo(TheMachineFunction.getFrameInfo()),
3650 FrameLowering(*Subtarget.getFrameLowering()),
3651 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3652 CCInfo(CCInfo) {}
3653
3654 // Lower variable arguments parameters.
3655 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3656
3657private:
3658 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3659
3660 void forwardMustTailParameters(SDValue &Chain);
3661
3662 bool is64Bit() const { return Subtarget.is64Bit(); }
3663 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3664
3665 X86MachineFunctionInfo *FuncInfo;
3666 const SDLoc &DL;
3667 SelectionDAG &DAG;
3668 const X86Subtarget &Subtarget;
3669 MachineFunction &TheMachineFunction;
3670 const Function &TheFunction;
3671 MachineFrameInfo &FrameInfo;
3672 const TargetFrameLowering &FrameLowering;
3673 const TargetLowering &TargLowering;
3674 CallingConv::ID CallConv;
3675 CCState &CCInfo;
3676};
3677} // namespace
3678
3679void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3680 SDValue &Chain, unsigned StackSize) {
3681  // If the function takes a variable number of arguments, make a frame index for
3682 // the start of the first vararg value... for expansion of llvm.va_start. We
3683 // can skip this if there are no va_start calls.
3684 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3685 CallConv != CallingConv::X86_ThisCall)) {
3686 FuncInfo->setVarArgsFrameIndex(
3687 FrameInfo.CreateFixedObject(1, StackSize, true));
3688 }
3689
3690 // 64-bit calling conventions support varargs and register parameters, so we
3691 // have to do extra work to spill them in the prologue.
3692 if (is64Bit()) {
3693 // Find the first unallocated argument registers.
3694 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3695 ArrayRef<MCPhysReg> ArgXMMs =
3696 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3697 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3698 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3699
3700    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3701           "SSE register cannot be used when SSE is disabled!");
3702
3703 if (isWin64()) {
3704 // Get to the caller-allocated home save location. Add 8 to account
3705 // for the return address.
3706 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3707 FuncInfo->setRegSaveFrameIndex(
3708 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3709      // Fix up the vararg frame index to point into the shadow area (4 x i64).
3710 if (NumIntRegs < 4)
3711 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3712 } else {
3713 // For X86-64, if there are vararg parameters that are passed via
3714 // registers, then we must store them to their spots on the stack so
3715 // they may be loaded by dereferencing the result of va_next.
3716 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3717 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3718 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3719 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3720 }
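    // For example, with no fixed integer or vector arguments (NumIntRegs ==
    // NumXMMRegs == 0) this sets gp_offset = 0, fp_offset = 6 * 8 = 48, and
    // allocates a 6 * 8 + 8 * 16 = 176-byte register save area, matching the
    // va_list layout described in the x86-64 psABI.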
3721
3722 SmallVector<SDValue, 6>
3723 LiveGPRs; // list of SDValue for GPR registers keeping live input value
3724 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
3725 // keeping live input value
3726 SDValue ALVal; // if applicable keeps SDValue for %al register
3727
3728 // Gather all the live in physical registers.
3729 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3730 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3731 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3732 }
3733 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3734 if (!AvailableXmms.empty()) {
3735 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3736 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3737 for (MCPhysReg Reg : AvailableXmms) {
3738        // FastRegisterAllocator spills virtual registers at basic
3739        // block boundaries. That leads to uses of XMM registers
3740        // outside of the check for %al. Pass physical registers to
3741        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3742 TheMachineFunction.getRegInfo().addLiveIn(Reg);
3743 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3744 }
3745 }
3746
3747 // Store the integer parameter registers.
3748 SmallVector<SDValue, 8> MemOps;
3749 SDValue RSFIN =
3750 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3751 TargLowering.getPointerTy(DAG.getDataLayout()));
3752 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3753 for (SDValue Val : LiveGPRs) {
3754 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3755 TargLowering.getPointerTy(DAG.getDataLayout()),
3756 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3757 SDValue Store =
3758 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3759 MachinePointerInfo::getFixedStack(
3760 DAG.getMachineFunction(),
3761 FuncInfo->getRegSaveFrameIndex(), Offset));
3762 MemOps.push_back(Store);
3763 Offset += 8;
3764 }
3765
3766 // Now store the XMM (fp + vector) parameter registers.
3767 if (!LiveXMMRegs.empty()) {
3768 SmallVector<SDValue, 12> SaveXMMOps;
3769 SaveXMMOps.push_back(Chain);
3770 SaveXMMOps.push_back(ALVal);
3771 SaveXMMOps.push_back(RSFIN);
3772 SaveXMMOps.push_back(
3773 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3774 llvm::append_range(SaveXMMOps, LiveXMMRegs);
3775 MachineMemOperand *StoreMMO =
3776 DAG.getMachineFunction().getMachineMemOperand(
3777 MachinePointerInfo::getFixedStack(
3778 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
3779 Offset),
3780 MachineMemOperand::MOStore, 128, Align(16));
3781 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
3782 DL, DAG.getVTList(MVT::Other),
3783 SaveXMMOps, MVT::i8, StoreMMO));
3784 }
3785
3786 if (!MemOps.empty())
3787 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3788 }
3789}
3790
3791void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3792 // Find the largest legal vector type.
3793 MVT VecVT = MVT::Other;
3794 // FIXME: Only some x86_32 calling conventions support AVX512.
3795 if (Subtarget.useAVX512Regs() &&
3796 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3797 CallConv == CallingConv::Intel_OCL_BI)))
3798 VecVT = MVT::v16f32;
3799 else if (Subtarget.hasAVX())
3800 VecVT = MVT::v8f32;
3801 else if (Subtarget.hasSSE2())
3802 VecVT = MVT::v4f32;
3803
3804 // We forward some GPRs and some vector types.
3805 SmallVector<MVT, 2> RegParmTypes;
3806 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3807 RegParmTypes.push_back(IntVT);
3808 if (VecVT != MVT::Other)
3809 RegParmTypes.push_back(VecVT);
3810
3811 // Compute the set of forwarded registers. The rest are scratch.
3812 SmallVectorImpl<ForwardedRegister> &Forwards =
3813 FuncInfo->getForwardedMustTailRegParms();
3814 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3815
3816 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3817 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3818 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3819 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3820 }
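  // At this point Forwards holds every register that could carry a variadic
  // argument (the integer argument GPRs, the widest legal vector registers,
  // and AL on SysV targets); a musttail call site must reload all of them
  // because the callee's variadic arguments are unknown here.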
3821
3822 // Copy all forwards from physical to virtual registers.
3823 for (ForwardedRegister &FR : Forwards) {
3824 // FIXME: Can we use a less constrained schedule?
3825 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3826 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3827 TargLowering.getRegClassFor(FR.VT));
3828 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3829 }
3830}
3831
3832void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3833 unsigned StackSize) {
3834  // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3835  // If necessary, it will be set to the correct value later.
3836 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3837 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3838
3839 if (FrameInfo.hasVAStart())
3840 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3841
3842 if (FrameInfo.hasMustTailInVarArgFunc())
3843 forwardMustTailParameters(Chain);
3844}
3845
3846SDValue X86TargetLowering::LowerFormalArguments(
3847 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3848 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3849 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3850 MachineFunction &MF = DAG.getMachineFunction();
3851 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3852
3853 const Function &F = MF.getFunction();
3854 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3855 F.getName() == "main")
3856 FuncInfo->setForceFramePointer(true);
3857
3858 MachineFrameInfo &MFI = MF.getFrameInfo();
3859 bool Is64Bit = Subtarget.is64Bit();
3860 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3861
3862  assert(
3863      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3864      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3865
3866 // Assign locations to all of the incoming arguments.
3867 SmallVector<CCValAssign, 16> ArgLocs;
3868 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3869
3870 // Allocate shadow area for Win64.
3871 if (IsWin64)
3872 CCInfo.AllocateStack(32, Align(8));
3873
3874 CCInfo.AnalyzeArguments(Ins, CC_X86);
3875
3876 // In vectorcall calling convention a second pass is required for the HVA
3877 // types.
3878 if (CallingConv::X86_VectorCall == CallConv) {
3879 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3880 }
3881
3882  // The next loop assumes that the locations are in the same order as the
3883 // input arguments.
3884  assert(isSortedByValueNo(ArgLocs) &&
3885         "Argument Location list must be sorted before lowering");
3886
3887 SDValue ArgValue;
3888 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3889 ++I, ++InsIndex) {
3890    assert(InsIndex < Ins.size() && "Invalid Ins index");
3891 CCValAssign &VA = ArgLocs[I];
3892
3893 if (VA.isRegLoc()) {
3894 EVT RegVT = VA.getLocVT();
3895 if (VA.needsCustom()) {
3896        assert(
3897            VA.getValVT() == MVT::v64i1 &&
3898            "Currently the only custom case is when we split v64i1 to 2 regs");
3899
3900 // v64i1 values, in regcall calling convention, that are
3901 // compiled to 32 bit arch, are split up into two registers.
3902 ArgValue =
3903 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3904 } else {
3905 const TargetRegisterClass *RC;
3906 if (RegVT == MVT::i8)
3907 RC = &X86::GR8RegClass;
3908 else if (RegVT == MVT::i16)
3909 RC = &X86::GR16RegClass;
3910 else if (RegVT == MVT::i32)
3911 RC = &X86::GR32RegClass;
3912 else if (Is64Bit && RegVT == MVT::i64)
3913 RC = &X86::GR64RegClass;
3914 else if (RegVT == MVT::f16)
3915 RC = &X86::FR16XRegClass;
3916 else if (RegVT == MVT::f32)
3917 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3918 else if (RegVT == MVT::f64)
3919 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3920 else if (RegVT == MVT::f80)
3921 RC = &X86::RFP80RegClass;
3922 else if (RegVT == MVT::f128)
3923 RC = &X86::VR128RegClass;
3924 else if (RegVT.is512BitVector())
3925 RC = &X86::VR512RegClass;
3926 else if (RegVT.is256BitVector())
3927 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3928 else if (RegVT.is128BitVector())
3929 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3930 else if (RegVT == MVT::x86mmx)
3931 RC = &X86::VR64RegClass;
3932 else if (RegVT == MVT::v1i1)
3933 RC = &X86::VK1RegClass;
3934 else if (RegVT == MVT::v8i1)
3935 RC = &X86::VK8RegClass;
3936 else if (RegVT == MVT::v16i1)
3937 RC = &X86::VK16RegClass;
3938 else if (RegVT == MVT::v32i1)
3939 RC = &X86::VK32RegClass;
3940 else if (RegVT == MVT::v64i1)
3941 RC = &X86::VK64RegClass;
3942 else
3943        llvm_unreachable("Unknown argument type!");
3944
3945 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3946 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3947 }
3948
3949 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3950 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3951 // right size.
3952 if (VA.getLocInfo() == CCValAssign::SExt)
3953 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3954 DAG.getValueType(VA.getValVT()));
3955 else if (VA.getLocInfo() == CCValAssign::ZExt)
3956 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3957 DAG.getValueType(VA.getValVT()));
3958 else if (VA.getLocInfo() == CCValAssign::BCvt)
3959 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
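      // For example, an i8 argument that the calling convention zero-extends
      // arrives in a 32-bit register; the AssertZext above records that the
      // upper 24 bits are already zero, so the truncate back to i8 performed
      // below for isExtInLoc locations is effectively free.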
3960
3961 if (VA.isExtInLoc()) {
3962 // Handle MMX values passed in XMM regs.
3963 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3964 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3965 else if (VA.getValVT().isVector() &&
3966 VA.getValVT().getScalarType() == MVT::i1 &&
3967 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3968 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3969 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3970 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3971 } else
3972 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3973 }
3974 } else {
3975      assert(VA.isMemLoc());
3976 ArgValue =
3977 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3978 }
3979
3980 // If value is passed via pointer - do a load.
3981 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3982 ArgValue =
3983 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3984
3985 InVals.push_back(ArgValue);
3986 }
3987
3988 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3989 if (Ins[I].Flags.isSwiftAsync()) {
3990 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3991 if (Subtarget.is64Bit())
3992 X86FI->setHasSwiftAsyncContext(true);
3993 else {
3994 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3995 X86FI->setSwiftAsyncContextFrameIdx(FI);
3996 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3997 DAG.getFrameIndex(FI, MVT::i32),
3998 MachinePointerInfo::getFixedStack(MF, FI));
3999 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4000 }
4001 }
4002
4003 // Swift calling convention does not require we copy the sret argument
4004 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4005 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4006 continue;
4007
4008 // All x86 ABIs require that for returning structs by value we copy the
4009 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4010 // the argument into a virtual register so that we can access it from the
4011 // return points.
4012 if (Ins[I].Flags.isSRet()) {
4013      assert(!FuncInfo->getSRetReturnReg() &&
4014             "SRet return has already been set");
4015 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4016 Register Reg =
4017 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4018 FuncInfo->setSRetReturnReg(Reg);
4019 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4020 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4021 break;
4022 }
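    // For example, for a function returning "struct S" via an sret pointer,
    // the return blocks copy this virtual register back into %eax/%rax so the
    // caller can find the returned object.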
4023 }
4024
4025 unsigned StackSize = CCInfo.getNextStackOffset();
4026 // Align stack specially for tail calls.
4027 if (shouldGuaranteeTCO(CallConv,
4028 MF.getTarget().Options.GuaranteedTailCallOpt))
4029 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4030
4031 if (IsVarArg)
4032 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4033 .lowerVarArgsParameters(Chain, StackSize);
4034
4035 // Some CCs need callee pop.
4036 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4037 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4038 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4039 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4040 // X86 interrupts must pop the error code (and the alignment padding) if
4041 // present.
4042 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4043 } else {
4044 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4045 // If this is an sret function, the return should pop the hidden pointer.
4046 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4047 FuncInfo->setBytesToPopOnReturn(4);
4048 }
4049
4050 if (!Is64Bit) {
4051 // RegSaveFrameIndex is X86-64 only.
4052 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4053 }
4054
4055 FuncInfo->setArgumentStackSize(StackSize);
4056
4057 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4058 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4059 if (Personality == EHPersonality::CoreCLR) {
4060      assert(Is64Bit);
4061 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4062 // that we'd prefer this slot be allocated towards the bottom of the frame
4063 // (i.e. near the stack pointer after allocating the frame). Every
4064 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4065 // offset from the bottom of this and each funclet's frame must be the
4066 // same, so the size of funclets' (mostly empty) frames is dictated by
4067 // how far this slot is from the bottom (since they allocate just enough
4068 // space to accommodate holding this slot at the correct offset).
4069 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4070 EHInfo->PSPSymFrameIdx = PSPSymFI;
4071 }
4072 }
4073
4074 if (CallConv == CallingConv::X86_RegCall ||
4075 F.hasFnAttribute("no_caller_saved_registers")) {
4076 MachineRegisterInfo &MRI = MF.getRegInfo();
4077 for (std::pair<Register, Register> Pair : MRI.liveins())
4078 MRI.disableCalleeSavedRegister(Pair.first);
4079 }
4080
4081 return Chain;
4082}
4083
4084SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4085 SDValue Arg, const SDLoc &dl,
4086 SelectionDAG &DAG,
4087 const CCValAssign &VA,
4088 ISD::ArgFlagsTy Flags,
4089 bool isByVal) const {
4090 unsigned LocMemOffset = VA.getLocMemOffset();
4091 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4092 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4093 StackPtr, PtrOff);
4094 if (isByVal)
4095 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4096
4097 MaybeAlign Alignment;
4098 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4099 Arg.getSimpleValueType() != MVT::f80)
4100 Alignment = MaybeAlign(4);
4101 return DAG.getStore(
4102 Chain, dl, Arg, PtrOff,
4103 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4104 Alignment);
4105}
4106
4107/// Emit a load of the return address if tail call
4108/// optimization is performed and it is required.
4109SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4110 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4111 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4112 // Adjust the Return address stack slot.
4113 EVT VT = getPointerTy(DAG.getDataLayout());
4114 OutRetAddr = getReturnAddressFrameIndex(DAG);
4115
4116 // Load the "old" Return address.
4117 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4118 return SDValue(OutRetAddr.getNode(), 1);
4119}
4120
4121/// Emit a store of the return address if tail call
4122/// optimization is performed and it is required (FPDiff!=0).
4123static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4124 SDValue Chain, SDValue RetAddrFrIdx,
4125 EVT PtrVT, unsigned SlotSize,
4126 int FPDiff, const SDLoc &dl) {
4127 // Store the return address to the appropriate stack slot.
4128 if (!FPDiff) return Chain;
4129 // Calculate the new stack slot for the return address.
4130 int NewReturnAddrFI =
4131 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4132 false);
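  // A negative FPDiff means the tail callee needs more argument space than the
  // caller reserved, so the return address is re-stored |FPDiff| bytes lower,
  // just below the enlarged argument area, at offset FPDiff - SlotSize.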
4133 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4134 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4135 MachinePointerInfo::getFixedStack(
4136 DAG.getMachineFunction(), NewReturnAddrFI));
4137 return Chain;
4138}
4139
4140/// Returns a vector_shuffle mask for a movs{s|d} or movd
4141/// operation of the specified width.
4142static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4143 SDValue V2) {
4144 unsigned NumElems = VT.getVectorNumElements();
4145 SmallVector<int, 8> Mask;
4146 Mask.push_back(NumElems);
4147 for (unsigned i = 1; i != NumElems; ++i)
4148 Mask.push_back(i);
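  // E.g. for v4f32 this builds the mask {4, 1, 2, 3}: element 0 is taken from
  // V2 and elements 1-3 from V1, matching the movss/movsd merge semantics.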
4149 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4150}
4151
4152SDValue
4153X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4154 SmallVectorImpl<SDValue> &InVals) const {
4155 SelectionDAG &DAG = CLI.DAG;
4156 SDLoc &dl = CLI.DL;
4157 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4158 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4159 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4160 SDValue Chain = CLI.Chain;
4161 SDValue Callee = CLI.Callee;
4162 CallingConv::ID CallConv = CLI.CallConv;
4163 bool &isTailCall = CLI.IsTailCall;
4164 bool isVarArg = CLI.IsVarArg;
4165 const auto *CB = CLI.CB;
4166
4167 MachineFunction &MF = DAG.getMachineFunction();
4168 bool Is64Bit = Subtarget.is64Bit();
4169 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4170 bool IsSibcall = false;
4171 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4172 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4173 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4174 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4175 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4176 CB->hasFnAttr("no_caller_saved_registers"));
4177 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4178 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4179 const Module *M = MF.getMMI().getModule();
4180 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4181
4182 MachineFunction::CallSiteInfo CSInfo;
4183 if (CallConv == CallingConv::X86_INTR)
4184 report_fatal_error("X86 interrupts may not be called directly");
4185
4186 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4187 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4188 // If we are using a GOT, disable tail calls to external symbols with
4189 // default visibility. Tail calling such a symbol requires using a GOT
4190 // relocation, which forces early binding of the symbol. This breaks code
4191    // that requires lazy function symbol resolution. Using musttail or
4192 // GuaranteedTailCallOpt will override this.
4193 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4194 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4195 G->getGlobal()->hasDefaultVisibility()))
4196 isTailCall = false;
4197 }
4198
4199 if (isTailCall && !IsMustTail) {
4200 // Check if it's really possible to do a tail call.
4201 isTailCall = IsEligibleForTailCallOptimization(
4202 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4203 Ins, DAG);
4204
4205 // Sibcalls are automatically detected tailcalls which do not require
4206 // ABI changes.
4207 if (!IsGuaranteeTCO && isTailCall)
4208 IsSibcall = true;
4209
4210 if (isTailCall)
4211 ++NumTailCalls;
4212 }
4213
4214 if (IsMustTail && !isTailCall)
4215 report_fatal_error("failed to perform tail call elimination on a call "
4216 "site marked musttail");
4217
4218  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4219         "Var args not supported with calling convention fastcc, ghc or hipe");
4220
4221 // Analyze operands of the call, assigning locations to each operand.
4222 SmallVector<CCValAssign, 16> ArgLocs;
4223 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4224
4225 // Allocate shadow area for Win64.
4226 if (IsWin64)
4227 CCInfo.AllocateStack(32, Align(8));
4228
4229 CCInfo.AnalyzeArguments(Outs, CC_X86);
4230
4231 // In vectorcall calling convention a second pass is required for the HVA
4232 // types.
4233 if (CallingConv::X86_VectorCall == CallConv) {
4234 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4235 }
4236
4237 // Get a count of how many bytes are to be pushed on the stack.
4238 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4239 if (IsSibcall)
4240    // This is a sibcall. The memory operands are already available in the
4241    // caller's incoming argument area, which was set up by its own caller.
4242 NumBytes = 0;
4243 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4244 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4245
4246 int FPDiff = 0;
4247 if (isTailCall &&
4248 shouldGuaranteeTCO(CallConv,
4249 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4250 // Lower arguments at fp - stackoffset + fpdiff.
4251 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4252
4253 FPDiff = NumBytesCallerPushed - NumBytes;
4254
4255 // Set the delta of movement of the returnaddr stackslot.
4256 // But only set if delta is greater than previous delta.
4257 if (FPDiff < X86Info->getTCReturnAddrDelta())
4258 X86Info->setTCReturnAddrDelta(FPDiff);
4259 }
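  // Example: if the caller's incoming argument area is 16 bytes but this tail
  // callee needs 32 bytes of arguments, FPDiff is -16 and the return address
  // must be moved 16 bytes lower; the most negative delta seen in the function
  // is recorded so the prologue reserves enough room.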
4260
4261 unsigned NumBytesToPush = NumBytes;
4262 unsigned NumBytesToPop = NumBytes;
4263
4264 // If we have an inalloca argument, all stack space has already been allocated
4265  // for us and is right at the top of the stack. We don't support multiple
4266 // arguments passed in memory when using inalloca.
4267 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4268 NumBytesToPush = 0;
4269 if (!ArgLocs.back().isMemLoc())
4270 report_fatal_error("cannot use inalloca attribute on a register "
4271 "parameter");
4272 if (ArgLocs.back().getLocMemOffset() != 0)
4273 report_fatal_error("any parameter with the inalloca attribute must be "
4274 "the only memory argument");
4275 } else if (CLI.IsPreallocated) {
4276    assert(ArgLocs.back().isMemLoc() &&
4277           "cannot use preallocated attribute on a register "
4278           "parameter");
4279 SmallVector<size_t, 4> PreallocatedOffsets;
4280 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4281 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4282 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4283 }
4284 }
4285 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4286 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4287 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4288 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4289 NumBytesToPush = 0;
4290 }
4291
4292 if (!IsSibcall && !IsMustTail)
4293 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4294 NumBytes - NumBytesToPush, dl);
4295
4296 SDValue RetAddrFrIdx;
4297 // Load return address for tail calls.
4298 if (isTailCall && FPDiff)
4299 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4300 Is64Bit, FPDiff, dl);
4301
4302 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4303 SmallVector<SDValue, 8> MemOpChains;
4304 SDValue StackPtr;
4305
4306  // The next loop assumes that the locations are in the same order as the
4307 // input arguments.
4308  assert(isSortedByValueNo(ArgLocs) &&
4309         "Argument Location list must be sorted before lowering");
4310
4311 // Walk the register/memloc assignments, inserting copies/loads. In the case
4312  // of tail call optimization, arguments are handled later.
4313 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4314 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4315 ++I, ++OutIndex) {
4316    assert(OutIndex < Outs.size() && "Invalid Out index");
4317 // Skip inalloca/preallocated arguments, they have already been written.
4318 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4319 if (Flags.isInAlloca() || Flags.isPreallocated())
4320 continue;
4321
4322 CCValAssign &VA = ArgLocs[I];
4323 EVT RegVT = VA.getLocVT();
4324 SDValue Arg = OutVals[OutIndex];
4325 bool isByVal = Flags.isByVal();
4326
4327 // Promote the value if needed.
4328 switch (VA.getLocInfo()) {
4329    default: llvm_unreachable("Unknown loc info!");
4330 case CCValAssign::Full: break;
4331 case CCValAssign::SExt:
4332 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4333 break;
4334 case CCValAssign::ZExt:
4335 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4336 break;
4337 case CCValAssign::AExt:
4338 if (Arg.getValueType().isVector() &&
4339 Arg.getValueType().getVectorElementType() == MVT::i1)
4340 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4341 else if (RegVT.is128BitVector()) {
4342 // Special case: passing MMX values in XMM registers.
4343 Arg = DAG.getBitcast(MVT::i64, Arg);
4344 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4345 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4346 } else
4347 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4348 break;
4349 case CCValAssign::BCvt:
4350 Arg = DAG.getBitcast(RegVT, Arg);
4351 break;
4352 case CCValAssign::Indirect: {
4353 if (isByVal) {
4354 // Memcpy the argument to a temporary stack slot to prevent
4355 // the caller from seeing any modifications the callee may make
4356 // as guaranteed by the `byval` attribute.
4357 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4358 Flags.getByValSize(),
4359 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4360 SDValue StackSlot =
4361 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4362 Chain =
4363 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4364 // From now on treat this as a regular pointer
4365 Arg = StackSlot;
4366 isByVal = false;
4367 } else {
4368 // Store the argument.
4369 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4370 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4371 Chain = DAG.getStore(
4372 Chain, dl, Arg, SpillSlot,
4373 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4374 Arg = SpillSlot;
4375 }
4376 break;
4377 }
4378 }
4379
4380 if (VA.needsCustom()) {
4381      assert(VA.getValVT() == MVT::v64i1 &&
4382             "Currently the only custom case is when we split v64i1 to 2 regs");
4383 // Split v64i1 value into two registers
4384 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4385 } else if (VA.isRegLoc()) {
4386 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4387 const TargetOptions &Options = DAG.getTarget().Options;
4388 if (Options.EmitCallSiteInfo)
4389 CSInfo.emplace_back(VA.getLocReg(), I);
4390 if (isVarArg && IsWin64) {
4391 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4392 // shadow reg if callee is a varargs function.
4393 Register ShadowReg;
4394 switch (VA.getLocReg()) {
4395 case X86::XMM0: ShadowReg = X86::RCX; break;
4396 case X86::XMM1: ShadowReg = X86::RDX; break;
4397 case X86::XMM2: ShadowReg = X86::R8; break;
4398 case X86::XMM3: ShadowReg = X86::R9; break;
4399 }
4400 if (ShadowReg)
4401 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4402 }
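      // For example, a double passed in XMM1 is also copied into RDX so that a
      // varargs callee, which reads its arguments from the GPR/stack home
      // area, still sees the value.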
4403 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4404      assert(VA.isMemLoc());
4405 if (!StackPtr.getNode())
4406 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4407 getPointerTy(DAG.getDataLayout()));
4408 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4409 dl, DAG, VA, Flags, isByVal));
4410 }
4411 }
4412
4413 if (!MemOpChains.empty())
4414 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4415
4416 if (Subtarget.isPICStyleGOT()) {
4417    // ELF / PIC requires the GOT address in the EBX register before function
4418    // calls made via the PLT (except regcall).
4419 if (!isTailCall) {
4420      // An indirect call with the RegCall calling convention may use up all
4421      // the general registers, so it is not suitable to bind the EBX register
4422      // for the GOT address; just let the register allocator handle it.
4423 if (CallConv != CallingConv::X86_RegCall)
4424 RegsToPass.push_back(std::make_pair(
4425 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4426 getPointerTy(DAG.getDataLayout()))));
4427 } else {
4428 // If we are tail calling and generating PIC/GOT style code load the
4429 // address of the callee into ECX. The value in ecx is used as target of
4430 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4431 // for tail calls on PIC/GOT architectures. Normally we would just put the
4432 // address of GOT into ebx and then call target@PLT. But for tail calls
4433 // ebx would be restored (since ebx is callee saved) before jumping to the
4434 // target@PLT.
4435
4436 // Note: The actual moving to ECX is done further down.
4437 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4438 if (G && !G->getGlobal()->hasLocalLinkage() &&
4439 G->getGlobal()->hasDefaultVisibility())
4440 Callee = LowerGlobalAddress(Callee, DAG);
4441 else if (isa<ExternalSymbolSDNode>(Callee))
4442 Callee = LowerExternalSymbol(Callee, DAG);
4443 }
4444 }
4445
4446 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4447 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4448 // From AMD64 ABI document:
4449 // For calls that may call functions that use varargs or stdargs
4450 // (prototype-less calls or calls to functions containing ellipsis (...) in
4451    // the declaration) %al is used as a hidden argument to specify the number
4452    // of SSE registers used. The contents of %al do not need to match exactly
4453    // the number of registers, but must be an upper bound on the number of SSE
4454 // registers used and is in the range 0 - 8 inclusive.
4455
4456 // Count the number of XMM registers allocated.
4457 static const MCPhysReg XMMArgRegs[] = {
4458 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4459 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4460 };
4461 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4462    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4463           && "SSE registers cannot be used when SSE is disabled");
4464 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4465 DAG.getConstant(NumXMMRegs, dl,
4466 MVT::i8)));
4467 }
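  // For example, a call to printf("%f\n", x) passes x in XMM0 and sets AL = 1;
  // any value that is at least the number of XMM registers actually used (and
  // at most 8) would also satisfy the ABI.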
4468
4469 if (isVarArg && IsMustTail) {
4470 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4471 for (const auto &F : Forwards) {
4472 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4473 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4474 }
4475 }
4476
4477 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4478 // don't need this because the eligibility check rejects calls that require
4479 // shuffling arguments passed in memory.
4480 if (!IsSibcall && isTailCall) {
4481 // Force all the incoming stack arguments to be loaded from the stack
4482 // before any new outgoing arguments are stored to the stack, because the
4483 // outgoing stack slots may alias the incoming argument stack slots, and
4484 // the alias isn't otherwise explicit. This is slightly more conservative
4485 // than necessary, because it means that each store effectively depends
4486 // on every argument instead of just those arguments it would clobber.
4487 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4488
4489 SmallVector<SDValue, 8> MemOpChains2;
4490 SDValue FIN;
4491 int FI = 0;
4492 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4493 ++I, ++OutsIndex) {
4494 CCValAssign &VA = ArgLocs[I];
4495
4496 if (VA.isRegLoc()) {
4497 if (VA.needsCustom()) {
4498          assert((CallConv == CallingConv::X86_RegCall) &&
4499                 "Expecting custom case only in regcall calling convention");
4500 // This means that we are in special case where one argument was
4501 // passed through two register locations - Skip the next location
4502 ++I;
4503 }
4504
4505 continue;
4506 }
4507
4508      assert(VA.isMemLoc());
4509 SDValue Arg = OutVals[OutsIndex];
4510 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4511 // Skip inalloca/preallocated arguments. They don't require any work.
4512 if (Flags.isInAlloca() || Flags.isPreallocated())
4513 continue;
4514 // Create frame index.
4515 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4516 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4517 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4518 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4519
4520 if (Flags.isByVal()) {
4521 // Copy relative to framepointer.
4522 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4523 if (!StackPtr.getNode())
4524 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4525 getPointerTy(DAG.getDataLayout()));
4526 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4527 StackPtr, Source);
4528
4529 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4530 ArgChain,
4531 Flags, DAG, dl));
4532 } else {
4533 // Store relative to framepointer.
4534 MemOpChains2.push_back(DAG.getStore(
4535 ArgChain, dl, Arg, FIN,
4536 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4537 }
4538 }
4539
4540 if (!MemOpChains2.empty())
4541 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4542
4543 // Store the return address to the appropriate stack slot.
4544 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4545 getPointerTy(DAG.getDataLayout()),
4546 RegInfo->getSlotSize(), FPDiff, dl);
4547 }
4548
4549 // Build a sequence of copy-to-reg nodes chained together with token chain
4550 // and flag operands which copy the outgoing args into registers.
4551 SDValue InFlag;
4552 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4553 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4554 RegsToPass[i].second, InFlag);
4555 InFlag = Chain.getValue(1);
4556 }
4557
4558 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4559    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4560 // In the 64-bit large code model, we have to make all calls
4561 // through a register, since the call instruction's 32-bit
4562 // pc-relative offset may not be large enough to hold the whole
4563 // address.
4564 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4565 Callee->getOpcode() == ISD::ExternalSymbol) {
4566 // Lower direct calls to global addresses and external symbols. Setting
4567 // ForCall to true here has the effect of removing WrapperRIP when possible
4568 // to allow direct calls to be selected without first materializing the
4569 // address into a register.
4570 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4571 } else if (Subtarget.isTarget64BitILP32() &&
4572 Callee.getValueType() == MVT::i32) {
4573    // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI.
4574 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4575 }
4576
4577 // Returns a chain & a flag for retval copy to use.
4578 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4579 SmallVector<SDValue, 8> Ops;
4580
4581 if (!IsSibcall && isTailCall && !IsMustTail) {
4582 Chain = DAG.getCALLSEQ_END(Chain,
4583 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4584 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4585 InFlag = Chain.getValue(1);
4586 }
4587
4588 Ops.push_back(Chain);
4589 Ops.push_back(Callee);
4590
4591 if (isTailCall)
4592 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4593
4594 // Add argument registers to the end of the list so that they are known live
4595 // into the call.
4596 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4597 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4598 RegsToPass[i].second.getValueType()));
4599
4600 // Add a register mask operand representing the call-preserved registers.
4601 const uint32_t *Mask = [&]() {
4602 auto AdaptedCC = CallConv;
4603 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4604 // use X86_INTR calling convention because it has the same CSR mask
4605 // (same preserved registers).
4606 if (HasNCSR)
4607 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4608    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4609 // to use the CSR_NoRegs_RegMask.
4610 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4611 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4612 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4613 }();
4614  assert(Mask && "Missing call preserved mask for calling convention");
4615
4616 // If this is an invoke in a 32-bit function using a funclet-based
4617 // personality, assume the function clobbers all registers. If an exception
4618 // is thrown, the runtime will not restore CSRs.
4619 // FIXME: Model this more precisely so that we can register allocate across
4620 // the normal edge and spill and fill across the exceptional edge.
4621 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4622 const Function &CallerFn = MF.getFunction();
4623 EHPersonality Pers =
4624 CallerFn.hasPersonalityFn()
4625 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4626 : EHPersonality::Unknown;
4627 if (isFuncletEHPersonality(Pers))
4628 Mask = RegInfo->getNoPreservedMask();
4629 }
4630
4631 // Define a new register mask from the existing mask.
4632 uint32_t *RegMask = nullptr;
4633
4634 // In some calling conventions we need to remove the used physical registers
4635 // from the reg mask.
4636 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4637 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4638
4639 // Allocate a new Reg Mask and copy Mask.
4640 RegMask = MF.allocateRegMask();
4641 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4642 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4643
4644 // Make sure all sub registers of the argument registers are reset
4645 // in the RegMask.
4646 for (auto const &RegPair : RegsToPass)
4647 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4648 SubRegs.isValid(); ++SubRegs)
4649 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4650
4651 // Create the RegMask Operand according to our updated mask.
4652 Ops.push_back(DAG.getRegisterMask(RegMask));
4653 } else {
4654 // Create the RegMask Operand according to the static mask.
4655 Ops.push_back(DAG.getRegisterMask(Mask));
4656 }
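  // In a register mask a set bit means the register is preserved across the
  // call, so clearing the bits for the argument registers (and their
  // sub-registers) marks them as clobbered and keeps the register allocator
  // from assigning values that live across this call to them.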
4657
4658 if (InFlag.getNode())
4659 Ops.push_back(InFlag);
4660
4661 if (isTailCall) {
4662 // We used to do:
4663 //// If this is the first return lowered for this function, add the regs
4664 //// to the liveout set for the function.
4665 // This isn't right, although it's probably harmless on x86; liveouts
4666 // should be computed from returns not tail calls. Consider a void
4667 // function making a tail call to a function returning int.
4668 MF.getFrameInfo().setHasTailCall();
4669 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4670 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4671 return Ret;
4672 }
4673
4674 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4675 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4676 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4677 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4678 // expanded to the call, directly followed by a special marker sequence and
4679    // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
4680    assert(!isTailCall &&
4681           "tail calls cannot be marked with clang.arc.attachedcall");
4682    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4683
4684 // Add a target global address for the retainRV/claimRV runtime function
4685 // just before the call target.
4686 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4687 auto PtrVT = getPointerTy(DAG.getDataLayout());
4688 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4689 Ops.insert(Ops.begin() + 1, GA);
4690 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4691 } else {
4692 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4693 }
4694
4695 InFlag = Chain.getValue(1);
4696 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4697 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4698
4699 // Save heapallocsite metadata.
4700 if (CLI.CB)
4701 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4702 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4703
4704 // Create the CALLSEQ_END node.
4705 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
4706 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4707 DAG.getTarget().Options.GuaranteedTailCallOpt))
4708 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4709 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
4710 // If this call passes a struct-return pointer, the callee
4711 // pops that struct pointer.
4712 NumBytesForCalleeToPop = 4;
4713
4714 // Returns a flag for retval copy to use.
4715 if (!IsSibcall) {
4716 Chain = DAG.getCALLSEQ_END(Chain,
4717 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4718 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4719 true),
4720 InFlag, dl);
4721 InFlag = Chain.getValue(1);
4722 }
4723
4724 // Handle result values, copying them out of physregs into vregs that we
4725 // return.
4726 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4727 InVals, RegMask);
4728}
4729
4730//===----------------------------------------------------------------------===//
4731// Fast Calling Convention (tail call) implementation
4732//===----------------------------------------------------------------------===//
4733
4734// Like stdcall, the callee cleans up the arguments, except that ECX is
4735// reserved for storing the tail-called function address. Only 2 registers are
4736// free for argument passing (inreg). Tail call optimization is performed
4737// provided:
4738// * tailcallopt is enabled
4739// * caller/callee are fastcc
4740// On X86_64 with GOT-style position-independent code, only local
4741// (within-module) calls are supported at the moment.
4742// To keep the stack aligned according to the platform ABI, the function
4743// GetAlignedArgumentStackSize ensures that the argument delta is always a
4744// multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld,
4745// for example.) If a tail-called callee has more arguments than the caller,
4746// the caller needs to make sure that there is room to move the RETADDR to.
4747// This is achieved by reserving an area the size of the argument delta right
4748// after the original RETADDR, but before the saved frame pointer or the
4749// spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
4750// stack layout:
4751// arg1
4752// arg2
4753// RETADDR
4754// [ new RETADDR
4755// move area ]
4756// (possible EBP)
4757// ESI
4758// EDI
4759// local1 ..
4760
4761 /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
4762 /// requirement.
4763unsigned
4764X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4765 SelectionDAG &DAG) const {
4766 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4767 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4768   assert(StackSize % SlotSize == 0 &&
4769          "StackSize must be a multiple of SlotSize");
4770 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4771}
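
To make the returned value concrete, here is a minimal standalone sketch (not part of the listed source; the helper name and sample values are hypothetical) reproducing the alignTo-based computation for a 32-bit target with 4-byte slots and a 16-byte stack alignment:

#include <cassert>
#include <cstdint>

// Illustrative re-implementation of the formula above.
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlignment) {
  assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize");
  // alignTo(x, a) == (x + a - 1) / a * a
  uint64_t Aligned = (StackSize + SlotSize + StackAlignment - 1) /
                     StackAlignment * StackAlignment;
  return Aligned - SlotSize;
}

int main() {
  // 20 bytes of arguments with 4-byte slots: 28 == 16*1 + 12, leaving room for
  // the pushed return address so the stack stays 16-byte aligned.
  assert(alignedArgStackSize(20, 4, 16) == 28);
  // Already of the form 16n + 12: unchanged.
  assert(alignedArgStackSize(12, 4, 16) == 12);
  return 0;
}
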
4772
4773/// Return true if the given stack call argument is already available in the
4774/// same position (relatively) of the caller's incoming argument stack.
4775static
4776bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4777 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4778 const X86InstrInfo *TII, const CCValAssign &VA) {
4779 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4780
4781 for (;;) {
4782 // Look through nodes that don't alter the bits of the incoming value.
4783 unsigned Op = Arg.getOpcode();
4784 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4785 Arg = Arg.getOperand(0);
4786 continue;
4787 }
4788 if (Op == ISD::TRUNCATE) {
4789 const SDValue &TruncInput = Arg.getOperand(0);
4790 if (TruncInput.getOpcode() == ISD::AssertZext &&
4791 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4792 Arg.getValueType()) {
4793 Arg = TruncInput.getOperand(0);
4794 continue;
4795 }
4796 }
4797 break;
4798 }
4799
4800   int FI = INT_MAX;
4801 if (Arg.getOpcode() == ISD::CopyFromReg) {
4802 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4803 if (!VR.isVirtual())
4804 return false;
4805 MachineInstr *Def = MRI->getVRegDef(VR);
4806 if (!Def)
4807 return false;
4808 if (!Flags.isByVal()) {
4809 if (!TII->isLoadFromStackSlot(*Def, FI))
4810 return false;
4811 } else {
4812 unsigned Opcode = Def->getOpcode();
4813 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4814 Opcode == X86::LEA64_32r) &&
4815 Def->getOperand(1).isFI()) {
4816 FI = Def->getOperand(1).getIndex();
4817 Bytes = Flags.getByValSize();
4818 } else
4819 return false;
4820 }
4821 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4822 if (Flags.isByVal())
4823 // ByVal argument is passed in as a pointer but it's now being
4824 // dereferenced. e.g.
4825 // define @foo(%struct.X* %A) {
4826 // tail call @bar(%struct.X* byval %A)
4827 // }
4828 return false;
4829 SDValue Ptr = Ld->getBasePtr();
4830 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4831 if (!FINode)
4832 return false;
4833 FI = FINode->getIndex();
4834 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4835 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4836 FI = FINode->getIndex();
4837 Bytes = Flags.getByValSize();
4838 } else
4839 return false;
4840
4841   assert(FI != INT_MAX);
4842 if (!MFI.isFixedObjectIndex(FI))
4843 return false;
4844
4845 if (Offset != MFI.getObjectOffset(FI))
4846 return false;
4847
4848 // If this is not byval, check that the argument stack object is immutable.
4849 // inalloca and argument copy elision can create mutable argument stack
4850 // objects. Byval objects can be mutated, but a byval call intends to pass the
4851 // mutated memory.
4852 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4853 return false;
4854
4855 if (VA.getLocVT().getFixedSizeInBits() >
4856 Arg.getValueSizeInBits().getFixedSize()) {
4857 // If the argument location is wider than the argument type, check that any
4858 // extension flags match.
4859 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4860 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4861 return false;
4862 }
4863 }
4864
4865 return Bytes == MFI.getObjectSize(FI);
4866}
4867
4868/// Check whether the call is eligible for tail call optimization. Targets
4869/// that want to do tail call optimization should implement this function.
4870bool X86TargetLowering::IsEligibleForTailCallOptimization(
4871 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
4872 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
4873 const SmallVectorImpl<SDValue> &OutVals,
4874 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4875 if (!mayTailCallThisCC(CalleeCC))
4876 return false;
4877
4878 // If -tailcallopt is specified, make fastcc functions tail-callable.
4879 MachineFunction &MF = DAG.getMachineFunction();
4880 const Function &CallerF = MF.getFunction();
4881
4882 // If the function return type is x86_fp80 and the callee return type is not,
4883 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4884 // perform a tailcall optimization here.
4885 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4886 return false;
4887
4888 CallingConv::ID CallerCC = CallerF.getCallingConv();
4889 bool CCMatch = CallerCC == CalleeCC;
4890 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4891 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4892 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4893 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4894
4895 // Win64 functions have extra shadow space for argument homing. Don't do the
4896 // sibcall if the caller and callee have mismatched expectations for this
4897 // space.
4898 if (IsCalleeWin64 != IsCallerWin64)
4899 return false;
4900
4901 if (IsGuaranteeTCO) {
4902 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4903 return true;
4904 return false;
4905 }
4906
4907 // Look for obvious safe cases to perform tail call optimization that do not
4908 // require ABI changes. This is what gcc calls sibcall.
4909
4910 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4911 // emit a special epilogue.
4912 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4913 if (RegInfo->hasStackRealignment(MF))
4914 return false;
4915
4916 // Also avoid sibcall optimization if we're an sret return fn and the callee
4917 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
4918 // insufficient.
4919 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
4920 // For a compatible tail call the callee must return our sret pointer. So it
4921 // needs to be (a) an sret function itself and (b) we pass our sret as its
4922 // sret. Condition #b is harder to determine.
4923 return false;
4924 } else if (IsCalleePopSRet)
4925 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
4926 // expect that.
4927 return false;
4928
4929 // Do not sibcall optimize vararg calls unless all arguments are passed via
4930 // registers.
4931 LLVMContext &C = *DAG.getContext();
4932 if (isVarArg && !Outs.empty()) {
4933 // Optimizing for varargs on Win64 is unlikely to be safe without
4934 // additional testing.
4935 if (IsCalleeWin64 || IsCallerWin64)
4936 return false;
4937
4938 SmallVector<CCValAssign, 16> ArgLocs;
4939 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4940
4941 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4942 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4943 if (!ArgLocs[i].isRegLoc())
4944 return false;
4945 }
4946
4947 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4948 // stack. Therefore, if it's not used by the call it is not safe to optimize
4949 // this into a sibcall.
4950 bool Unused = false;
4951 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4952 if (!Ins[i].Used) {
4953 Unused = true;
4954 break;
4955 }
4956 }
4957 if (Unused) {
4958 SmallVector<CCValAssign, 16> RVLocs;
4959 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4960 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4961 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4962 CCValAssign &VA = RVLocs[i];
4963 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4964 return false;
4965 }
4966 }
4967
4968 // Check that the call results are passed in the same way.
4969 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4970 RetCC_X86, RetCC_X86))
4971 return false;
4972 // The callee has to preserve all registers the caller needs to preserve.
4973 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4974 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4975 if (!CCMatch) {
4976 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4977 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4978 return false;
4979 }
4980
4981 unsigned StackArgsSize = 0;
4982
4983 // If the callee takes no arguments then go on to check the results of the
4984 // call.
4985 if (!Outs.empty()) {
4986 // Check if stack adjustment is needed. For now, do not do this if any
4987 // argument is passed on the stack.
4988 SmallVector<CCValAssign, 16> ArgLocs;
4989 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4990
4991 // Allocate shadow area for Win64
4992 if (IsCalleeWin64)
4993 CCInfo.AllocateStack(32, Align(8));
4994
4995 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4996 StackArgsSize = CCInfo.getNextStackOffset();
4997
4998 if (CCInfo.getNextStackOffset()) {
4999 // Check if the arguments are already laid out in the right way as
5000 // the caller's fixed stack objects.
5001 MachineFrameInfo &MFI = MF.getFrameInfo();
5002 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5003 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5004 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5005 CCValAssign &VA = ArgLocs[i];
5006 SDValue Arg = OutVals[i];
5007 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5008 if (VA.getLocInfo() == CCValAssign::Indirect)
5009 return false;
5010 if (!VA.isRegLoc()) {
5011 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5012 MFI, MRI, TII, VA))
5013 return false;
5014 }
5015 }
5016 }
5017
5018 bool PositionIndependent = isPositionIndependent();
5019 // If the tailcall address may be in a register, then make sure it's
5020 // possible to register allocate for it. In 32-bit, the call address can
5021 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5022 // callee-saved registers are restored. These happen to be the same
5023 // registers used to pass 'inreg' arguments so watch out for those.
5024 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5025 !isa<ExternalSymbolSDNode>(Callee)) ||
5026 PositionIndependent)) {
5027 unsigned NumInRegs = 0;
5028 // In PIC we need an extra register to formulate the address computation
5029 // for the callee.
5030 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5031
5032 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5033 CCValAssign &VA = ArgLocs[i];
5034 if (!VA.isRegLoc())
5035 continue;
5036 Register Reg = VA.getLocReg();
5037 switch (Reg) {
5038 default: break;
5039 case X86::EAX: case X86::EDX: case X86::ECX:
5040 if (++NumInRegs == MaxInRegs)
5041 return false;
5042 break;
5043 }
5044 }
5045 }
5046
5047 const MachineRegisterInfo &MRI = MF.getRegInfo();
5048 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5049 return false;
5050 }
5051
5052 bool CalleeWillPop =
5053 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5054 MF.getTarget().Options.GuaranteedTailCallOpt);
5055
5056 if (unsigned BytesToPop =
5057 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5058 // If we have bytes to pop, the callee must pop them.
5059 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5060 if (!CalleePopMatches)
5061 return false;
5062 } else if (CalleeWillPop && StackArgsSize > 0) {
5063 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5064 return false;
5065 }
5066
5067 return true;
5068}
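
For orientation, a hedged source-level illustration follows (hypothetical functions, assuming the x86-64 SysV ABI; whether a given call actually becomes a sibcall still depends on target options and optimization level). Each case mirrors one of the checks performed above:

// Sketch only, not from the listed file.
int callee(int a, int b) { return a + b; }
int callee_many(int a, int b, int c, int d, int e, int f, int g, int h) {
  return a + b + c + d + e + f + g + h;
}
int callee_vararg(const char *fmt, ...) { return fmt ? 1 : 0; }

// Matching convention, all arguments in registers, result returned unchanged:
// a typical sibcall candidate.
int caller_simple(int x, int y) { return callee(x, y); }

// Eight integer arguments: two spill to the stack, so those slots must already
// line up with the caller's incoming area (MatchingStackOffset) for the tail
// call to survive.
int caller_stack(int a, int b, int c, int d, int e, int f, int g, int h) {
  return callee_many(a, b, c, d, e, f, g, h);
}

// Variadic call whose operands overflow onto the stack: rejected by the
// "vararg arguments must all be passed in registers" check above.
int caller_vararg(int x) { return callee_vararg("%d", x, x, x, x, x, x, x); }
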
5069
5070FastISel *
5071X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5072 const TargetLibraryInfo *libInfo) const {
5073 return X86::createFastISel(funcInfo, libInfo);
5074}
5075
5076//===----------------------------------------------------------------------===//
5077// Other Lowering Hooks
5078//===----------------------------------------------------------------------===//
5079
5080bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5081 bool AssumeSingleUse) {
5082 if (!AssumeSingleUse && !Op.hasOneUse())
5083 return false;
5084 if (!ISD::isNormalLoad(Op.getNode()))
5085 return false;
5086
5087 // If this is an unaligned vector, make sure the target supports folding it.
5088 auto *Ld = cast<LoadSDNode>(Op.getNode());
5089 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5090 Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
5091 return false;
5092
5093 // TODO: If this is a non-temporal load and the target has an instruction
5094 // for it, it should not be folded. See "useNonTemporalLoad()".
5095
5096 return true;
5097}
5098
5099bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5100 const X86Subtarget &Subtarget,
5101 bool AssumeSingleUse) {
5102   assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5103 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5104 return false;
5105
5106 // We can not replace a wide volatile load with a broadcast-from-memory,
5107 // because that would narrow the load, which isn't legal for volatiles.
5108 auto *Ld = cast<LoadSDNode>(Op.getNode());
5109 return !Ld->isVolatile() ||
5110 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5111}
5112
5113bool X86::mayFoldIntoStore(SDValue Op) {
5114 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5115}
5116
5117bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5118 if (Op.hasOneUse()) {
5119 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5120 return (ISD::ZERO_EXTEND == Opcode);
5121 }
5122 return false;
5123}
5124
5125static bool isTargetShuffle(unsigned Opcode) {
5126 switch(Opcode) {
5127 default: return false;
5128 case X86ISD::BLENDI:
5129 case X86ISD::PSHUFB:
5130 case X86ISD::PSHUFD:
5131 case X86ISD::PSHUFHW:
5132 case X86ISD::PSHUFLW:
5133 case X86ISD::SHUFP:
5134 case X86ISD::INSERTPS:
5135 case X86ISD::EXTRQI:
5136 case X86ISD::INSERTQI:
5137 case X86ISD::VALIGN:
5138 case X86ISD::PALIGNR:
5139 case X86ISD::VSHLDQ:
5140 case X86ISD::VSRLDQ:
5141 case X86ISD::MOVLHPS:
5142 case X86ISD::MOVHLPS:
5143 case X86ISD::MOVSHDUP:
5144 case X86ISD::MOVSLDUP:
5145 case X86ISD::MOVDDUP:
5146 case X86ISD::MOVSS:
5147 case X86ISD::MOVSD:
5148 case X86ISD::MOVSH:
5149 case X86ISD::UNPCKL:
5150 case X86ISD::UNPCKH:
5151 case X86ISD::VBROADCAST:
5152 case X86ISD::VPERMILPI:
5153 case X86ISD::VPERMILPV:
5154 case X86ISD::VPERM2X128:
5155 case X86ISD::SHUF128:
5156 case X86ISD::VPERMIL2:
5157 case X86ISD::VPERMI:
5158 case X86ISD::VPPERM:
5159 case X86ISD::VPERMV:
5160 case X86ISD::VPERMV3:
5161 case X86ISD::VZEXT_MOVL:
5162 return true;
5163 }
5164}
5165
5166static bool isTargetShuffleVariableMask(unsigned Opcode) {
5167 switch (Opcode) {
5168 default: return false;
5169 // Target Shuffles.
5170 case X86ISD::PSHUFB:
5171 case X86ISD::VPERMILPV:
5172 case X86ISD::VPERMIL2:
5173 case X86ISD::VPPERM:
5174 case X86ISD::VPERMV:
5175 case X86ISD::VPERMV3:
5176 return true;
5177 // 'Faux' Target Shuffles.
5178 case ISD::OR:
5179 case ISD::AND:
5180 case X86ISD::ANDNP:
5181 return true;
5182 }
5183}
5184
5185static bool isTargetShuffleSplat(SDValue Op) {
5186 unsigned Opcode = Op.getOpcode();
5187 if (Opcode == ISD::EXTRACT_SUBVECTOR)
5188 return isTargetShuffleSplat(Op.getOperand(0));
5189 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
5190}
5191
5192SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5193 MachineFunction &MF = DAG.getMachineFunction();
5194 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5195 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5196 int ReturnAddrIndex = FuncInfo->getRAIndex();
5197
5198 if (ReturnAddrIndex == 0) {
5199 // Set up a frame object for the return address.
5200 unsigned SlotSize = RegInfo->getSlotSize();
5201 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5202 -(int64_t)SlotSize,
5203 false);
5204 FuncInfo->setRAIndex(ReturnAddrIndex);
5205 }
5206
5207 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5208}
5209
5210bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5211 bool hasSymbolicDisplacement) {
5212 // Offset should fit into 32 bit immediate field.
5213 if (!isInt<32>(Offset))
5214 return false;
5215
5216 // If we don't have a symbolic displacement - we don't have any extra
5217 // restrictions.
5218 if (!hasSymbolicDisplacement)
5219 return true;
5220
5221 // FIXME: Some tweaks might be needed for medium code model.
5222 if (M != CodeModel::Small && M != CodeModel::Kernel)
5223 return false;
5224
5225   // For the small code model we assume that the latest object is 16MB below
5226   // the end of the 31-bit boundary. We may also accept pretty large negative
5227   // constants, knowing that all objects are in the positive half of the address space.
5228 if (M == CodeModel::Small && Offset < 16*1024*1024)
5229 return true;
5230
5231   // For the kernel code model we know that all objects reside in the negative
5232   // half of the 32-bit address space. We must not accept negative offsets, since
5233   // they may be just out of range, while pretty large positive ones are acceptable.
5234 if (M == CodeModel::Kernel && Offset >= 0)
5235 return true;
5236
5237 return false;
5238}
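
A minimal sketch restating these rules (standalone re-implementation with hypothetical example offsets; the function listed above is authoritative):

#include <cassert>
#include <cstdint>
#include <limits>

enum class Model { Small, Kernel, Other };

// Illustrative re-statement of the checks above.
static bool offsetSuitable(int64_t Offset, Model M, bool HasSymbolicDisp) {
  if (Offset < std::numeric_limits<int32_t>::min() ||
      Offset > std::numeric_limits<int32_t>::max())
    return false;                      // must fit a 32-bit displacement
  if (!HasSymbolicDisp)
    return true;                       // no extra restrictions
  if (M == Model::Small)
    return Offset < 16 * 1024 * 1024;  // keep 16MB of headroom below 2^31
  if (M == Model::Kernel)
    return Offset >= 0;                // kernel objects sit in the negative half
  return false;
}

int main() {
  assert(offsetSuitable(10 * 1024 * 1024, Model::Small, true));   // 10MB: fine
  assert(!offsetSuitable(32 * 1024 * 1024, Model::Small, true));  // too close to 2^31
  assert(offsetSuitable(-4096, Model::Small, true));              // large negatives ok
  assert(!offsetSuitable(-1, Model::Kernel, true));               // no negatives here
  return 0;
}
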
5239
5240/// Determines whether the callee is required to pop its own arguments.
5241/// Callee pop is necessary to support tail calls.
5242bool X86::isCalleePop(CallingConv::ID CallingConv,
5243 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5244 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5245 // can guarantee TCO.
5246 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5247 return true;
5248
5249 switch (CallingConv) {
5250 default:
5251 return false;
5252 case CallingConv::X86_StdCall:
5253 case CallingConv::X86_FastCall:
5254 case CallingConv::X86_ThisCall:
5255 case CallingConv::X86_VectorCall:
5256 return !is64Bit;
5257 }
5258}
5259
5260 /// Return true if the condition is a signed comparison operation.
5261static bool isX86CCSigned(unsigned X86CC) {
5262 switch (X86CC) {
5263 default:
5264     llvm_unreachable("Invalid integer condition!");
5265 case X86::COND_E:
5266 case X86::COND_NE:
5267 case X86::COND_B:
5268 case X86::COND_A:
5269 case X86::COND_BE:
5270 case X86::COND_AE:
5271 return false;
5272 case X86::COND_G:
5273 case X86::COND_GE:
5274 case X86::COND_L:
5275 case X86::COND_LE:
5276 return true;
5277 }
5278}
5279
5280static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5281 switch (SetCCOpcode) {
5282   default: llvm_unreachable("Invalid integer condition!");
5283 case ISD::SETEQ: return X86::COND_E;
5284 case ISD::SETGT: return X86::COND_G;
5285 case ISD::SETGE: return X86::COND_GE;
5286 case ISD::SETLT: return X86::COND_L;
5287 case ISD::SETLE: return X86::COND_LE;
5288 case ISD::SETNE: return X86::COND_NE;
5289 case ISD::SETULT: return X86::COND_B;
5290 case ISD::SETUGT: return X86::COND_A;
5291 case ISD::SETULE: return X86::COND_BE;
5292 case ISD::SETUGE: return X86::COND_AE;
5293 }
5294}
5295
5296/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5297/// condition code, returning the condition code and the LHS/RHS of the
5298/// comparison to make.
5299static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5300 bool isFP, SDValue &LHS, SDValue &RHS,
5301 SelectionDAG &DAG) {
5302 if (!isFP) {
5303 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5304 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5305 // X > -1 -> X == 0, jump !sign.
5306 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5307 return X86::COND_NS;
5308 }
5309 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5310 // X < 0 -> X == 0, jump on sign.
5311 return X86::COND_S;
5312 }
5313 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5314 // X >= 0 -> X == 0, jump on !sign.
5315 return X86::COND_NS;
5316 }
5317 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5318 // X < 1 -> X <= 0
5319 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5320 return X86::COND_LE;
5321 }
5322 }
5323
5324 return TranslateIntegerX86CC(SetCCOpcode);
5325 }
5326
5327 // First determine if it is required or is profitable to flip the operands.
5328
5329 // If LHS is a foldable load, but RHS is not, flip the condition.
5330 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5331 !ISD::isNON_EXTLoad(RHS.getNode())) {
5332 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5333 std::swap(LHS, RHS);
5334 }
5335
5336 switch (SetCCOpcode) {
5337 default: break;
5338 case ISD::SETOLT:
5339 case ISD::SETOLE:
5340 case ISD::SETUGT:
5341 case ISD::SETUGE:
5342 std::swap(LHS, RHS);
5343 break;
5344 }
5345
5346 // On a floating point condition, the flags are set as follows:
5347 // ZF PF CF op
5348 // 0 | 0 | 0 | X > Y
5349 // 0 | 0 | 1 | X < Y
5350 // 1 | 0 | 0 | X == Y
5351 // 1 | 1 | 1 | unordered
5352 switch (SetCCOpcode) {
5353   default: llvm_unreachable("Condcode should be pre-legalized away");
5354 case ISD::SETUEQ:
5355 case ISD::SETEQ: return X86::COND_E;
5356 case ISD::SETOLT: // flipped
5357 case ISD::SETOGT:
5358 case ISD::SETGT: return X86::COND_A;
5359 case ISD::SETOLE: // flipped
5360 case ISD::SETOGE:
5361 case ISD::SETGE: return X86::COND_AE;
5362 case ISD::SETUGT: // flipped
5363 case ISD::SETULT:
5364 case ISD::SETLT: return X86::COND_B;
5365 case ISD::SETUGE: // flipped
5366 case ISD::SETULE:
5367 case ISD::SETLE: return X86::COND_BE;
5368 case ISD::SETONE:
5369 case ISD::SETNE: return X86::COND_NE;
5370 case ISD::SETUO: return X86::COND_P;
5371 case ISD::SETO: return X86::COND_NP;
5372 case ISD::SETOEQ:
5373 case ISD::SETUNE: return X86::COND_INVALID;
5374 }
5375}
5376
5377/// Is there a floating point cmov for the specific X86 condition code?
5378 /// The current x86 ISA includes the following FP cmov instructions:
5379 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5380static bool hasFPCMov(unsigned X86CC) {
5381 switch (X86CC) {
5382 default:
5383 return false;
5384 case X86::COND_B:
5385 case X86::COND_BE:
5386 case X86::COND_E:
5387 case X86::COND_P:
5388 case X86::COND_A:
5389 case X86::COND_AE:
5390 case X86::COND_NE:
5391 case X86::COND_NP:
5392 return true;
5393 }
5394}
5395
5396static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5397 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5398 VT.is512BitVector();
5399}
5400
5401bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5402 const CallInst &I,
5403 MachineFunction &MF,
5404 unsigned Intrinsic) const {
5405 Info.flags = MachineMemOperand::MONone;
5406 Info.offset = 0;
5407
5408 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5409 if (!IntrData) {
5410 switch (Intrinsic) {
5411 case Intrinsic::x86_aesenc128kl:
5412 case Intrinsic::x86_aesdec128kl:
5413 Info.opc = ISD::INTRINSIC_W_CHAIN;
5414 Info.ptrVal = I.getArgOperand(1);
5415 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5416 Info.align = Align(1);
5417 Info.flags |= MachineMemOperand::MOLoad;
5418 return true;
5419 case Intrinsic::x86_aesenc256kl:
5420 case Intrinsic::x86_aesdec256kl:
5421 Info.opc = ISD::INTRINSIC_W_CHAIN;
5422 Info.ptrVal = I.getArgOperand(1);
5423 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5424 Info.align = Align(1);
5425 Info.flags |= MachineMemOperand::MOLoad;
5426 return true;
5427 case Intrinsic::x86_aesencwide128kl:
5428 case Intrinsic::x86_aesdecwide128kl:
5429 Info.opc = ISD::INTRINSIC_W_CHAIN;
5430 Info.ptrVal = I.getArgOperand(0);
5431 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5432 Info.align = Align(1);
5433 Info.flags |= MachineMemOperand::MOLoad;
5434 return true;
5435 case Intrinsic::x86_aesencwide256kl:
5436 case Intrinsic::x86_aesdecwide256kl:
5437 Info.opc = ISD::INTRINSIC_W_CHAIN;
5438 Info.ptrVal = I.getArgOperand(0);
5439 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5440 Info.align = Align(1);
5441 Info.flags |= MachineMemOperand::MOLoad;
5442 return true;
5443 case Intrinsic::x86_atomic_bts:
5444 case Intrinsic::x86_atomic_btc:
5445 case Intrinsic::x86_atomic_btr: {
5446 Info.opc = ISD::INTRINSIC_W_CHAIN;
5447 Info.ptrVal = I.getArgOperand(0);
5448 unsigned Size = I.getType()->getScalarSizeInBits();
5449 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5450 Info.align = Align(Size);
5451 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5452 MachineMemOperand::MOVolatile;
5453 return true;
5454 }
5455 }
5456 return false;
5457 }
5458
5459 switch (IntrData->Type) {
5460 case TRUNCATE_TO_MEM_VI8:
5461 case TRUNCATE_TO_MEM_VI16:
5462 case TRUNCATE_TO_MEM_VI32: {
5463 Info.opc = ISD::INTRINSIC_VOID;
5464 Info.ptrVal = I.getArgOperand(0);
5465 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5466 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5467 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5468 ScalarVT = MVT::i8;
5469 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5470 ScalarVT = MVT::i16;
5471 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5472 ScalarVT = MVT::i32;
5473
5474 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5475 Info.align = Align(1);
5476 Info.flags |= MachineMemOperand::MOStore;
5477 break;
5478 }
5479 case GATHER:
5480 case GATHER_AVX2: {
5481 Info.opc = ISD::INTRINSIC_W_CHAIN;
5482 Info.ptrVal = nullptr;
5483 MVT DataVT = MVT::getVT(I.getType());
5484 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5485 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5486 IndexVT.getVectorNumElements());
5487 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5488 Info.align = Align(1);
5489 Info.flags |= MachineMemOperand::MOLoad;
5490 break;
5491 }
5492 case SCATTER: {
5493 Info.opc = ISD::INTRINSIC_VOID;
5494 Info.ptrVal = nullptr;
5495 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5496 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5497 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5498 IndexVT.getVectorNumElements());
5499 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5500 Info.align = Align(1);
5501 Info.flags |= MachineMemOperand::MOStore;
5502 break;
5503 }
5504 default:
5505 return false;
5506 }
5507
5508 return true;
5509}
5510
5511/// Returns true if the target can instruction select the
5512/// specified FP immediate natively. If false, the legalizer will
5513/// materialize the FP immediate as a load from a constant pool.
5514bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5515 bool ForCodeSize) const {
5516 for (const APFloat &FPImm : LegalFPImmediates)
5517 if (Imm.bitwiseIsEqual(FPImm))
5518 return true;
5519 return false;
5520}
5521
5522bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5523 ISD::LoadExtType ExtTy,
5524 EVT NewVT) const {
5525   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5526
5527 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5528 // relocation target a movq or addq instruction: don't let the load shrink.
5529 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5530 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5531 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5532 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5533
5534 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5535 // those uses are extracted directly into a store, then the extract + store
5536 // can be store-folded. Therefore, it's probably not worth splitting the load.
5537 EVT VT = Load->getValueType(0);
5538 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5539 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5540 // Skip uses of the chain value. Result 0 of the node is the load value.
5541 if (UI.getUse().getResNo() != 0)
5542 continue;
5543
5544 // If this use is not an extract + store, it's probably worth splitting.
5545 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5546 UI->use_begin()->getOpcode() != ISD::STORE)
5547 return true;
5548 }
5549 // All non-chain uses are extract + store.
5550 return false;
5551 }
5552
5553 return true;
5554}
5555
5556/// Returns true if it is beneficial to convert a load of a constant
5557/// to just the constant itself.
5558bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5559 Type *Ty) const {
5560   assert(Ty->isIntegerTy());
5561
5562 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5563 if (BitSize == 0 || BitSize > 64)
5564 return false;
5565 return true;
5566}
5567
5568bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5569 // If we are using XMM registers in the ABI and the condition of the select is
5570 // a floating-point compare and we have blendv or conditional move, then it is
5571 // cheaper to select instead of doing a cross-register move and creating a
5572 // load that depends on the compare result.
5573 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5574 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5575}
5576
5577bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5578 // TODO: It might be a win to ease or lift this restriction, but the generic
5579 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5580 if (VT.isVector() && Subtarget.hasAVX512())
5581 return false;
5582
5583 return true;
5584}
5585
5586bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5587 SDValue C) const {
5588 // TODO: We handle scalars using custom code, but generic combining could make
5589 // that unnecessary.
5590 APInt MulC;
5591 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5592 return false;
5593
5594   // Find the type this will be legalized to. Otherwise we might prematurely
5595 // convert this to shl+add/sub and then still have to type legalize those ops.
5596 // Another choice would be to defer the decision for illegal types until
5597 // after type legalization. But constant splat vectors of i64 can't make it
5598 // through type legalization on 32-bit targets so we would need to special
5599 // case vXi64.
5600 while (getTypeAction(Context, VT) != TypeLegal)
5601 VT = getTypeToTransformTo(Context, VT);
5602
5603 // If vector multiply is legal, assume that's faster than shl + add/sub.
5604   // Multiply is a complex op with higher latency and lower throughput in
5605   // most implementations; sub-vXi32 vector multiplies are always fast,
5606   // vXi32 must not have a SlowPMULLD implementation, and anything larger (vXi64)
5607   // is always going to be slow.
5608 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5609 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5610 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5611 return false;
5612
5613 // shl+add, shl+sub, shl+add+neg
5614 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5615 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5616}
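
The closing test accepts splat constants of the form 2^k + 1, 2^k - 1, 1 - 2^k or -(2^k + 1). A small standalone check mirroring that exact expression (llvm::APInt is the class used above; the sample constants are arbitrary):

#include "llvm/ADT/APInt.h"
#include <cassert>

using llvm::APInt;

// Same power-of-two tests as the return statement above.
static bool decomposable(const APInt &MulC) {
  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}

int main() {
  assert(decomposable(APInt(32, 17)));                     // x*17 -> (x << 4) + x
  assert(decomposable(APInt(32, 15)));                     // x*15 -> (x << 4) - x
  assert(decomposable(APInt(32, -3, /*isSigned=*/true)));  // x*-3 -> -((x << 2) - x)
  assert(!decomposable(APInt(32, 22)));                    // needs more than one shift+add
  return 0;
}
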
5617
5618bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5619 unsigned Index) const {
5620 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5621 return false;
5622
5623 // Mask vectors support all subregister combinations and operations that
5624 // extract half of vector.
5625 if (ResVT.getVectorElementType() == MVT::i1)
5626 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5627 (Index == ResVT.getVectorNumElements()));
5628
5629 return (Index % ResVT.getVectorNumElements()) == 0;
5630}
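
For non-mask vectors the rule above reduces to "the extract must start on a result-sized boundary". A tiny illustrative sketch (helper name and sample values are hypothetical):

#include <cassert>

// EXTRACT_SUBVECTOR is treated as cheap only when the starting index is a
// multiple of the result's element count, i.e. it lands on a subregister
// boundary.
static bool extractSubvectorCheap(unsigned ResNumElts, unsigned Index) {
  return (Index % ResNumElts) == 0;
}

int main() {
  assert(extractSubvectorCheap(4, 4));   // v4i32 from v8i32 at element 4
  assert(!extractSubvectorCheap(4, 2));  // unaligned extract needs a shuffle
  return 0;
}
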
5631
5632bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5633 unsigned Opc = VecOp.getOpcode();
5634
5635 // Assume target opcodes can't be scalarized.
5636 // TODO - do we have any exceptions?
5637 if (Opc >= ISD::BUILTIN_OP_END)
5638 return false;
5639
5640 // If the vector op is not supported, try to convert to scalar.
5641 EVT VecVT = VecOp.getValueType();
5642 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5643 return true;
5644
5645 // If the vector op is supported, but the scalar op is not, the transform may
5646 // not be worthwhile.
5647 EVT ScalarVT = VecVT.getScalarType();
5648 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5649}
5650
5651bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5652 bool) const {
5653 // TODO: Allow vectors?
5654 if (VT.isVector())
5655 return false;
5656 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5657}
5658
5659bool X86TargetLowering::isCheapToSpeculateCttz() const {
5660 // Speculate cttz only if we can directly use TZCNT.
5661 return Subtarget.hasBMI();
5662}
5663
5664bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5665 // Speculate ctlz only if we can directly use LZCNT.
5666 return Subtarget.hasLZCNT();
5667}
5668
5669bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
5670 return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
5671 (VT == MVT::f16 && Subtarget.hasFP16());
5672}
5673
5674bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
5675 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
5676 // expensive than a straight movsd. On the other hand, it's important to
5677 // shrink long double fp constant since fldt is very slow.
5678 return !Subtarget.hasSSE2() || VT == MVT::f80;
5679}
5680
5681bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
5682 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
5683 (VT == MVT::f32 && Subtarget.hasSSE1()) ||
5684 (VT == MVT::f16 && Subtarget.hasFP16());
5685}
5686
5687bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5688 const SelectionDAG &DAG,
5689 const MachineMemOperand &MMO) const {
5690 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5691 BitcastVT.getVectorElementType() == MVT::i1)
5692 return false;
5693
5694 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5695 return false;
5696
5697 // If both types are legal vectors, it's always ok to convert them.
5698 if (LoadVT.isVector() && BitcastVT.isVector() &&
5699 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5700 return true;
5701
5702 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5703}
5704
5705bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5706 const MachineFunction &MF) const {
5707 // Do not merge to float value size (128 bytes) if no implicit
5708 // float attribute is set.
5709 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
5710
5711 if (NoFloat) {
5712 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5713 return (MemVT.getSizeInBits() <= MaxIntSize);
5714 }
5715 // Make sure we don't merge greater than our preferred vector
5716 // width.
5717 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5718 return false;
5719
5720 return true;
5721}
5722
5723bool X86TargetLowering::isCtlzFast() const {
5724 return Subtarget.hasFastLZCNT();
5725}
5726
5727bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5728 const Instruction &AndI) const {
5729 return true;
5730}
5731
5732bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5733 EVT VT = Y.getValueType();
5734
5735 if (VT.isVector())
5736 return false;
5737
5738 if (!Subtarget.hasBMI())
5739 return false;
5740
5741 // There are only 32-bit and 64-bit forms for 'andn'.
5742 if (VT != MVT::i32 && VT != MVT::i64)
5743 return false;
5744
5745 return !isa<ConstantSDNode>(Y);
5746}
5747
5748bool X86TargetLowering::hasAndNot(SDValue Y) const {
5749 EVT VT = Y.getValueType();
5750
5751 if (!VT.isVector())
5752 return hasAndNotCompare(Y);
5753
5754 // Vector.
5755
5756 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5757 return false;
5758
5759 if (VT == MVT::v4i32)
5760 return true;
5761
5762 return Subtarget.hasSSE2();
5763}
5764
5765bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5766 return X.getValueType().isScalarInteger(); // 'bt'
5767}
5768
5769bool X86TargetLowering::
5770 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5771 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5772 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5773 SelectionDAG &DAG) const {
5774 // Does baseline recommend not to perform the fold by default?
5775 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5776 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5777 return false;
5778 // For scalars this transform is always beneficial.
5779 if (X.getValueType().isScalarInteger())
5780 return true;
5781 // If all the shift amounts are identical, then transform is beneficial even
5782 // with rudimentary SSE2 shifts.
5783 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5784 return true;
5786   // If we have AVX2 with its powerful shift operations, then it's also good.
5786 if (Subtarget.hasAVX2())
5787 return true;
5788 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5789 return NewShiftOpcode == ISD::SHL;
5790}
5791
5792bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5793 const SDNode *N, CombineLevel Level) const {
5794   assert(((N->getOpcode() == ISD::SHL &&
5795            N->getOperand(0).getOpcode() == ISD::SRL) ||
5796           (N->getOpcode() == ISD::SRL &&
5797            N->getOperand(0).getOpcode() == ISD::SHL)) &&
5798          "Expected shift-shift mask");
5799 EVT VT = N->getValueType(0);
5800 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5801 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5802 // Only fold if the shift values are equal - so it folds to AND.
5803 // TODO - we should fold if either is a non-uniform vector but we don't do
5804 // the fold for non-splats yet.
5805 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5806 }
5807 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5808}
5809
5810bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5811 EVT VT = Y.getValueType();
5812
5813 // For vectors, we don't have a preference, but we probably want a mask.
5814 if (VT.isVector())
5815 return false;
5816
5817 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5818 if (VT == MVT::i64 && !Subtarget.is64Bit())
5819 return false;
5820
5821 return true;
5822}
5823
5824bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5825 SDNode *N) const {
5826 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5827 !Subtarget.isOSWindows())
5828 return false;
5829 return true;
5830}
5831
5832bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5833 // Any legal vector type can be splatted more efficiently than
5834 // loading/spilling from memory.
5835 return isTypeLegal(VT);
5836}
5837
5838MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5839 MVT VT = MVT::getIntegerVT(NumBits);
5840 if (isTypeLegal(VT))
5841 return VT;
5842
5843 // PMOVMSKB can handle this.
5844 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5845 return MVT::v16i8;
5846
5847 // VPMOVMSKB can handle this.
5848 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5849 return MVT::v32i8;
5850
5851 // TODO: Allow 64-bit type for 32-bit target.
5852 // TODO: 512-bit types should be allowed, but make sure that those
5853 // cases are handled in combineVectorSizedSetCCEquality().
5854
5855 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5856}
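
As a rough illustration of why vector types are returned here (hedged: the actual lowering is decided later, see combineVectorSizedSetCCEquality mentioned above), a 16-byte equality test like the one below can be mapped to v16i8 and handled with a vector compare plus PMOVMSKB instead of a chain of scalar compares:

#include <cstring>

// Hypothetical user code: a 128-bit equality check that the target may lower
// through a single v16i8 compare and mask extraction.
bool eq16(const void *a, const void *b) { return std::memcmp(a, b, 16) == 0; }
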
5857
5858/// Val is the undef sentinel value or equal to the specified value.
5859static bool isUndefOrEqual(int Val, int CmpVal) {
5860 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5861}
5862
5863/// Return true if every element in Mask is the undef sentinel value or equal to
5864 /// the specified value.
5865static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5866 return llvm::all_of(Mask, [CmpVal](int M) {
5867 return (M == SM_SentinelUndef) || (M == CmpVal);
5868 });
5869}
5870
5871/// Val is either the undef or zero sentinel value.
5872static bool isUndefOrZero(int Val) {
5873 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5874}
5875
5876/// Return true if every element in Mask, beginning from position Pos and ending
5877/// in Pos+Size is the undef sentinel value.
5878static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5879 return llvm::all_of(Mask.slice(Pos, Size),
5880 [](int M) { return M == SM_SentinelUndef; });
5881}
5882
5883/// Return true if the mask creates a vector whose lower half is undefined.
5884static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5885 unsigned NumElts = Mask.size();
5886 return isUndefInRange(Mask, 0, NumElts / 2);
5887}
5888
5889/// Return true if the mask creates a vector whose upper half is undefined.
5890static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5891 unsigned NumElts = Mask.size();
5892 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5893}
5894
5895 /// Return true if Val falls within the specified half-open range [Low, Hi).
5896static bool isInRange(int Val, int Low, int Hi) {
5897 return (Val >= Low && Val < Hi);
5898}
5899
5900/// Return true if the value of any element in Mask falls within the specified
5901 /// half-open range [Low, Hi).
5902static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5903 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5904}
5905
5906/// Return true if the value of any element in Mask is the zero sentinel value.
5907static bool isAnyZero(ArrayRef<int> Mask) {
5908 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5909}
5910
5911/// Return true if the value of any element in Mask is the zero or undef
5912/// sentinel values.
5913static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5914 return llvm::any_of(Mask, [](int M) {
5915 return M == SM_SentinelZero || M == SM_SentinelUndef;
5916 });
5917}
5918
5919/// Return true if Val is undef or if its value falls within the
5920 /// specified half-open range [Low, Hi).
5921static bool isUndefOrInRange(int Val, int Low, int Hi) {
5922 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5923}
5924
5925/// Return true if every element in Mask is undef or if its value
5926 /// falls within the specified half-open range [Low, Hi).
5927static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5928 return llvm::all_of(
5929 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5930}
5931
5932/// Return true if Val is undef, zero or if its value falls within the
5933 /// specified half-open range [Low, Hi).
5934static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5935 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5936}
5937
5938/// Return true if every element in Mask is undef, zero or if its value
5939 /// falls within the specified half-open range [Low, Hi).
5940static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5941 return llvm::all_of(
5942 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5943}
5944
5945/// Return true if every element in Mask, beginning
5946/// from position Pos and ending in Pos + Size, falls within the specified
5947/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5948static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5949 unsigned Size, int Low, int Step = 1) {
5950 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5951 if (!isUndefOrEqual(Mask[i], Low))
5952 return false;
5953 return true;
5954}
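
A standalone copy of the loop above with a few example masks (illustrative only; -1 stands in for SM_SentinelUndef):

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

// Mirrors isSequentialOrUndefInRange: every inspected element is undef (-1)
// or equals Low, Low + Step, Low + 2*Step, ...
static bool seqOrUndef(llvm::ArrayRef<int> Mask, unsigned Pos, unsigned Size,
                       int Low, int Step = 1) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
    if (Mask[i] != -1 && Mask[i] != Low)
      return false;
  return true;
}

int main() {
  int A[] = {0, 1, -1, 3};  // undef slot still counts as sequential
  int B[] = {0, 2, 4, 6};   // stride-2 sequence
  int C[] = {0, 1, 3, 2};   // out of order
  assert(seqOrUndef(A, 0, 4, /*Low=*/0));
  assert(seqOrUndef(B, 0, 4, /*Low=*/0, /*Step=*/2));
  assert(!seqOrUndef(C, 0, 4, /*Low=*/0));
  return 0;
}
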
5955
5956/// Return true if every element in Mask, beginning
5957/// from position Pos and ending in Pos+Size, falls within the specified
5958 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
5959static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5960 unsigned Size, int Low,
5961 int Step = 1) {
5962 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5963 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5964 return false;
5965 return true;
5966}
5967
5968/// Return true if every element in Mask, beginning
5969/// from position Pos and ending in Pos+Size is undef or is zero.
5970static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5971 unsigned Size) {
5972 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
5973}
5974
5975/// Helper function to test whether a shuffle mask could be
5976/// simplified by widening the elements being shuffled.
5977///
5978/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5979/// leaves it in an unspecified state.
5980///
5981/// NOTE: This must handle normal vector shuffle masks and *target* vector
5982/// shuffle masks. The latter have the special property of a '-2' representing
5983/// a zero-ed lane of a vector.
5984static bool canWidenShuffleElements(ArrayRef<int> Mask,
5985 SmallVectorImpl<int> &WidenedMask) {
5986 WidenedMask.assign(Mask.size() / 2, 0);
5987 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5988 int M0 = Mask[i];
5989 int M1 = Mask[i + 1];
5990
5991     // If both elements are undef, it's trivial.
5992 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5993 WidenedMask[i / 2] = SM_SentinelUndef;
5994 continue;
5995 }
5996
5997 // Check for an undef mask and a mask value properly aligned to fit with
5998 // a pair of values. If we find such a case, use the non-undef mask's value.
5999 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6000 WidenedMask[i / 2] = M1 / 2;
6001 continue;
6002 }
6003 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6004 WidenedMask[i / 2] = M0 / 2;
6005 continue;
6006 }
6007
6008 // When zeroing, we need to spread the zeroing across both lanes to widen.
6009 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6010 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6011 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6012 WidenedMask[i / 2] = SM_SentinelZero;
6013 continue;
6014 }
6015 return false;
6016 }
6017
6018 // Finally check if the two mask values are adjacent and aligned with
6019 // a pair.
6020 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6021 WidenedMask[i / 2] = M0 / 2;
6022 continue;
6023 }
6024
6025 // Otherwise we can't safely widen the elements used in this shuffle.
6026 return false;
6027 }
6028   assert(WidenedMask.size() == Mask.size() / 2 &&
6029          "Incorrect size of mask after widening the elements!");
6030
6031 return true;
6032}
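
A compact re-statement of the widening rule with a few example masks (illustrative only; -1 = SM_SentinelUndef and -2 = SM_SentinelZero, matching the NOTE above):

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
#include <vector>

// Each adjacent pair of mask elements must collapse to one wider element:
// both undef, one undef plus one suitably aligned value, both zeroable, or an
// aligned pair (2k, 2k+1).
static bool widen(llvm::ArrayRef<int> Mask, std::vector<int> &Out) {
  Out.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == -1 && M1 == -1) { Out[i / 2] = -1; continue; }
    if (M0 == -1 && M1 >= 0 && (M1 % 2) == 1) { Out[i / 2] = M1 / 2; continue; }
    if (M1 == -1 && M0 >= 0 && (M0 % 2) == 0) { Out[i / 2] = M0 / 2; continue; }
    if (M0 == -2 || M1 == -2) {
      if (M0 < 0 && M1 < 0) { Out[i / 2] = -2; continue; }
      return false;
    }
    if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) { Out[i / 2] = M0 / 2; continue; }
    return false;
  }
  return true;
}

int main() {
  std::vector<int> W;
  int A[] = {0, 1, 6, 7};   // aligned pairs -> {0, 3}
  int B[] = {-1, -1, 4, 5}; // undef pair collapses -> {-1, 2}
  int C[] = {1, 2, 4, 5};   // {1,2} is not an aligned pair -> fails
  assert(widen(A, W) && W[0] == 0 && W[1] == 3);
  assert(widen(B, W) && W[0] == -1 && W[1] == 2);
  assert(!widen(C, W));
  return 0;
}
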
6033
6034static bool canWidenShuffleElements(ArrayRef<int> Mask,
6035 const APInt &Zeroable,
6036 bool V2IsZero,
6037 SmallVectorImpl<int> &WidenedMask) {
6038 // Create an alternative mask with info about zeroable elements.
6039 // Here we do not set undef elements as zeroable.
6040 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
6041 if (V2IsZero) {
6042     assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6043 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6044 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6045 ZeroableMask[i] = SM_SentinelZero;
6046 }
6047 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6048}
6049
6050static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6051 SmallVector<int, 32> WidenedMask;
6052 return canWidenShuffleElements(Mask, WidenedMask);
6053}
6054
6055// Attempt to narrow/widen shuffle mask until it matches the target number of
6056// elements.
6057static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6058 SmallVectorImpl<int> &ScaledMask) {
6059 unsigned NumSrcElts = Mask.size();
6060   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6061          "Illegal shuffle scale factor");
6062
6063 // Narrowing is guaranteed to work.
6064 if (NumDstElts >= NumSrcElts) {
6065 int Scale = NumDstElts / NumSrcElts;
6066 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6067 return true;
6068 }
6069
6070 // We have to repeat the widening until we reach the target size, but we can
6071 // split out the first widening as it sets up ScaledMask for us.
6072 if (canWidenShuffleElements(Mask, ScaledMask)) {
6073 while (ScaledMask.size() > NumDstElts) {
6074 SmallVector<int, 16> WidenedMask;
6075 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6076 return false;
6077 ScaledMask = std::move(WidenedMask);
6078 }
6079 return true;
6080 }
6081
6082 return false;
6083}
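
The narrowing direction is the simple half: each wide index expands to Scale consecutive narrow indices (the listed code delegates this to llvm::narrowShuffleMaskElts). A hand-rolled sketch of that behaviour, for illustration only:

#include <cassert>
#include <vector>

// A wide element index M becomes M*Scale .. M*Scale + Scale - 1; sentinel
// values (undef/zero, both negative) are simply replicated per sub-lane.
static std::vector<int> narrowMask(const std::vector<int> &Mask, int Scale) {
  std::vector<int> Out;
  for (int M : Mask)
    for (int s = 0; s != Scale; ++s)
      Out.push_back(M < 0 ? M : M * Scale + s);
  return Out;
}

int main() {
  // A v2i64 swap {1, 0} viewed as v4i32 becomes {2, 3, 0, 1}.
  std::vector<int> Narrow = narrowMask({1, 0}, 2);
  assert((Narrow == std::vector<int>{2, 3, 0, 1}));
  return 0;
}
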
6084
6085/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6086bool X86::isZeroNode(SDValue Elt) {
6087 return isNullConstant(Elt) || isNullFPConstant(Elt);
6088}
6089
6090// Build a vector of constants.
6091// Use an UNDEF node if MaskElt == -1.
6092// Split 64-bit constants in the 32-bit mode.
6093static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6094 const SDLoc &dl, bool IsMask = false) {
6095
6096 SmallVector<SDValue, 32> Ops;
6097 bool Split = false;
6098
6099 MVT ConstVecVT = VT;
6100 unsigned NumElts = VT.getVectorNumElements();
6101 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6102 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6103 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6104 Split = true;
6105 }
6106
6107 MVT EltVT = ConstVecVT.getVectorElementType();
6108 for (unsigned i = 0; i < NumElts; ++i) {
6109 bool IsUndef = Values[i] < 0 && IsMask;
6110 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6111 DAG.getConstant(Values[i], dl, EltVT);
6112 Ops.push_back(OpNode);
6113 if (Split)
6114 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6115 DAG.getConstant(0, dl, EltVT));
6116 }
6117 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6118 if (Split)
6119 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6120 return ConstsNode;
6121}
6122
6123static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6124 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6125   assert(Bits.size() == Undefs.getBitWidth() &&
6126          "Unequal constant and undef arrays");
6127 SmallVector<SDValue, 32> Ops;
6128 bool Split = false;
6129
6130 MVT ConstVecVT = VT;
6131 unsigned NumElts = VT.getVectorNumElements();
6132 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6133 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6134 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6135 Split = true;
6136 }
6137
6138 MVT EltVT = ConstVecVT.getVectorElementType();
6139 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6140 if (Undefs[i]) {
6141 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6142 continue;
6143 }
6144 const APInt &V = Bits[i];
6145 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6146 if (Split) {
6147 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6148 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6149 } else if (EltVT == MVT::f32) {
6150 APFloat FV(APFloat::IEEEsingle(), V);
6151 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6152 } else if (EltVT == MVT::f64) {
6153 APFloat FV(APFloat::IEEEdouble(), V);
6154 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6155 } else {
6156 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6157 }
6158 }
6159
6160 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6161 return DAG.getBitcast(VT, ConstsNode);
6162}
6163
6164/// Returns a vector of specified type with all zero elements.
6165static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6166 SelectionDAG &DAG, const SDLoc &dl) {
6167 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6168 VT.getVectorElementType() == MVT::i1) &&
6169 "Unexpected vector type");
6170
6171 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6172 // type. This ensures they get CSE'd. But if the integer type is not
6173 // available, use a floating-point +0.0 instead.
6174 SDValue Vec;
6175 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6176 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6177 } else if (VT.isFloatingPoint()) {
6178 Vec = DAG.getConstantFP(+0.0, dl, VT);
6179 } else if (VT.getVectorElementType() == MVT::i1) {
6180 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6181 "Unexpected vector type");
6182 Vec = DAG.getConstant(0, dl, VT);
6183 } else {
6184 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6185 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6186 }
6187 return DAG.getBitcast(VT, Vec);
6188}
6189
6190// Helper to determine if the ops are all extracted subvectors that come from a
6191// single source. If we allow commuting they don't have to be in order (Lo/Hi).
6192static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6193 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6194 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6195 LHS.getValueType() != RHS.getValueType() ||
6196 LHS.getOperand(0) != RHS.getOperand(0))
6197 return SDValue();
6198
6199 SDValue Src = LHS.getOperand(0);
6200 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6201 return SDValue();
6202
6203 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6204 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6205 RHS.getConstantOperandAPInt(1) == NumElts) ||
6206 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6207 LHS.getConstantOperandAPInt(1) == NumElts))
6208 return Src;
6209
6210 return SDValue();
6211}
6212
6213static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6214 const SDLoc &dl, unsigned vectorWidth) {
6215 EVT VT = Vec.getValueType();
6216 EVT ElVT = VT.getVectorElementType();
6217 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6218 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6219 VT.getVectorNumElements() / Factor);
6220
6221 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6222 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6223 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6224
6225 // This is the index of the first element of the vectorWidth-bit chunk
6226 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6227 IdxVal &= ~(ElemsPerChunk - 1);
6228
6229 // If the input is a buildvector just emit a smaller one.
6230 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6231 return DAG.getBuildVector(ResultVT, dl,
6232 Vec->ops().slice(IdxVal, ElemsPerChunk));
6233
6234 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6235 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6236}
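// A small constexpr sketch (not part of the LLVM sources) of the index rounding
// above: because ElemsPerChunk is a power of two, clearing its low bits aligns
// IdxVal down to the first element of the chunk that contains it.
static constexpr unsigned alignToChunkSketch(unsigned IdxVal, unsigned ElemsPerChunk) {
  return IdxVal & ~(ElemsPerChunk - 1); // assumes ElemsPerChunk is a power of 2
}
// With 128-bit chunks of a v8i32 source (4 x i32 per chunk), element index 5
// rounds down to 4, the first element of the upper chunk.
static_assert(alignToChunkSketch(5, 4) == 4, "chunk alignment example");
static_assert(alignToChunkSketch(3, 4) == 0, "chunk alignment example");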
6237
6238/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6239/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6240/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6241/// instructions or a simple subregister reference. Idx is an index in the
6242/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6243/// lowering EXTRACT_VECTOR_ELT operations easier.
6244static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6245 SelectionDAG &DAG, const SDLoc &dl) {
6246 assert((Vec.getValueType().is256BitVector() ||
6247 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6248 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6249}
6250
6251/// Generate a DAG to grab 256-bits from a 512-bit vector.
6252static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6253 SelectionDAG &DAG, const SDLoc &dl) {
6254 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6255 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6256}
6257
6258static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6259 SelectionDAG &DAG, const SDLoc &dl,
6260 unsigned vectorWidth) {
6261 assert((vectorWidth == 128 || vectorWidth == 256) &&
6262 "Unsupported vector width");
6263 // Inserting UNDEF is a no-op: just return Result.
6264 if (Vec.isUndef())
6265 return Result;
6266 EVT VT = Vec.getValueType();
6267 EVT ElVT = VT.getVectorElementType();
6268 EVT ResultVT = Result.getValueType();
6269
6270 // Insert the relevant vectorWidth bits.
6271 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6272 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6273
6274 // This is the index of the first element of the vectorWidth-bit chunk
6275 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6276 IdxVal &= ~(ElemsPerChunk - 1);
6277
6278 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6279 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6280}
6281
6282/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6283/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6284/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6285/// simple superregister reference. Idx is an index in the 128 bits
6286/// we want. It need not be aligned to a 128-bit boundary. That makes
6287/// lowering INSERT_VECTOR_ELT operations easier.
6288static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6289 SelectionDAG &DAG, const SDLoc &dl) {
6290 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6291 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6292}
6293
6294/// Widen a vector to a larger size with the same scalar type, with the new
6295/// elements either zero or undef.
6296static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6297 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6298 const SDLoc &dl) {
6299 assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
6300 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6301 "Unsupported vector widening type");
6302 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6303 : DAG.getUNDEF(VT);
6304 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6305 DAG.getIntPtrConstant(0, dl));
6306}
6307
6308/// Widen a vector to a larger size with the same scalar type, with the new
6309/// elements either zero or undef.
6310static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6311 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6312 const SDLoc &dl, unsigned WideSizeInBits) {
6313 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6314 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6315 "Unsupported vector widening type");
6316 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6317 MVT SVT = Vec.getSimpleValueType().getScalarType();
6318 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6319 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6320}
6321
6322// Helper function to collect subvector ops that are concatenated together,
6323// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6324// The subvectors in Ops are guaranteed to be the same type.
6325static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
6326 assert(Ops.empty() && "Expected an empty ops vector");
6327
6328 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6329 Ops.append(N->op_begin(), N->op_end());
6330 return true;
6331 }
6332
6333 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6334 SDValue Src = N->getOperand(0);
6335 SDValue Sub = N->getOperand(1);
6336 const APInt &Idx = N->getConstantOperandAPInt(2);
6337 EVT VT = Src.getValueType();
6338 EVT SubVT = Sub.getValueType();
6339
6340 // TODO - Handle more general insert_subvector chains.
6341 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6342 Idx == (VT.getVectorNumElements() / 2)) {
6343 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6344 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6345 Src.getOperand(1).getValueType() == SubVT &&
6346 isNullConstant(Src.getOperand(2))) {
6347 Ops.push_back(Src.getOperand(1));
6348 Ops.push_back(Sub);
6349 return true;
6350 }
6351 // insert_subvector(x, extract_subvector(x, lo), hi)
6352 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6353 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6354 Ops.append(2, Sub);
6355 return true;
6356 }
6357 }
6358 }
6359
6360 return false;
6361}
6362
6363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6364 const SDLoc &dl) {
6365 EVT VT = Op.getValueType();
6366 unsigned NumElems = VT.getVectorNumElements();
6367 unsigned SizeInBits = VT.getSizeInBits();
6368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6369 "Can't split odd sized vector");
6370
6371 // If this is a splat value (with no-undefs) then use the lower subvector,
6372 // which should be a free extraction.
6373 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6374 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6375 return std::make_pair(Lo, Lo);
6376
6377 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6378 return std::make_pair(Lo, Hi);
6379}
6380
6381/// Break an operation into 2 half sized ops and then concatenate the results.
6382static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6383 unsigned NumOps = Op.getNumOperands();
6384 EVT VT = Op.getValueType();
6385 SDLoc dl(Op);
6386
6387 // Extract the LHS Lo/Hi vectors
6388 SmallVector<SDValue> LoOps(NumOps, SDValue());
6389 SmallVector<SDValue> HiOps(NumOps, SDValue());
6390 for (unsigned I = 0; I != NumOps; ++I) {
6391 SDValue SrcOp = Op.getOperand(I);
6392 if (!SrcOp.getValueType().isVector()) {
6393 LoOps[I] = HiOps[I] = SrcOp;
6394 continue;
6395 }
6396 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6397 }
6398
6399 EVT LoVT, HiVT;
6400 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6401 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6402 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6403 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6404}
6405
6406/// Break a unary integer operation into 2 half sized ops and then
6407/// concatenate the result back.
6408static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6409 // Make sure we only try to split 256/512-bit types to avoid creating
6410 // narrow vectors.
6411 EVT VT = Op.getValueType();
6412 (void)VT;
6413 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6414 Op.getOperand(0).getValueType().is512BitVector()) &&
6415 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6416 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6417 VT.getVectorNumElements() &&
6418 "Unexpected VTs!");
6419 return splitVectorOp(Op, DAG);
6420}
6421
6422/// Break a binary integer operation into 2 half sized ops and then
6423/// concatenate the result back.
6424static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6425 // Assert that all the types match.
6426 EVT VT = Op.getValueType();
6427 (void)VT;
6428 assert(Op.getOperand(0).getValueType() == VT &&
6429 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6430 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6431 return splitVectorOp(Op, DAG);
6432}
6433
6434// Helper for splitting operands of an operation to legal target size and
6435// apply a function on each part.
6436// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6437// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6438// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6439// The argument Builder is a function that will be applied on each split part:
6440// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6441template <typename F>
6442SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6443 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6444 F Builder, bool CheckBWI = true) {
6445 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6446 unsigned NumSubs = 1;
6447 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6448 (!CheckBWI && Subtarget.useAVX512Regs())) {
6449 if (VT.getSizeInBits() > 512) {
6450 NumSubs = VT.getSizeInBits() / 512;
6451 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6452 }
6453 } else if (Subtarget.hasAVX2()) {
6454 if (VT.getSizeInBits() > 256) {
6455 NumSubs = VT.getSizeInBits() / 256;
6456 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6457 }
6458 } else {
6459 if (VT.getSizeInBits() > 128) {
6460 NumSubs = VT.getSizeInBits() / 128;
6461 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6462 }
6463 }
6464
6465 if (NumSubs == 1)
6466 return Builder(DAG, DL, Ops);
6467
6468 SmallVector<SDValue, 4> Subs;
6469 for (unsigned i = 0; i != NumSubs; ++i) {
6470 SmallVector<SDValue, 2> SubOps;
6471 for (SDValue Op : Ops) {
6472 EVT OpVT = Op.getValueType();
6473 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6474 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6475 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6476 }
6477 Subs.push_back(Builder(DAG, DL, SubOps));
6478 }
6479 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6480}
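// A standalone sketch of the split-and-apply control flow above, modelling a
// wide vector as std::vector<int> and the Builder as a callable on each piece.
// It is only an analogy for the structure; nothing here uses SelectionDAG.
#include <vector>

template <typename F>
static std::vector<int> splitAndApplySketch(const std::vector<int> &Wide,
                                            unsigned NumSubs, F Builder) {
  std::vector<int> Result;
  unsigned SubSize = Wide.size() / NumSubs; // assumes an even split, as asserted above
  for (unsigned I = 0; I != NumSubs; ++I) {
    std::vector<int> Piece(Wide.begin() + I * SubSize,
                           Wide.begin() + (I + 1) * SubSize);
    std::vector<int> Built = Builder(Piece); // the per-128/256/512-bit lowering
    Result.insert(Result.end(), Built.begin(), Built.end());
  }
  return Result; // concatenation of the per-piece results
}

// e.g. splitAndApplySketch(V, 2, [](std::vector<int> P) { for (int &X : P) X += 1; return P; })
// lowers each half independently and concatenates the two halves back together.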
6481
6482// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6483// targets.
6484static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6485 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6486 const X86Subtarget &Subtarget) {
6487 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6488 MVT SVT = VT.getScalarType();
6489
6490 // If we have a 32/64 splatted constant, splat it to DstTy to
6491 // encourage a foldable broadcast'd operand.
6492 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6493 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6494 // AVX512 broadcasts 32/64-bit operands.
6495 // TODO: Support float once getAVX512Node is used by fp-ops.
6496 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6497 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6498 return SDValue();
6499 // If we're not widening, don't bother if we're not bitcasting.
6500 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6501 return SDValue();
6502 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6503 APInt SplatValue, SplatUndef;
6504 unsigned SplatBitSize;
6505 bool HasAnyUndefs;
6506 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6507 HasAnyUndefs, OpEltSizeInBits) &&
6508 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6509 return DAG.getConstant(SplatValue, DL, DstVT);
6510 }
6511 return SDValue();
6512 };
6513
6514 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6515
6516 MVT DstVT = VT;
6517 if (Widen)
6518 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6519
6520 // Canonicalize src operands.
6521 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6522 for (SDValue &Op : SrcOps) {
6523 MVT OpVT = Op.getSimpleValueType();
6524 // Just pass through scalar operands.
6525 if (!OpVT.isVector())
6526 continue;
6527 assert(OpVT == VT && "Vector type mismatch");
6528
6529 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6530 Op = BroadcastOp;
6531 continue;
6532 }
6533
6534 // Just widen the subvector by inserting into an undef wide vector.
6535 if (Widen)
6536 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6537 }
6538
6539 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6540
6541 // Perform the 512-bit op then extract the bottom subvector.
6542 if (Widen)
6543 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6544 return Res;
6545}
6546
6547/// Insert i1-subvector to i1-vector.
6548static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6549 const X86Subtarget &Subtarget) {
6550
6551 SDLoc dl(Op);
6552 SDValue Vec = Op.getOperand(0);
6553 SDValue SubVec = Op.getOperand(1);
6554 SDValue Idx = Op.getOperand(2);
6555 unsigned IdxVal = Op.getConstantOperandVal(2);
6556
6557 // Inserting undef is a nop. We can just return the original vector.
6558 if (SubVec.isUndef())
6559 return Vec;
6560
6561 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6562 return Op;
6563
6564 MVT OpVT = Op.getSimpleValueType();
6565 unsigned NumElems = OpVT.getVectorNumElements();
6566 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6567
6568 // Extend to natively supported kshift.
6569 MVT WideOpVT = OpVT;
6570 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6571 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6572
6573 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6574 // if necessary.
6575 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6576 // May need to promote to a legal type.
6577 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6578 DAG.getConstant(0, dl, WideOpVT),
6579 SubVec, Idx);
6580 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6581 }
6582
6583 MVT SubVecVT = SubVec.getSimpleValueType();
6584 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6585 assert(IdxVal + SubVecNumElems <= NumElems &&
6586 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6587 "Unexpected index value in INSERT_SUBVECTOR");
6588
6589 SDValue Undef = DAG.getUNDEF(WideOpVT);
6590
6591 if (IdxVal == 0) {
6592 // Zero lower bits of the Vec
6593 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6594 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6595 ZeroIdx);
6596 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6597 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6598 // Merge them together, SubVec should be zero extended.
6599 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6600 DAG.getConstant(0, dl, WideOpVT),
6601 SubVec, ZeroIdx);
6602 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6603 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6604 }
6605
6606 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6607 Undef, SubVec, ZeroIdx);
6608
6609 if (Vec.isUndef()) {
6610 assert(IdxVal != 0 && "Unexpected index");
6611 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6612 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6614 }
6615
6616 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6617 assert(IdxVal != 0 && "Unexpected index");
6618 // If upper elements of Vec are known undef, then just shift into place.
6619 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6620 [](SDValue V) { return V.isUndef(); })) {
6621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6623 } else {
6624 NumElems = WideOpVT.getVectorNumElements();
6625 unsigned ShiftLeft = NumElems - SubVecNumElems;
6626 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6628 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6629 if (ShiftRight != 0)
6630 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6631 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6632 }
6633 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6634 }
6635
6636 // Simple case when we put the subvector in the upper part.
6637 if (IdxVal + SubVecNumElems == NumElems) {
6638 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6639 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6640 if (SubVecNumElems * 2 == NumElems) {
6641 // Special case, use legal zero extending insert_subvector. This allows
6642 // isel to optimize when bits are known zero.
6643 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6644 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6645 DAG.getConstant(0, dl, WideOpVT),
6646 Vec, ZeroIdx);
6647 } else {
6648 // Otherwise use explicit shifts to zero the bits.
6649 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6650 Undef, Vec, ZeroIdx);
6651 NumElems = WideOpVT.getVectorNumElements();
6652 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6653 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6654 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6655 }
6656 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6657 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6658 }
6659
6660 // Inserting into the middle is more complicated.
6661
6662 NumElems = WideOpVT.getVectorNumElements();
6663
6664 // Widen the vector if needed.
6665 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6666
6667 unsigned ShiftLeft = NumElems - SubVecNumElems;
6668 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6669
6670 // Do an optimization for the most frequently used types.
6671 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6672 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6673 Mask0.flipAllBits();
6674 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6675 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6676 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6677 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6678 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6679 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6680 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6681 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6682
6683 // Reduce to original width if needed.
6684 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6685 }
6686
6687 // Clear the upper bits of the subvector and move it to its insert position.
6688 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6689 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6690 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6691 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6692
6693 // Isolate the bits below the insertion point.
6694 unsigned LowShift = NumElems - IdxVal;
6695 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6696 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6697 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6698 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6699
6700 // Isolate the bits after the last inserted bit.
6701 unsigned HighShift = IdxVal + SubVecNumElems;
6702 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6703 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6704 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6705 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6706
6707 // Now OR all 3 pieces together.
6708 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6709 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6710
6711 // Reduce to original width if needed.
6712 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6713}
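// A scalar model (illustrative only) of the mask-register insertion above:
// treat the k-register as the low NumElems bits of a uint64_t. The left/right
// shift pair first drops any SubVec bits above its own width and then lands the
// field at bit IdxVal; the AND clears the destination field before the final OR.
#include <cstdint>

static uint64_t insertBitFieldSketch(uint64_t Vec, uint64_t SubVec,
                                     unsigned NumElems, unsigned SubNumElems,
                                     unsigned IdxVal) {
  // Assumes NumElems < 64 so every shift below is well-defined.
  unsigned ShiftLeft = NumElems - SubNumElems;
  unsigned ShiftRight = NumElems - SubNumElems - IdxVal;
  uint64_t WideMask = (1ULL << NumElems) - 1;
  uint64_t Field = ((SubVec << ShiftLeft) & WideMask) >> ShiftRight; // KSHIFTL, KSHIFTR
  uint64_t Hole = ~(((1ULL << SubNumElems) - 1) << IdxVal); // ~getBitsSet(Idx, Idx + N)
  return (Vec & Hole) | Field;                              // AND out the field, OR in SubVec
}

// e.g. inserting the 4-bit value 0b1010 at bit 2 of the 16-bit mask 0xFFFF:
//   insertBitFieldSketch(0xFFFF, 0xA, 16, 4, 2) == 0xFFEB (0b1111111111101011).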
6714
6715static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6716 const SDLoc &dl) {
6717 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6718 EVT SubVT = V1.getValueType();
6719 EVT SubSVT = SubVT.getScalarType();
6720 unsigned SubNumElts = SubVT.getVectorNumElements();
6721 unsigned SubVectorWidth = SubVT.getSizeInBits();
6722 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6723 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6724 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6725}
6726
6727/// Returns a vector of specified type with all bits set.
6728/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6729/// Then bitcast to their original type, ensuring they get CSE'd.
6730static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6731 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6732 "Expected a 128/256/512-bit vector type");
6733
6734 APInt Ones = APInt::getAllOnes(32);
6735 unsigned NumElts = VT.getSizeInBits() / 32;
6736 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6737 return DAG.getBitcast(VT, Vec);
6738}
6739
6740// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6741static unsigned getOpcode_EXTEND(unsigned Opcode) {
6742 switch (Opcode) {
6743 case ISD::ANY_EXTEND:
6744 case ISD::ANY_EXTEND_VECTOR_INREG:
6745 return ISD::ANY_EXTEND;
6746 case ISD::ZERO_EXTEND:
6747 case ISD::ZERO_EXTEND_VECTOR_INREG:
6748 return ISD::ZERO_EXTEND;
6749 case ISD::SIGN_EXTEND:
6750 case ISD::SIGN_EXTEND_VECTOR_INREG:
6751 return ISD::SIGN_EXTEND;
6752 }
6753 llvm_unreachable("Unknown opcode");
6754}
6755
6756// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6757static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6758 switch (Opcode) {
6759 case ISD::ANY_EXTEND:
6760 case ISD::ANY_EXTEND_VECTOR_INREG:
6761 return ISD::ANY_EXTEND_VECTOR_INREG;
6762 case ISD::ZERO_EXTEND:
6763 case ISD::ZERO_EXTEND_VECTOR_INREG:
6764 return ISD::ZERO_EXTEND_VECTOR_INREG;
6765 case ISD::SIGN_EXTEND:
6766 case ISD::SIGN_EXTEND_VECTOR_INREG:
6767 return ISD::SIGN_EXTEND_VECTOR_INREG;
6768 }
6769 llvm_unreachable("Unknown opcode");
6770}
6771
6772static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6773 SDValue In, SelectionDAG &DAG) {
6774 EVT InVT = In.getValueType();
6775 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6776 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6777 ISD::ZERO_EXTEND == Opcode) &&
6778 "Unknown extension opcode");
6779
6780 // For 256-bit vectors, we only need the lower (128-bit) input half.
6781 // For 512-bit vectors, we only need the lower input half or quarter.
6782 if (InVT.getSizeInBits() > 128) {
6783 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6784 "Expected VTs to be the same size!");
6785 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6786 In = extractSubVector(In, 0, DAG, DL,
6787 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6788 InVT = In.getValueType();
6789 }
6790
6791 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6792 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6793
6794 return DAG.getNode(Opcode, DL, VT, In);
6795}
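// A small standalone check of the "lower half or quarter" arithmetic above:
// only VT.getSizeInBits() / Scale bits of the input carry lanes that survive
// the in-register extension, clamped to at least one full 128-bit subvector.
// The helper name is illustrative, not an LLVM API.
static constexpr unsigned neededInputBitsSketch(unsigned VTBits, unsigned DstEltBits,
                                                unsigned SrcEltBits) {
  return (VTBits / (DstEltBits / SrcEltBits)) < 128u
             ? 128u
             : VTBits / (DstEltBits / SrcEltBits);
}
// e.g. a 512-bit v16i32 result extended from i16 lanes only needs the low 256
// bits of its 512-bit input; extended from i8 lanes it only needs the low 128.
static_assert(neededInputBitsSketch(512, 32, 16) == 256, "lower half of the input");
static_assert(neededInputBitsSketch(512, 32, 8) == 128, "lower quarter of the input");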
6796
6797// Match (xor X, -1) -> X.
6798// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6799// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6800static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6801 V = peekThroughBitcasts(V);
6802 if (V.getOpcode() == ISD::XOR &&
6803 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6804 return V.getOperand(0);
6805 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6806 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6807 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6808 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6810 Not, V.getOperand(1));
6811 }
6812 }
6813 SmallVector<SDValue, 2> CatOps;
6814 if (collectConcatOps(V.getNode(), CatOps)) {
6815 for (SDValue &CatOp : CatOps) {
6816 SDValue NotCat = IsNOT(CatOp, DAG);
6817 if (!NotCat) return SDValue();
6818 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6819 }
6820 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6821 }
6822 return SDValue();
6823}
6824
6825void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6826 bool Lo, bool Unary) {
6827 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6828 "Illegal vector type to unpack");
6829 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6830 int NumElts = VT.getVectorNumElements();
6831 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6832 for (int i = 0; i < NumElts; ++i) {
6833 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6834 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6835 Pos += (Unary ? 0 : NumElts * (i % 2));
6836 Pos += (Lo ? 0 : NumEltsInLane / 2);
6837 Mask.push_back(Pos);
6838 }
6839}
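// A standalone reproduction (plain std::vector, no MVT) of the mask computation
// above, handy for sanity-checking the per-128-bit-lane interleave pattern.
#include <vector>

static std::vector<int> unpackMaskSketch(int NumElts, int EltBits, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int NumEltsInLane = 128 / EltBits;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2)); // odd lanes read from the second operand
    Pos += (Lo ? 0 : NumEltsInLane / 2);    // the Hi variant starts at the lane's upper half
    Mask.push_back(Pos);
  }
  return Mask;
}

// e.g. unpackMaskSketch(8, 16, /*Lo=*/true, /*Unary=*/false) yields
// {0, 8, 1, 9, 2, 10, 3, 11}, the PUNPCKLWD interleave of two v8i16 inputs.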
6840
6841/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6842/// imposed by AVX and specific to the unary pattern. Example:
6843/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6844/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6845void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6846 bool Lo) {
6847 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6848 int NumElts = VT.getVectorNumElements();
6849 for (int i = 0; i < NumElts; ++i) {
6850 int Pos = i / 2;
6851 Pos += (Lo ? 0 : NumElts / 2);
6852 Mask.push_back(Pos);
6853 }
6854}
6855
6856// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
6857static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
6858 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
6859 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
6860 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
6861 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
6862 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
6863 int M = Mask[I];
6864 if (M < 0)
6865 continue;
6866 SDValue V = (M < NumElts) ? V1 : V2;
6867 if (V.isUndef())
6868 continue;
6869 Ops[I] = V.getOperand(M % NumElts);
6870 }
6871 return DAG.getBuildVector(VT, dl, Ops);
6872 }
6873
6874 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6875}
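// A standalone sketch of the constant-folding path above: a shuffle of two
// constant vectors is just an element gather, where mask indices below NumElts
// read from V1 and the rest read from V2. It assumes the mask has no undef (-1)
// lanes, purely to keep the sketch short; the helper name is illustrative only.
#include <vector>

static std::vector<int>
foldConstantShuffleSketch(const std::vector<int> &V1, const std::vector<int> &V2,
                          const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  std::vector<int> Out;
  for (int M : Mask)
    Out.push_back(M < NumElts ? V1[M] : V2[M - NumElts]); // gather by mask index
  return Out;
}

// e.g. foldConstantShuffleSketch({1, 2, 3, 4}, {5, 6, 7, 8}, {0, 4, 1, 5})
// returns {1, 5, 2, 6}, matching what the build_vector above would contain.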
6876
6877/// Returns a vector_shuffle node for an unpackl operation.
6878static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6879 SDValue V1, SDValue V2) {
6880 SmallVector<int, 8> Mask;
6881 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6882 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6883}
6884
6885/// Returns a vector_shuffle node for an unpackh operation.
6886static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6887 SDValue V1, SDValue V2) {
6888 SmallVector<int, 8> Mask;
6889 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6890 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6891}
6892
6893/// Returns a node that packs the LHS + RHS nodes together at half width.
6894/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
6895/// TODO: Add subvector splitting if/when we have a need for it.
6896static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6897 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
6898 bool PackHiHalf = false) {
6899 MVT OpVT = LHS.getSimpleValueType();
6900 unsigned EltSizeInBits = VT.getScalarSizeInBits();
6901 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
6902 assert(OpVT == RHS.getSimpleValueType() &&
6903 VT.getSizeInBits() == OpVT.getSizeInBits() &&
6904 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
6905 "Unexpected PACK operand types");
6906 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
6907 "Unexpected PACK result type");
6908
6909 // Rely on vector shuffles for vXi64 -> vXi32 packing.
6910 if (EltSizeInBits == 32) {
6911 SmallVector<int> PackMask;
6912 int Offset = PackHiHalf ? 1 : 0;
6913 int NumElts = VT.getVectorNumElements();
6914 for (int I = 0; I != NumElts; I += 4) {
6915 PackMask.push_back(I + Offset);
6916 PackMask.push_back(I + Offset + 2);
6917 PackMask.push_back(I + Offset + NumElts);
6918 PackMask.push_back(I + Offset + NumElts + 2);
6919 }
6920 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
6921 DAG.getBitcast(VT, RHS), PackMask);
6922 }
6923
6924 // See if we already have sufficient leading bits for PACKSS/PACKUS.
6925 if (!PackHiHalf) {
6926 if (UsePackUS &&
6927 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
6928 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
6929 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6930
6931 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
6932 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
6933 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6934 }
6935
6936 // Fallback to sign/zero extending the requested half and pack.
6937 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
6938 if (UsePackUS) {
6939 if (PackHiHalf) {
6940 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
6941 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
6942 } else {
6943 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
6944 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
6945 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
6946 }
6947 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6948 }
6949
6950 if (!PackHiHalf) {
6951 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
6952 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
6953 }
6954 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
6955 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
6956 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6957}
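// A scalar sketch of why the VSHLI/VSRAI pair above lets PACKSS act as a plain
// truncation for the low half: once each 32-bit lane holds a sign-extended
// 16-bit value, signed saturation to 16 bits can no longer clamp anything and
// simply returns the low 16 bits. All helper names here are illustrative only.
#include <cstdint>

static int16_t saturateTo16(int32_t V) { // what PACKSSDW does per lane
  if (V > INT16_MAX) return INT16_MAX;
  if (V < INT16_MIN) return INT16_MIN;
  return static_cast<int16_t>(V);
}

static int32_t signExtendLow16(int32_t X) { // what VSHLI then VSRAI by 16 leave in a lane
  int32_t Low = static_cast<int32_t>(static_cast<uint32_t>(X) & 0xFFFFu); // 0 .. 0xFFFF
  return Low >= 0x8000 ? Low - 0x10000 : Low;
}

// e.g. saturateTo16(signExtendLow16(0x12345678)) == 0x5678 (22136), whereas
// saturating 0x12345678 directly would have clamped it to INT16_MAX.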
6958
6959/// Return a vector_shuffle of the specified vector and a zero or undef vector.
6960/// This produces a shuffle where the low element of V2 is swizzled into the
6961/// zero/undef vector, landing at element Idx.
6962/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6963static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6964 bool IsZero,
6965 const X86Subtarget &Subtarget,
6966 SelectionDAG &DAG) {
6967 MVT VT = V2.getSimpleValueType();
6968 SDValue V1 = IsZero
6969 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6970 int NumElems = VT.getVectorNumElements();
6971 SmallVector<int, 16> MaskVec(NumElems);
6972 for (int i = 0; i != NumElems; ++i)
6973 // If this is the insertion idx, put the low elt of V2 here.
6974 MaskVec[i] = (i == Idx) ? NumElems : i;
6975 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6976}
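
The mask built by getShuffleVectorZeroOrUndef is easiest to see at a concrete width. The sketch below reproduces just the index arithmetic for a 4-element vector; the helper name is illustrative only.

#include <cstdio>
#include <vector>

// V1 is the zero/undef vector (indices 0..NumElems-1), V2 supplies element Idx
// via index NumElems (the low element of the second shuffle operand).
static std::vector<int> zeroOrUndefMask(int NumElems, int Idx) {
  std::vector<int> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return MaskVec;
}

int main() {
  for (int M : zeroOrUndefMask(4, 0)) std::printf("%d ", M); // 4 1 2 3
  std::printf("\n");
  for (int M : zeroOrUndefMask(4, 3)) std::printf("%d ", M); // 0 1 2 4
  std::printf("\n");
}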
6977
6978static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6979 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6980 Ptr.getOpcode() == X86ISD::WrapperRIP)
6981 Ptr = Ptr.getOperand(0);
6982
6983 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6984 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6985 return nullptr;
6986
6987 return CNode->getConstVal();
6988}
6989
6990static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6991 if (!Load || !ISD::isNormalLoad(Load))
6992 return nullptr;
6993 return getTargetConstantFromBasePtr(Load->getBasePtr());
6994}
6995
6996static const Constant *getTargetConstantFromNode(SDValue Op) {
6997 Op = peekThroughBitcasts(Op);
6998 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6999}
7000
7001const Constant *
7002X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7003 assert(LD && "Unexpected null LoadSDNode");
7004 return getTargetConstantFromNode(LD);
7005}
7006
7007// Extract raw constant bits from constant pools.
7008static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7009 APInt &UndefElts,
7010 SmallVectorImpl<APInt> &EltBits,
7011 bool AllowWholeUndefs = true,
7012 bool AllowPartialUndefs = true) {
7013 assert(EltBits.empty() && "Expected an empty EltBits vector");
7014
7015 Op = peekThroughBitcasts(Op);
7016
7017 EVT VT = Op.getValueType();
7018 unsigned SizeInBits = VT.getSizeInBits();
7019 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7020 unsigned NumElts = SizeInBits / EltSizeInBits;
7021
7022 // Bitcast a source array of element bits to the target size.
7023 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7024 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7025 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7026 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7027 "Constant bit sizes don't match");
7028
7029 // Don't split if we don't allow undef bits.
7030 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7031 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7032 return false;
7033
7034 // If we're already the right size, don't bother bitcasting.
7035 if (NumSrcElts == NumElts) {
7036 UndefElts = UndefSrcElts;
7037 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7038 return true;
7039 }
7040
7041 // Extract all the undef/constant element data and pack into single bitsets.
7042 APInt UndefBits(SizeInBits, 0);
7043 APInt MaskBits(SizeInBits, 0);
7044
7045 for (unsigned i = 0; i != NumSrcElts; ++i) {
7046 unsigned BitOffset = i * SrcEltSizeInBits;
7047 if (UndefSrcElts[i])
7048 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7049 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7050 }
7051
7052 // Split the undef/constant single bitset data into the target elements.
7053 UndefElts = APInt(NumElts, 0);
7054 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7055
7056 for (unsigned i = 0; i != NumElts; ++i) {
7057 unsigned BitOffset = i * EltSizeInBits;
7058 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7059
7060 // Only treat an element as UNDEF if all bits are UNDEF.
7061 if (UndefEltBits.isAllOnes()) {
7062 if (!AllowWholeUndefs)
7063 return false;
7064 UndefElts.setBit(i);
7065 continue;
7066 }
7067
7068 // If only some bits are UNDEF then treat them as zero (or bail if not
7069 // supported).
7070 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7071 return false;
7072
7073 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7074 }
7075 return true;
7076 };
7077
7078 // Collect constant bits and insert into mask/undef bit masks.
7079 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7080 unsigned UndefBitIndex) {
7081 if (!Cst)
7082 return false;
7083 if (isa<UndefValue>(Cst)) {
7084 Undefs.setBit(UndefBitIndex);
7085 return true;
7086 }
7087 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7088 Mask = CInt->getValue();
7089 return true;
7090 }
7091 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7092 Mask = CFP->getValueAPF().bitcastToAPInt();
7093 return true;
7094 }
7095 return false;
7096 };
7097
7098 // Handle UNDEFs.
7099 if (Op.isUndef()) {
7100 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7101 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7102 return CastBitData(UndefSrcElts, SrcEltBits);
7103 }
7104
7105 // Extract scalar constant bits.
7106 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7107 APInt UndefSrcElts = APInt::getZero(1);
7108 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7109 return CastBitData(UndefSrcElts, SrcEltBits);
7110 }
7111 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7112 APInt UndefSrcElts = APInt::getZero(1);
7113 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7114 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7115 return CastBitData(UndefSrcElts, SrcEltBits);
7116 }
7117
7118 // Extract constant bits from build vector.
7119 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7120 BitVector Undefs;
7121 SmallVector<APInt> SrcEltBits;
7122 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7123 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7124 APInt UndefSrcElts = APInt::getNullValue(SrcEltBits.size());
7125 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7126 if (Undefs[I])
7127 UndefSrcElts.setBit(I);
7128 return CastBitData(UndefSrcElts, SrcEltBits);
7129 }
7130 }
7131
7132 // Extract constant bits from constant pool vector.
7133 if (auto *Cst = getTargetConstantFromNode(Op)) {
7134 Type *CstTy = Cst->getType();
7135 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7136 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7137 return false;
7138
7139 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7140 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7141
7142 APInt UndefSrcElts(NumSrcElts, 0);
7143 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7144 for (unsigned i = 0; i != NumSrcElts; ++i)
7145 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7146 UndefSrcElts, i))
7147 return false;
7148
7149 return CastBitData(UndefSrcElts, SrcEltBits);
7150 }
7151
7152 // Extract constant bits from a broadcasted constant pool scalar.
7153 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7154 EltSizeInBits <= VT.getScalarSizeInBits()) {
7155 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7156 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7157 return false;
7158
7159 SDValue Ptr = MemIntr->getBasePtr();
7160 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7161 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7162 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7163
7164 APInt UndefSrcElts(NumSrcElts, 0);
7165 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7166 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7167 if (UndefSrcElts[0])
7168 UndefSrcElts.setBits(0, NumSrcElts);
7169 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7170 return CastBitData(UndefSrcElts, SrcEltBits);
7171 }
7172 }
7173 }
7174
7175 // Extract constant bits from a subvector broadcast.
7176 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7177 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7178 SDValue Ptr = MemIntr->getBasePtr();
7179 // The source constant may be larger than the subvector broadcast,
7180 // ensure we extract the correct subvector constants.
7181 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7182 Type *CstTy = Cst->getType();
7183 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7184 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7185 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7186 (SizeInBits % SubVecSizeInBits) != 0)
7187 return false;
7188 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7189 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7190 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7191 APInt UndefSubElts(NumSubElts, 0);
7192 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7193 APInt(CstEltSizeInBits, 0));
7194 for (unsigned i = 0; i != NumSubElts; ++i) {
7195 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7196 UndefSubElts, i))
7197 return false;
7198 for (unsigned j = 1; j != NumSubVecs; ++j)
7199 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7200 }
7201 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7202 UndefSubElts);
7203 return CastBitData(UndefSubElts, SubEltBits);
7204 }
7205 }
7206
7207 // Extract a rematerialized scalar constant insertion.
7208 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7209 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7210 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7211 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7212 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7213
7214 APInt UndefSrcElts(NumSrcElts, 0);
7215 SmallVector<APInt, 64> SrcEltBits;
7216 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7217 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7218 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7219 return CastBitData(UndefSrcElts, SrcEltBits);
7220 }
7221
7222 // Insert constant bits from a base and sub vector sources.
7223 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7224 // If bitcasts to larger elements we might lose track of undefs - don't
7225 // allow any to be safe.
7226 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7227 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7228
7229 APInt UndefSrcElts, UndefSubElts;
7230 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7231 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7232 UndefSubElts, EltSubBits,
7233 AllowWholeUndefs && AllowUndefs,
7234 AllowPartialUndefs && AllowUndefs) &&
7235 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7236 UndefSrcElts, EltSrcBits,
7237 AllowWholeUndefs && AllowUndefs,
7238 AllowPartialUndefs && AllowUndefs)) {
7239 unsigned BaseIdx = Op.getConstantOperandVal(2);
7240 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7241 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7242 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7243 return CastBitData(UndefSrcElts, EltSrcBits);
7244 }
7245 }
7246
7247 // Extract constant bits from a subvector's source.
7248 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7249 // TODO - support extract_subvector through bitcasts.
7250 if (EltSizeInBits != VT.getScalarSizeInBits())
7251 return false;
7252
7253 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7254 UndefElts, EltBits, AllowWholeUndefs,
7255 AllowPartialUndefs)) {
7256 EVT SrcVT = Op.getOperand(0).getValueType();
7257 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7258 unsigned NumSubElts = VT.getVectorNumElements();
7259 unsigned BaseIdx = Op.getConstantOperandVal(1);
7260 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7261 if ((BaseIdx + NumSubElts) != NumSrcElts)
7262 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7263 if (BaseIdx != 0)
7264 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7265 return true;
7266 }
7267 }
7268
7269 // Extract constant bits from shuffle node sources.
7270 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7271 // TODO - support shuffle through bitcasts.
7272 if (EltSizeInBits != VT.getScalarSizeInBits())
7273 return false;
7274
7275 ArrayRef<int> Mask = SVN->getMask();
7276 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7277 llvm::any_of(Mask, [](int M) { return M < 0; }))
7278 return false;
7279
7280 APInt UndefElts0, UndefElts1;
7281 SmallVector<APInt, 32> EltBits0, EltBits1;
7282 if (isAnyInRange(Mask, 0, NumElts) &&
7283 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7284 UndefElts0, EltBits0, AllowWholeUndefs,
7285 AllowPartialUndefs))
7286 return false;
7287 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7288 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7289 UndefElts1, EltBits1, AllowWholeUndefs,
7290 AllowPartialUndefs))
7291 return false;
7292
7293 UndefElts = APInt::getZero(NumElts);
7294 for (int i = 0; i != (int)NumElts; ++i) {
7295 int M = Mask[i];
7296 if (M < 0) {
7297 UndefElts.setBit(i);
7298 EltBits.push_back(APInt::getZero(EltSizeInBits));
7299 } else if (M < (int)NumElts) {
7300 if (UndefElts0[M])
7301 UndefElts.setBit(i);
7302 EltBits.push_back(EltBits0[M]);
7303 } else {
7304 if (UndefElts1[M - NumElts])
7305 UndefElts.setBit(i);
7306 EltBits.push_back(EltBits1[M - NumElts]);
7307 }
7308 }
7309 return true;
7310 }
7311
7312 return false;
7313}
7314
7315namespace llvm {
7316namespace X86 {
7317bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7318 APInt UndefElts;
7319 SmallVector<APInt, 16> EltBits;
7320 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7321 UndefElts, EltBits, true,
7322 AllowPartialUndefs)) {
7323 int SplatIndex = -1;
7324 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7325 if (UndefElts[i])
7326 continue;
7327 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7328 SplatIndex = -1;
7329 break;
7330 }
7331 SplatIndex = i;
7332 }
7333 if (0 <= SplatIndex) {
7334 SplatVal = EltBits[SplatIndex];
7335 return true;
7336 }
7337 }
7338
7339 return false;
7340}
7341} // namespace X86
7342} // namespace llvm
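
The splat scan above skips undef lanes and requires every defined lane to carry identical bits. Below is a standalone approximation using uint64_t and a bool vector in place of APInt; the function and parameter names are illustrative, not the LLVM API.

#include <cstdint>
#include <cstdio>
#include <vector>

// Returns true and sets SplatVal if all non-undef lanes hold the same bits.
static bool scanForSplat(const std::vector<uint64_t> &EltBits,
                         const std::vector<bool> &UndefElts,
                         uint64_t &SplatVal) {
  int SplatIndex = -1;
  for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
    if (UndefElts[i])
      continue;                       // undef lanes don't constrain the splat
    if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex])
      return false;                   // two differing defined lanes: no splat
    SplatIndex = i;
  }
  if (SplatIndex < 0)
    return false;                     // all lanes undef: no defined splat value
  SplatVal = EltBits[SplatIndex];
  return true;
}

int main() {
  uint64_t S = 0;
  // {undef, 42, undef, 42} splats to 42.
  bool OK = scanForSplat({0, 42, 0, 42}, {true, false, true, false}, S);
  std::printf("%d %llu\n", OK, (unsigned long long)S); // 1 42
}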
7343
7344static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7345 unsigned MaskEltSizeInBits,
7346 SmallVectorImpl<uint64_t> &RawMask,
7347 APInt &UndefElts) {
7348 // Extract the raw target constant bits.
7349 SmallVector<APInt, 64> EltBits;
7350 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7351 EltBits, /* AllowWholeUndefs */ true,
7352 /* AllowPartialUndefs */ false))
7353 return false;
7354
7355 // Insert the extracted elements into the mask.
7356 for (const APInt &Elt : EltBits)
7357 RawMask.push_back(Elt.getZExtValue());
7358
7359 return true;
7360}
7361
7362/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7363/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7364/// Note: This ignores saturation, so inputs must be checked first.
7365static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7366 bool Unary, unsigned NumStages = 1) {
7367 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7368 unsigned NumElts = VT.getVectorNumElements();
7369 unsigned NumLanes = VT.getSizeInBits() / 128;
7370 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7371 unsigned Offset = Unary ? 0 : NumElts;
7372 unsigned Repetitions = 1u << (NumStages - 1);
7373 unsigned Increment = 1u << NumStages;
7374 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7375
7376 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7377 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7378 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7379 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7380 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7381 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7382 }
7383 }
7384}
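
A worked example may help: for a 128-bit v16i8 result with two distinct inputs and a single stage, the mask comes out as 0,2,...,14 followed by 16,18,...,30, i.e. the even bytes of each widened operand. The sketch below fixes the vector shape by assumption (one 128-bit lane, 16 elements) and uses a hypothetical helper name.

#include <cstdio>
#include <vector>

// Standalone sketch of the pack mask generation for a v16i8 result.
static void buildPackShuffleMask(std::vector<int> &Mask, bool Unary,
                                 unsigned NumStages = 1) {
  const unsigned NumElts = 16, NumLanes = 1, NumEltsPerLane = 16;
  unsigned Offset = Unary ? 0 : NumElts;
  unsigned Repetitions = 1u << (NumStages - 1);
  unsigned Increment = 1u << NumStages;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane);
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);
    }
}

int main() {
  std::vector<int> Mask;
  buildPackShuffleMask(Mask, /*Unary=*/false);
  for (int M : Mask) std::printf("%d ", M); // 0 2 ... 14 16 18 ... 30
  std::printf("\n");
}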
7385
7386// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7387static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7388 APInt &DemandedLHS, APInt &DemandedRHS) {
7389 int NumLanes = VT.getSizeInBits() / 128;
7390 int NumElts = DemandedElts.getBitWidth();
7391 int NumInnerElts = NumElts / 2;
7392 int NumEltsPerLane = NumElts / NumLanes;
7393 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7394
7395 DemandedLHS = APInt::getZero(NumInnerElts);
7396 DemandedRHS = APInt::getZero(NumInnerElts);
7397
7398 // Map DemandedElts to the packed operands.
7399 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7400 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7401 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7402 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7403 if (DemandedElts[OuterIdx])
7404 DemandedLHS.setBit(InnerIdx);
7405 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7406 DemandedRHS.setBit(InnerIdx);
7407 }
7408 }
7409}
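
The demanded-element split above can be modelled with plain bitmasks. The sketch below assumes a single 128-bit lane with a v16i8 result, so each v8i16 input supplies half of the output bytes; the names are illustrative and plain unsigned masks stand in for APInt.

#include <cstdio>

// Map demanded output bytes of a PACK onto its two word-sized inputs.
static void packDemandedElts(unsigned DemandedElts, unsigned &DemandedLHS,
                             unsigned &DemandedRHS) {
  const int NumLanes = 1, NumElts = 16;
  const int NumInnerElts = NumElts / 2;
  const int NumEltsPerLane = NumElts / NumLanes;
  const int NumInnerEltsPerLane = NumInnerElts / NumLanes;
  DemandedLHS = DemandedRHS = 0;
  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = Lane * NumEltsPerLane + Elt;
      int InnerIdx = Lane * NumInnerEltsPerLane + Elt;
      if (DemandedElts & (1u << OuterIdx))
        DemandedLHS |= 1u << InnerIdx;
      if (DemandedElts & (1u << (OuterIdx + NumInnerEltsPerLane)))
        DemandedRHS |= 1u << InnerIdx;
    }
}

int main() {
  unsigned LHS, RHS;
  packDemandedElts(0x0101, LHS, RHS); // demand output bytes 0 and 8
  std::printf("%#x %#x\n", LHS, RHS); // 0x1 0x1: word 0 of each input
}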
7410
7411// Split the demanded elts of a HADD/HSUB node between its operands.
7412static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7413 APInt &DemandedLHS, APInt &DemandedRHS) {
7414 int NumLanes = VT.getSizeInBits() / 128;
7415 int NumElts = DemandedElts.getBitWidth();
7416 int NumEltsPerLane = NumElts / NumLanes;
7417 int HalfEltsPerLane = NumEltsPerLane / 2;
7418
7419 DemandedLHS = APInt::getZero(NumElts);
7420 DemandedRHS = APInt::getZero(NumElts);
7421
7422 // Map DemandedElts to the horizontal operands.
7423 for (int Idx = 0; Idx != NumElts; ++Idx) {
7424 if (!DemandedElts[Idx])
7425 continue;
7426 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7427 int LocalIdx = Idx % NumEltsPerLane;
7428 if (LocalIdx < HalfEltsPerLane) {
7429 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7430 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7431 } else {
7432 LocalIdx -= HalfEltsPerLane;
7433 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7434 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7435 }
7436 }
7437}
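
Each horizontal-op output element consumes a pair of adjacent input elements from either the LHS or RHS half of its 128-bit lane. The bitmask sketch below assumes a 256-bit v8i32 HADD (two lanes of four elements); the names are illustrative only.

#include <cstdio>

// Map demanded HADD/HSUB output elements onto the two horizontal operands.
static void horizDemandedElts(unsigned DemandedElts, unsigned &DemandedLHS,
                              unsigned &DemandedRHS) {
  const int NumLanes = 2, NumElts = 8;
  const int NumEltsPerLane = NumElts / NumLanes;
  const int HalfEltsPerLane = NumEltsPerLane / 2;
  DemandedLHS = DemandedRHS = 0;
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!(DemandedElts & (1u << Idx)))
      continue;
    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    unsigned &Dst = LocalIdx < HalfEltsPerLane ? DemandedLHS : DemandedRHS;
    if (LocalIdx >= HalfEltsPerLane)
      LocalIdx -= HalfEltsPerLane;
    Dst |= 3u << (LaneIdx + 2 * LocalIdx); // each output needs an input pair
  }
}

int main() {
  unsigned LHS, RHS;
  horizDemandedElts(0x24, LHS, RHS);  // demand output elements 2 and 5
  std::printf("%#x %#x\n", LHS, RHS); // 0xc0 0x3: LHS {6,7}, RHS {0,1}
}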
7438
7439/// Calculates the shuffle mask corresponding to the target-specific opcode.
7440/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7441/// operands in \p Ops, and returns true.
7442/// Sets \p IsUnary to true if only one source is used. Note that this will set
7443/// IsUnary for shuffles which use a single input multiple times, and in those
7444/// cases it will adjust the mask to only have indices within that single input.
7445/// It is an error to call this with non-empty Mask/Ops vectors.
7446static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7447 SmallVectorImpl<SDValue> &Ops,
7448 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7449 unsigned NumElems = VT.getVectorNumElements();
7450 unsigned MaskEltSize = VT.getScalarSizeInBits();
7451 SmallVector<uint64_t, 32> RawMask;
7452 APInt RawUndefs;
7453 uint64_t ImmN;
7454
7455 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7456 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7457
7458 IsUnary = false;
7459 bool IsFakeUnary = false;
7460 switch (N->getOpcode()) {
7461 case X86ISD::BLENDI:
7462 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7463 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7464 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7465 DecodeBLENDMask(NumElems, ImmN, Mask);
7466 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7467 break;
7468 case X86ISD::SHUFP:
7469 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7470 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7471 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7472 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7473 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7474 break;
7475 case X86ISD::INSERTPS:
7476 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7477 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7478 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7479 DecodeINSERTPSMask(ImmN, Mask);
7480 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7481 break;
7482 case X86ISD::EXTRQI:
7483 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7484 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7485 isa<ConstantSDNode>(N->getOperand(2))) {
7486 int BitLen = N->getConstantOperandVal(1);
7487 int BitIdx = N->getConstantOperandVal(2);
7488 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7489 IsUnary = true;
7490 }
7491 break;
7492 case X86ISD::INSERTQI:
7493 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7494 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7495 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7496 isa<ConstantSDNode>(N->getOperand(3))) {
7497 int BitLen = N->getConstantOperandVal(2);
7498 int BitIdx = N->getConstantOperandVal(3);
7499 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7500 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7501 }
7502 break;
7503 case X86ISD::UNPCKH:
7504 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7505 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7506 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7507 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7508 break;
7509 case X86ISD::UNPCKL:
7510 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7511 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7512 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7513 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7514 break;
7515 case X86ISD::MOVHLPS:
7516 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7517 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7518 DecodeMOVHLPSMask(NumElems, Mask);
7519 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7520 break;
7521 case X86ISD::MOVLHPS:
7522 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7523 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7524 DecodeMOVLHPSMask(NumElems, Mask);
7525 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7526 break;
7527 case X86ISD::VALIGN:
7528 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7529 "Only 32-bit and 64-bit elements are supported!");
7530 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7531 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7532 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7533 DecodeVALIGNMask(NumElems, ImmN, Mask);
7534 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7535 Ops.push_back(N->getOperand(1));
7536 Ops.push_back(N->getOperand(0));
7537 break;
7538 case X86ISD::PALIGNR:
7539 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7540 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7541 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7542 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7543 DecodePALIGNRMask(NumElems, ImmN, Mask);
7544 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7545 Ops.push_back(N->getOperand(1));
7546 Ops.push_back(N->getOperand(0));
7547 break;
7548 case X86ISD::VSHLDQ:
7549 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7550 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7551 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7552 DecodePSLLDQMask(NumElems, ImmN, Mask);
7553 IsUnary = true;
7554 break;
7555 case X86ISD::VSRLDQ:
7556 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7557 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7558 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7559 DecodePSRLDQMask(NumElems, ImmN, Mask);
7560 IsUnary = true;
7561 break;
7562 case X86ISD::PSHUFD:
7563 case X86ISD::VPERMILPI:
7564 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7565 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7566 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7567 IsUnary = true;
7568 break;
7569 case X86ISD::PSHUFHW:
7570 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7571 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7572 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7573 IsUnary = true;
7574 break;
7575 case X86ISD::PSHUFLW:
7576 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7577 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7578 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7579 IsUnary = true;
7580 break;
7581 case X86ISD::VZEXT_MOVL:
7582 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7583 DecodeZeroMoveLowMask(NumElems, Mask);
7584 IsUnary = true;
7585 break;
7586 case X86ISD::VBROADCAST:
7587 // We only decode broadcasts of same-sized vectors, peeking through to
7588 // extracted subvectors is likely to cause hasOneUse issues with
7589 // SimplifyDemandedBits etc.
7590 if (N->getOperand(0).getValueType() == VT) {
7591 DecodeVectorBroadcast(NumElems, Mask);
7592 IsUnary = true;
7593 break;
7594 }
7595 return false;
7596 case X86ISD::VPERMILPV: {
7597 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7598 IsUnary = true;
7599 SDValue MaskNode = N->getOperand(1);
7600 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7601 RawUndefs)) {
7602 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7603 break;
7604 }
7605 return false;
7606 }
7607 case X86ISD::PSHUFB: {
7608 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7609 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7610 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7611 IsUnary = true;
7612 SDValue MaskNode = N->getOperand(1);
7613 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7614 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7615 break;
7616 }
7617 return false;
7618 }
7619 case X86ISD::VPERMI:
7620 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7621 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7622 DecodeVPERMMask(NumElems, ImmN, Mask);
7623 IsUnary = true;
7624 break;
7625 case X86ISD::MOVSS:
7626 case X86ISD::MOVSD:
7627 case X86ISD::MOVSH:
7628 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7629 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7630 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7631 break;
7632 case X86ISD::VPERM2X128:
7633 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7634 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7635 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7636 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7637 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7638 break;
7639 case X86ISD::SHUF128:
7640 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7641 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7642 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7643 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7644 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7645 break;
7646 case X86ISD::MOVSLDUP:
7647 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7648 DecodeMOVSLDUPMask(NumElems, Mask);
7649 IsUnary = true;
7650 break;
7651 case X86ISD::MOVSHDUP:
7652 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7653 DecodeMOVSHDUPMask(NumElems, Mask);
7654 IsUnary = true;
7655 break;
7656 case X86ISD::MOVDDUP:
7657 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7658 DecodeMOVDDUPMask(NumElems, Mask);
7659 IsUnary = true;
7660 break;
7661 case X86ISD::VPERMIL2: {
7662 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7663 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7664 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7665 SDValue MaskNode = N->getOperand(2);
7666 SDValue CtrlNode = N->getOperand(3);
7667 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7668 unsigned CtrlImm = CtrlOp->getZExtValue();
7669 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7670 RawUndefs)) {
7671 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7672 Mask);
7673 break;
7674 }
7675 }
7676 return false;
7677 }
7678 case X86ISD::VPPERM: {
7679 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7680 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7681 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7682 SDValue MaskNode = N->getOperand(2);
7683 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7684 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7685 break;
7686 }
7687 return false;
7688 }
7689 case X86ISD::VPERMV: {
7690 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7691 IsUnary = true;
7692 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7693 Ops.push_back(N->getOperand(1));
7694 SDValue MaskNode = N->getOperand(0);
7695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7696 RawUndefs)) {
7697 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7698 break;
7699 }
7700 return false;
7701 }
7702 case X86ISD::VPERMV3: {
7703 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7704 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7705 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7706 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7707 Ops.push_back(N->getOperand(0));
7708 Ops.push_back(N->getOperand(2));
7709 SDValue MaskNode = N->getOperand(1);
7710 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7711 RawUndefs)) {
7712 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7713 break;
7714 }
7715 return false;
7716 }
7717 default: llvm_unreachable("unknown target shuffle node");
7718 }
7719
7720 // Empty mask indicates the decode failed.
7721 if (Mask.empty())
7722 return false;
7723
7724 // Check if we're getting a shuffle mask with zero'd elements.
7725 if (!AllowSentinelZero && isAnyZero(Mask))
7726 return false;
7727
7728 // If we have a fake unary shuffle, the shuffle mask is spread across two
7729 // inputs that are actually the same node. Re-map the mask to always point
7730 // into the first input.
7731 if (IsFakeUnary)
7732 for (int &M : Mask)
7733 if (M >= (int)Mask.size())
7734 M -= Mask.size();
7735
7736 // If we didn't already add operands in the opcode-specific code, default to
7737 // adding 1 or 2 operands starting at 0.
7738 if (Ops.empty()) {
7739 Ops.push_back(N->getOperand(0));
7740 if (!IsUnary || IsFakeUnary)
7741 Ops.push_back(N->getOperand(1));
7742 }
7743
7744 return true;
7745}
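
The fake-unary remapping near the end of the function is worth isolating: when both shuffle operands turn out to be the same node, any mask index that points into the "second" input is folded back into the first. A minimal sketch, assuming an UNPCKL-of-itself mask and an illustrative helper name:

#include <cstdio>
#include <vector>

// Fold indices into the second operand back into the first.
static void remapFakeUnary(std::vector<int> &Mask) {
  for (int &M : Mask)
    if (M >= (int)Mask.size())
      M -= (int)Mask.size();
}

int main() {
  std::vector<int> Mask = {0, 4, 1, 5}; // e.g. UNPCKL of a vector with itself
  remapFakeUnary(Mask);
  for (int M : Mask) std::printf("%d ", M); // 0 0 1 1
  std::printf("\n");
}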
7746
7747 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
7748static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7749 SmallVectorImpl<SDValue> &Ops,
7750 SmallVectorImpl<int> &Mask) {
7751 bool IsUnary;
7752 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7753}
7754
7755/// Compute whether each element of a shuffle is zeroable.
7756///
7757/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7758/// Either it is an undef element in the shuffle mask, the element of the input
7759/// referenced is undef, or the element of the input referenced is known to be
7760/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7761/// as many lanes with this technique as possible to simplify the remaining
7762/// shuffle.
7763static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7764 SDValue V1, SDValue V2,
7765 APInt &KnownUndef, APInt &KnownZero) {
7766 int Size = Mask.size();
7767 KnownUndef = KnownZero = APInt::getZero(Size);
7768
7769 V1 = peekThroughBitcasts(V1);
7770 V2 = peekThroughBitcasts(V2);
7771
7772 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7773 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7774
7775 int VectorSizeInBits = V1.getValueSizeInBits();
7776 int ScalarSizeInBits = VectorSizeInBits / Size;
7777 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7778
7779 for (int i = 0; i < Size; ++i) {
7780 int M = Mask[i];
7781 // Handle the easy cases.
7782 if (M < 0) {
7783 KnownUndef.setBit(i);
7784 continue;
7785 }
7786 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7787 KnownZero.setBit(i);
7788 continue;
7789 }
7790
7791 // Determine shuffle input and normalize the mask.
7792 SDValue V = M < Size ? V1 : V2;
7793 M %= Size;
7794
7795 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7796 if (V.getOpcode() != ISD::BUILD_VECTOR)
7797 continue;
7798
7799 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
7800 // the (larger) source element must be UNDEF/ZERO.
7801 if ((Size % V.getNumOperands()) == 0) {
7802 int Scale = Size / V->getNumOperands();
7803 SDValue Op = V.getOperand(M / Scale);
7804 if (Op.isUndef())
7805 KnownUndef.setBit(i);
7806 if (X86::isZeroNode(Op))
7807 KnownZero.setBit(i);
7808 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7809 APInt Val = Cst->getAPIntValue();
7810 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7811 if (Val == 0)
7812 KnownZero.setBit(i);
7813 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7814 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7815 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7816 if (Val == 0)
7817 KnownZero.setBit(i);
7818 }
7819 continue;
7820 }
7821
7822 // If the BUILD_VECTOR has more elements then all the (smaller) source
7823 // elements must be UNDEF or ZERO.
7824 if ((V.getNumOperands() % Size) == 0) {
7825 int Scale = V->getNumOperands() / Size;
7826 bool AllUndef = true;
7827 bool AllZero = true;
7828 for (int j = 0; j < Scale; ++j) {
7829 SDValue Op = V.getOperand((M * Scale) + j);
7830 AllUndef &= Op.isUndef();
7831 AllZero &= X86::isZeroNode(Op);
7832 }
7833 if (AllUndef)
7834 KnownUndef.setBit(i);
7835 if (AllZero)
7836 KnownZero.setBit(i);
7837 continue;
7838 }
7839 }
7840}
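
Ignoring the BUILD_VECTOR scanning, the core of the zeroable computation reduces to the two easy cases at the top of the loop. The sketch below models only those (undef mask entries and lanes taken from an all-zeros operand), with plain bitmasks standing in for APInt and illustrative names throughout.

#include <cstdio>
#include <vector>

// Mark undef mask entries and lanes read from an all-zeros operand.
static void zeroableElts(const std::vector<int> &Mask, bool V1IsZero,
                         bool V2IsZero, unsigned &KnownUndef,
                         unsigned &KnownZero) {
  int Size = (int)Mask.size();
  KnownUndef = KnownZero = 0;
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      KnownUndef |= 1u << i;
    else if ((M < Size && V1IsZero) || (M >= Size && V2IsZero))
      KnownZero |= 1u << i;
  }
}

int main() {
  unsigned U, Z;
  // V2 is all zeros; element 1 is undef, elements 2..3 read from V2.
  zeroableElts({0, -1, 4, 5}, /*V1IsZero=*/false, /*V2IsZero=*/true, U, Z);
  std::printf("undef=%#x zero=%#x\n", U, Z); // undef=0x2 zero=0xc
}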
7841
7842/// Decode a target shuffle mask and inputs and see if any values are
7843/// known to be undef or zero from their inputs.
7844/// Returns true if the target shuffle mask was decoded.
7845/// FIXME: Merge this with computeZeroableShuffleElements?
7846static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7847 SmallVectorImpl<SDValue> &Ops,
7848 APInt &KnownUndef, APInt &KnownZero) {
7849 bool IsUnary;
7850 if (!isTargetShuffle(N.getOpcode()))
7851 return false;
7852
7853 MVT VT = N.getSimpleValueType();
7854 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7855 return false;
7856
7857 int Size = Mask.size();
7858 SDValue V1 = Ops[0];
7859 SDValue V2 = IsUnary ? V1 : Ops[1];
7860 KnownUndef = KnownZero = APInt::getZero(Size);
7861
7862 V1 = peekThroughBitcasts(V1);
7863 V2 = peekThroughBitcasts(V2);
7864
7865 assert((VT.getSizeInBits() % Size) == 0 &&
7866 "Illegal split of shuffle value type");
7867 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7868
7869 // Extract known constant input data.
7870 APInt UndefSrcElts[2];
7871 SmallVector<APInt, 32> SrcEltBits[2];
7872 bool IsSrcConstant[2] = {
7873 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7874 SrcEltBits[0], true, false),
7875 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7876 SrcEltBits[1], true, false)};
7877
7878 for (int i = 0; i < Size; ++i) {
7879 int M = Mask[i];
7880
7881 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7882 if (M < 0) {
7883 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7884 if (SM_SentinelUndef == M)
7885 KnownUndef.setBit(i);
7886 if (SM_SentinelZero == M)
7887 KnownZero.setBit(i);
7888 continue;
7889 }
7890
7891 // Determine shuffle input and normalize the mask.
7892 unsigned SrcIdx = M / Size;
7893 SDValue V = M < Size ? V1 : V2;
7894 M %= Size;
7895
7896 // We are referencing an UNDEF input.
7897 if (V.isUndef()) {
7898 KnownUndef.setBit(i);
7899 continue;
7900 }
7901
7902 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7903 // TODO: We currently only set UNDEF for integer types - floats use the same
7904 // registers as vectors and many of the scalar folded loads rely on the
7905 // SCALAR_TO_VECTOR pattern.
7906 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7907 (Size % V.getValueType().getVectorNumElements()) == 0) {
7908 int Scale = Size / V.getValueType().getVectorNumElements();
7909 int Idx = M / Scale;
7910 if (Idx != 0 && !VT.isFloatingPoint())
7911 KnownUndef.setBit(i);
7912 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7913 KnownZero.setBit(i);
7914 continue;
7915 }
7916
7917 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7918 // base vectors.
7919 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7920 SDValue Vec = V.getOperand(0);
7921 int NumVecElts = Vec.getValueType().getVectorNumElements();
7922 if (Vec.isUndef() && Size == NumVecElts) {
7923 int Idx = V.getConstantOperandVal(2);
7924 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7925 if (M < Idx || (Idx + NumSubElts) <= M)
7926 KnownUndef.setBit(i);
7927 }
7928 continue;
7929 }
7930
7931 // Attempt to extract from the source's constant bits.
7932 if (IsSrcConstant[SrcIdx]) {
7933 if (UndefSrcElts[SrcIdx][M])
7934 KnownUndef.setBit(i);
7935 else if (SrcEltBits[SrcIdx][M] == 0)
7936 KnownZero.setBit(i);
7937 }
7938 }
7939
7940 assert(VT.getVectorNumElements() == (unsigned)Size &&
7941 "Different mask size from vector size!");
7942 return true;
7943}
7944
7945// Replace target shuffle mask elements with known undef/zero sentinels.
7946static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7947 const APInt &KnownUndef,
7948 const APInt &KnownZero,
7949 bool ResolveKnownZeros= true) {
7950 unsigned NumElts = Mask.size();
 7951 assert(KnownUndef.getBitWidth() == NumElts &&
 7952 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7953
7954 for (unsigned i = 0; i != NumElts; ++i) {
7955 if (KnownUndef[i])
7956 Mask[i] = SM_SentinelUndef;
7957 else if (ResolveKnownZeros && KnownZero[i])
7958 Mask[i] = SM_SentinelZero;
7959 }
7960}
7961
7962// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7963static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7964 APInt &KnownUndef,
7965 APInt &KnownZero) {
7966 unsigned NumElts = Mask.size();
7967 KnownUndef = KnownZero = APInt::getZero(NumElts);
7968
7969 for (unsigned i = 0; i != NumElts; ++i) {
7970 int M = Mask[i];
7971 if (SM_SentinelUndef == M)
7972 KnownUndef.setBit(i);
7973 if (SM_SentinelZero == M)
7974 KnownZero.setBit(i);
7975 }
7976}
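
The two helpers above are inverses: one folds KnownUndef/KnownZero bits into the mask as SM_SentinelUndef/SM_SentinelZero, the other extracts those sentinels back into bit masks. As a rough standalone sketch of that round trip (not part of the annotated file; it uses std::vector and std::bitset in place of LLVM's SmallVectorImpl and APInt, omits the ResolveKnownZeros flag, and all names are invented for the example):

#include <bitset>
#include <cassert>
#include <vector>

static constexpr int SentinelUndef = -1; // stand-in for SM_SentinelUndef
static constexpr int SentinelZero = -2;  // stand-in for SM_SentinelZero

// Fold known undef/zero bits into the mask (cf. resolveTargetShuffleFromZeroables).
void foldZeroables(std::vector<int> &Mask, const std::bitset<8> &Undef,
                   const std::bitset<8> &Zero) {
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Undef[i])
      Mask[i] = SentinelUndef;
    else if (Zero[i])
      Mask[i] = SentinelZero;
  }
}

// Extract sentinels back into bit masks (cf. resolveZeroablesFromTargetShuffle).
void extractZeroables(const std::vector<int> &Mask, std::bitset<8> &Undef,
                      std::bitset<8> &Zero) {
  Undef.reset();
  Zero.reset();
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SentinelUndef)
      Undef.set(i);
    if (Mask[i] == SentinelZero)
      Zero.set(i);
  }
}

int main() {
  std::vector<int> Mask = {0, 1, 2, 3, 4, 5, 6, 7};
  std::bitset<8> Undef("00000010"), Zero("10000000"); // elt 1 undef, elt 7 zero
  foldZeroables(Mask, Undef, Zero);
  std::bitset<8> Undef2, Zero2;
  extractZeroables(Mask, Undef2, Zero2);
  assert(Undef2 == Undef && Zero2 == Zero); // round trip preserves the bits
  return 0;
}
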
7977
7978// Forward declaration (for getFauxShuffleMask recursive check).
7979static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7980 SmallVectorImpl<int> &Mask,
7981 const SelectionDAG &DAG, unsigned Depth,
7982 bool ResolveKnownElts);
7983
7984// Attempt to decode ops that could be represented as a shuffle mask.
 7985 // The decoded shuffle mask may contain a different number of elements than
 7986 // the destination value type.
7987// TODO: Merge into getTargetShuffleInputs()
7988static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7989 SmallVectorImpl<int> &Mask,
7990 SmallVectorImpl<SDValue> &Ops,
7991 const SelectionDAG &DAG, unsigned Depth,
7992 bool ResolveKnownElts) {
7993 Mask.clear();
7994 Ops.clear();
7995
7996 MVT VT = N.getSimpleValueType();
7997 unsigned NumElts = VT.getVectorNumElements();
7998 unsigned NumSizeInBits = VT.getSizeInBits();
7999 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8000 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8001 return false;
 8002 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8003 unsigned NumSizeInBytes = NumSizeInBits / 8;
8004 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8005
8006 unsigned Opcode = N.getOpcode();
8007 switch (Opcode) {
8008 case ISD::VECTOR_SHUFFLE: {
8009 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8010 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8011 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8012 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8013 Ops.push_back(N.getOperand(0));
8014 Ops.push_back(N.getOperand(1));
8015 return true;
8016 }
8017 return false;
8018 }
8019 case ISD::AND:
8020 case X86ISD::ANDNP: {
8021 // Attempt to decode as a per-byte mask.
8022 APInt UndefElts;
8023 SmallVector<APInt, 32> EltBits;
8024 SDValue N0 = N.getOperand(0);
8025 SDValue N1 = N.getOperand(1);
8026 bool IsAndN = (X86ISD::ANDNP == Opcode);
8027 uint64_t ZeroMask = IsAndN ? 255 : 0;
8028 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8029 return false;
8030 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8031 if (UndefElts[i]) {
8032 Mask.push_back(SM_SentinelUndef);
8033 continue;
8034 }
8035 const APInt &ByteBits = EltBits[i];
8036 if (ByteBits != 0 && ByteBits != 255)
8037 return false;
8038 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8039 }
8040 Ops.push_back(IsAndN ? N1 : N0);
8041 return true;
8042 }
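
The AND/ANDNP case treats a constant whose bytes are all 0x00 or 0xFF as a per-byte blend with zero: 0xFF bytes pass the other operand through, 0x00 bytes force zero, and ANDNP simply swaps which pattern means "zero". A minimal sketch of that decode loop under those assumptions (plain C++, invented names, not the LLVM API):

#include <cstdint>
#include <optional>
#include <vector>

static constexpr int SentinelZero = -2; // stand-in for SM_SentinelZero

// Decode an AND/ANDNP byte mask into shuffle indices: one pattern means "zero",
// the other means "keep byte i". Bail out if any byte is neither 0x00 nor 0xFF.
std::optional<std::vector<int>> decodeByteMask(const std::vector<uint8_t> &MaskBytes,
                                               bool IsAndN) {
  const uint8_t ZeroPattern = IsAndN ? 0xFF : 0x00; // ANDNP inverts its mask operand
  std::vector<int> Mask;
  for (int i = 0, e = (int)MaskBytes.size(); i != e; ++i) {
    uint8_t B = MaskBytes[i];
    if (B != 0x00 && B != 0xFF)
      return std::nullopt;
    Mask.push_back(B == ZeroPattern ? SentinelZero : i);
  }
  return Mask;
}

int main() {
  // AND x, <0xFF,0x00,0xFF,0x00> keeps bytes 0 and 2, zeroes bytes 1 and 3.
  auto M = decodeByteMask({0xFF, 0x00, 0xFF, 0x00}, /*IsAndN=*/false);
  return (M && (*M)[0] == 0 && (*M)[1] == SentinelZero) ? 0 : 1;
}
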
8043 case ISD::OR: {
8044 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8045 // is a valid shuffle index.
8046 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8047 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8048 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8049 return false;
8050 SmallVector<int, 64> SrcMask0, SrcMask1;
8051 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8052 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
8053 true) ||
8054 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
8055 true))
8056 return false;
8057
8058 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8059 SmallVector<int, 64> Mask0, Mask1;
8060 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8061 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8062 for (int i = 0; i != (int)MaskSize; ++i) {
8063 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8064 // loops converting between OR and BLEND shuffles due to
8065 // canWidenShuffleElements merging away undef elements, meaning we
8066 // fail to recognise the OR as the undef element isn't known zero.
8067 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8068 Mask.push_back(SM_SentinelZero);
8069 else if (Mask1[i] == SM_SentinelZero)
8070 Mask.push_back(i);
8071 else if (Mask0[i] == SM_SentinelZero)
8072 Mask.push_back(i + MaskSize);
8073 else
8074 return false;
8075 }
8076 Ops.push_back(N0);
8077 Ops.push_back(N1);
8078 return true;
8079 }
8080 case ISD::INSERT_SUBVECTOR: {
8081 SDValue Src = N.getOperand(0);
8082 SDValue Sub = N.getOperand(1);
8083 EVT SubVT = Sub.getValueType();
8084 unsigned NumSubElts = SubVT.getVectorNumElements();
8085 if (!N->isOnlyUserOf(Sub.getNode()))
8086 return false;
8087 uint64_t InsertIdx = N.getConstantOperandVal(2);
8088 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8089 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8090 Sub.getOperand(0).getValueType() == VT) {
8091 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8092 for (int i = 0; i != (int)NumElts; ++i)
8093 Mask.push_back(i);
8094 for (int i = 0; i != (int)NumSubElts; ++i)
8095 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8096 Ops.push_back(Src);
8097 Ops.push_back(Sub.getOperand(0));
8098 return true;
8099 }
8100 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8101 SmallVector<int, 64> SubMask;
8102 SmallVector<SDValue, 2> SubInputs;
8103 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
8104 SubMask, DAG, Depth + 1, ResolveKnownElts))
8105 return false;
8106
8107 // Subvector shuffle inputs must not be larger than the subvector.
8108 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8109 return SubVT.getFixedSizeInBits() <
8110 SubInput.getValueSizeInBits().getFixedSize();
8111 }))
8112 return false;
8113
8114 if (SubMask.size() != NumSubElts) {
 8115 assert(((SubMask.size() % NumSubElts) == 0 ||
 8116 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8117 if ((NumSubElts % SubMask.size()) == 0) {
8118 int Scale = NumSubElts / SubMask.size();
8119 SmallVector<int,64> ScaledSubMask;
8120 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8121 SubMask = ScaledSubMask;
8122 } else {
8123 int Scale = SubMask.size() / NumSubElts;
8124 NumSubElts = SubMask.size();
8125 NumElts *= Scale;
8126 InsertIdx *= Scale;
8127 }
8128 }
8129 Ops.push_back(Src);
8130 Ops.append(SubInputs.begin(), SubInputs.end());
8131 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8132 Mask.append(NumElts, SM_SentinelZero);
8133 else
8134 for (int i = 0; i != (int)NumElts; ++i)
8135 Mask.push_back(i);
8136 for (int i = 0; i != (int)NumSubElts; ++i) {
8137 int M = SubMask[i];
8138 if (0 <= M) {
8139 int InputIdx = M / NumSubElts;
8140 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8141 }
8142 Mask[i + InsertIdx] = M;
8143 }
8144 return true;
8145 }
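
For INSERT_SUBVECTOR the decoded mask starts as an identity over the base vector and then has the subvector's mask spliced in at InsertIdx, with sub-input indices re-based past the base vector's elements. A small illustrative sketch of just that index arithmetic (invented names, standard containers only):

#include <cstdio>
#include <vector>

// Model the INSERT_SUBVECTOR(SRC, SHUFFLE(SUB)) decode: identity mask over SRC,
// then splice in the subvector mask, re-based so the first sub-input's elements
// start at NumElts, the second at 2*NumElts, and so on.
std::vector<int> insertSubvectorMask(int NumElts, int InsertIdx,
                                     const std::vector<int> &SubMask) {
  int NumSubElts = (int)SubMask.size();
  std::vector<int> Mask;
  for (int i = 0; i != NumElts; ++i)
    Mask.push_back(i);                 // identity over SRC
  for (int i = 0; i != NumSubElts; ++i) {
    int M = SubMask[i];
    if (0 <= M) {
      int InputIdx = M / NumSubElts;   // which sub-input M refers to
      M = NumElts * (1 + InputIdx) + (M % NumSubElts);
    }
    Mask[i + InsertIdx] = M;           // sentinels (M < 0) pass through unchanged
  }
  return Mask;
}

int main() {
  // Insert a reversed 4-element subvector at index 4 of an 8-element vector:
  // prints 0 1 2 3 11 10 9 8
  for (int M : insertSubvectorMask(8, 4, {3, 2, 1, 0}))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
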
8146 case X86ISD::PINSRB:
8147 case X86ISD::PINSRW:
8148 case ISD::SCALAR_TO_VECTOR:
8149 case ISD::INSERT_VECTOR_ELT: {
 8150 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8151 // vector, for matching src/dst vector types.
8152 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8153
8154 unsigned DstIdx = 0;
8155 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8156 // Check we have an in-range constant insertion index.
8157 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8158 N.getConstantOperandAPInt(2).uge(NumElts))
8159 return false;
8160 DstIdx = N.getConstantOperandVal(2);
8161
8162 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8163 if (X86::isZeroNode(Scl)) {
8164 Ops.push_back(N.getOperand(0));
8165 for (unsigned i = 0; i != NumElts; ++i)
8166 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8167 return true;
8168 }
8169 }
8170
8171 // Peek through trunc/aext/zext.
8172 // TODO: aext shouldn't require SM_SentinelZero padding.
8173 // TODO: handle shift of scalars.
8174 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8175 while (Scl.getOpcode() == ISD::TRUNCATE ||
8176 Scl.getOpcode() == ISD::ANY_EXTEND ||
8177 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8178 Scl = Scl.getOperand(0);
8179 MinBitsPerElt =
8180 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8181 }
8182 if ((MinBitsPerElt % 8) != 0)
8183 return false;
8184
8185 // Attempt to find the source vector the scalar was extracted from.
8186 SDValue SrcExtract;
8187 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8188 Scl.getOpcode() == X86ISD::PEXTRW ||
8189 Scl.getOpcode() == X86ISD::PEXTRB) &&
8190 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8191 SrcExtract = Scl;
8192 }
8193 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8194 return false;
8195
8196 SDValue SrcVec = SrcExtract.getOperand(0);
8197 EVT SrcVT = SrcVec.getValueType();
8198 if (!SrcVT.getScalarType().isByteSized())
8199 return false;
8200 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8201 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8202 unsigned DstByte = DstIdx * NumBytesPerElt;
8203 MinBitsPerElt =
8204 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8205
8206 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8207 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8208 Ops.push_back(SrcVec);
8209 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8210 } else {
8211 Ops.push_back(SrcVec);
8212 Ops.push_back(N.getOperand(0));
8213 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8214 Mask.push_back(NumSizeInBytes + i);
8215 }
8216
8217 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8218 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8219 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8220 Mask[DstByte + i] = SrcByte + i;
8221 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8222 Mask[DstByte + i] = SM_SentinelZero;
8223 return true;
8224 }
8225 case X86ISD::PACKSS:
8226 case X86ISD::PACKUS: {
8227 SDValue N0 = N.getOperand(0);
8228 SDValue N1 = N.getOperand(1);
 8229 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
 8230 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
 8231 "Unexpected input value type");
8232
8233 APInt EltsLHS, EltsRHS;
8234 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8235
8236 // If we know input saturation won't happen (or we don't care for particular
8237 // lanes), we can treat this as a truncation shuffle.
8238 bool Offset0 = false, Offset1 = false;
8239 if (Opcode == X86ISD::PACKSS) {
8240 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8241 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8242 (!(N1.isUndef() || EltsRHS.isZero()) &&
8243 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8244 return false;
8245 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8246 // PACKSS then it was likely being used for sign-extension for a
8247 // truncation, so just peek through and adjust the mask accordingly.
8248 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8249 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8250 Offset0 = true;
8251 N0 = N0.getOperand(0);
8252 }
8253 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8254 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8255 Offset1 = true;
8256 N1 = N1.getOperand(0);
8257 }
8258 } else {
8259 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8260 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8261 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8262 (!(N1.isUndef() || EltsRHS.isZero()) &&
8263 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8264 return false;
8265 }
8266
8267 bool IsUnary = (N0 == N1);
8268
8269 Ops.push_back(N0);
8270 if (!IsUnary)
8271 Ops.push_back(N1);
8272
8273 createPackShuffleMask(VT, Mask, IsUnary);
8274
8275 if (Offset0 || Offset1) {
8276 for (int &M : Mask)
8277 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8278 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8279 ++M;
8280 }
8281 return true;
8282 }
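
When saturation can be ruled out, PACKSS/PACKUS is modeled as a truncation shuffle: each output half takes every other (low) sub-element of the corresponding operand, and the later Offset0/Offset1 fix-up bumps indices by one sub-element when an ashr-by-element-size was peeked through. A single-128-bit-lane sketch of that mask shape (the in-tree createPackShuffleMask additionally handles multiple 128-bit lanes; names here are illustrative):

#include <cstdio>
#include <vector>

// Build the truncation-style mask for a 128-bit PACK: the first half of the
// result takes the even (low) sub-elements of Op0, the second half those of Op1.
std::vector<int> packMask128(int NumDstElts, bool Unary) {
  std::vector<int> Mask;
  int Offset = Unary ? 0 : NumDstElts; // Op1 indices start after Op0's
  for (int Elt = 0; Elt != NumDstElts; Elt += 2)
    Mask.push_back(Elt);
  for (int Elt = 0; Elt != NumDstElts; Elt += 2)
    Mask.push_back(Elt + Offset);
  return Mask;
}

int main() {
  // PACK of two v8i16 values into v16i8:
  // prints 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
  for (int M : packMask128(16, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
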
8283 case X86ISD::VTRUNC: {
8284 SDValue Src = N.getOperand(0);
8285 EVT SrcVT = Src.getValueType();
8286 // Truncated source must be a simple vector.
8287 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8288 (SrcVT.getScalarSizeInBits() % 8) != 0)
8289 return false;
8290 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8291 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8292 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
 8293 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8294 for (unsigned i = 0; i != NumSrcElts; ++i)
8295 Mask.push_back(i * Scale);
8296 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8297 Ops.push_back(Src);
8298 return true;
8299 }
8300 case X86ISD::VSHLI:
8301 case X86ISD::VSRLI: {
8302 uint64_t ShiftVal = N.getConstantOperandVal(1);
8303 // Out of range bit shifts are guaranteed to be zero.
8304 if (NumBitsPerElt <= ShiftVal) {
8305 Mask.append(NumElts, SM_SentinelZero);
8306 return true;
8307 }
8308
8309 // We can only decode 'whole byte' bit shifts as shuffles.
8310 if ((ShiftVal % 8) != 0)
8311 break;
8312
8313 uint64_t ByteShift = ShiftVal / 8;
8314 Ops.push_back(N.getOperand(0));
8315
8316 // Clear mask to all zeros and insert the shifted byte indices.
8317 Mask.append(NumSizeInBytes, SM_SentinelZero);
8318
8319 if (X86ISD::VSHLI == Opcode) {
8320 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8321 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8322 Mask[i + j] = i + j - ByteShift;
8323 } else {
8324 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8325 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8326 Mask[i + j - ByteShift] = i + j;
8327 }
8328 return true;
8329 }
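
Whole-byte VSHLI/VSRLI shifts become byte shuffles: the mask starts all-zero and the surviving byte indices are written at their shifted positions within each element. A standalone model of the two loops above (invented names, plain std::vector):

#include <cstdio>
#include <vector>

static constexpr int SentinelZero = -2; // stand-in for SM_SentinelZero

// Model VSHLI/VSRLI decoding: within each element of NumBytesPerElt bytes,
// move byte indices by ByteShift and leave the vacated bytes as zero.
std::vector<int> byteShiftMask(unsigned NumSizeInBytes, unsigned NumBytesPerElt,
                               unsigned ByteShift, bool IsLeftShift) {
  std::vector<int> Mask(NumSizeInBytes, SentinelZero);
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) {
      if (IsLeftShift)
        Mask[i + j] = i + j - ByteShift;   // bytes move up within the element
      else
        Mask[i + j - ByteShift] = i + j;   // bytes move down within the element
    }
  return Mask;
}

int main() {
  // v2i64 shifted left by 16 bits (2 bytes): each 8-byte element keeps source
  // bytes 0..5 at offsets 2..7 and zeroes bytes 0..1.
  for (int M : byteShiftMask(16, 8, 2, /*IsLeftShift=*/true))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
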
8330 case X86ISD::VROTLI:
8331 case X86ISD::VROTRI: {
8332 // We can only decode 'whole byte' bit rotates as shuffles.
8333 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8334 if ((RotateVal % 8) != 0)
8335 return false;
8336 Ops.push_back(N.getOperand(0));
8337 int Offset = RotateVal / 8;
8338 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8339 for (int i = 0; i != (int)NumElts; ++i) {
8340 int BaseIdx = i * NumBytesPerElt;
8341 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8342 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8343 }
8344 }
8345 return true;
8346 }
8347 case X86ISD::VBROADCAST: {
8348 SDValue Src = N.getOperand(0);
8349 if (!Src.getSimpleValueType().isVector()) {
8350 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8351 !isNullConstant(Src.getOperand(1)) ||
8352 Src.getOperand(0).getValueType().getScalarType() !=
8353 VT.getScalarType())
8354 return false;
8355 Src = Src.getOperand(0);
8356 }
8357 Ops.push_back(Src);
8358 Mask.append(NumElts, 0);
8359 return true;
8360 }
8361 case ISD::ZERO_EXTEND:
8362 case ISD::ANY_EXTEND:
8363 case ISD::ZERO_EXTEND_VECTOR_INREG:
8364 case ISD::ANY_EXTEND_VECTOR_INREG: {
8365 SDValue Src = N.getOperand(0);
8366 EVT SrcVT = Src.getValueType();
8367
8368 // Extended source must be a simple vector.
8369 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8370 (SrcVT.getScalarSizeInBits() % 8) != 0)
8371 return false;
8372
8373 bool IsAnyExtend =
8374 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8375 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8376 IsAnyExtend, Mask);
8377 Ops.push_back(Src);
8378 return true;
8379 }
8380 }
8381
8382 return false;
8383}
8384
8385/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8386static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8387 SmallVectorImpl<int> &Mask) {
8388 int MaskWidth = Mask.size();
8389 SmallVector<SDValue, 16> UsedInputs;
8390 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8391 int lo = UsedInputs.size() * MaskWidth;
8392 int hi = lo + MaskWidth;
8393
8394 // Strip UNDEF input usage.
8395 if (Inputs[i].isUndef())
8396 for (int &M : Mask)
8397 if ((lo <= M) && (M < hi))
8398 M = SM_SentinelUndef;
8399
8400 // Check for unused inputs.
8401 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8402 for (int &M : Mask)
8403 if (lo <= M)
8404 M -= MaskWidth;
8405 continue;
8406 }
8407
8408 // Check for repeated inputs.
8409 bool IsRepeat = false;
8410 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8411 if (UsedInputs[j] != Inputs[i])
8412 continue;
8413 for (int &M : Mask)
8414 if (lo <= M)
8415 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8416 IsRepeat = true;
8417 break;
8418 }
8419 if (IsRepeat)
8420 continue;
8421
8422 UsedInputs.push_back(Inputs[i]);
8423 }
8424 Inputs = UsedInputs;
8425}
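
resolveTargetShuffleInputsAndMask keeps the mask's index-range convention intact while dropping unused inputs and merging repeated ones. A simplified sketch of that bookkeeping, using strings as stand-in inputs and omitting the UNDEF-stripping step (names are invented; not the LLVM API):

#include <cstdio>
#include <string>
#include <vector>

// Indices in [k*Width, (k+1)*Width) refer to the k'th surviving input; dropping
// or merging an input slides all higher index ranges down by Width.
void dedupInputs(std::vector<std::string> &Inputs, std::vector<int> &Mask) {
  int Width = (int)Mask.size();
  std::vector<std::string> Used;
  for (const std::string &In : Inputs) {
    int lo = (int)Used.size() * Width, hi = lo + Width;
    bool Referenced = false;
    for (int M : Mask)
      Referenced |= (lo <= M && M < hi);
    if (!Referenced) {                     // unused: slide later ranges down
      for (int &M : Mask)
        if (lo <= M)
          M -= Width;
      continue;
    }
    bool Repeat = false;
    for (int j = 0, e = (int)Used.size(); j != e; ++j) {
      if (Used[j] != In)
        continue;
      for (int &M : Mask)                  // repeat: remap onto occurrence j
        if (lo <= M)
          M = (M < hi) ? (M - lo) + j * Width : M - Width;
      Repeat = true;
      break;
    }
    if (!Repeat)
      Used.push_back(In);
  }
  Inputs = Used;
}

int main() {
  // Two references to "A" and one unused "B" collapse to a single input.
  std::vector<std::string> Inputs = {"A", "B", "A"};
  std::vector<int> Mask = {0, 1, 8, 9};    // Width = 4; 8,9 refer to the third input
  dedupInputs(Inputs, Mask);
  std::printf("%zu input(s); mask = %d %d %d %d\n", Inputs.size(), Mask[0],
              Mask[1], Mask[2], Mask[3]);  // 1 input(s); mask = 0 1 0 1
  return 0;
}
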
8426
8427/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8428/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8429/// Returns true if the target shuffle mask was decoded.
8430static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8431 SmallVectorImpl<SDValue> &Inputs,
8432 SmallVectorImpl<int> &Mask,
8433 APInt &KnownUndef, APInt &KnownZero,
8434 const SelectionDAG &DAG, unsigned Depth,
8435 bool ResolveKnownElts) {
8436 if (Depth >= SelectionDAG::MaxRecursionDepth)
8437 return false; // Limit search depth.
8438
8439 EVT VT = Op.getValueType();
8440 if (!VT.isSimple() || !VT.isVector())
8441 return false;
8442
8443 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8444 if (ResolveKnownElts)
8445 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8446 return true;
8447 }
8448 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8449 ResolveKnownElts)) {
8450 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8451 return true;
8452 }
8453 return false;
8454}
8455
8456static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8457 SmallVectorImpl<int> &Mask,
8458 const SelectionDAG &DAG, unsigned Depth = 0,
8459 bool ResolveKnownElts = true) {
8460 EVT VT = Op.getValueType();
8461 if (!VT.isSimple() || !VT.isVector())
8462 return false;
8463
8464 APInt KnownUndef, KnownZero;
8465 unsigned NumElts = Op.getValueType().getVectorNumElements();
8466 APInt DemandedElts = APInt::getAllOnes(NumElts);
8467 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8468 KnownZero, DAG, Depth, ResolveKnownElts);
8469}
8470
8471// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8472static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8473 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8474 SelectionDAG &DAG) {
 8475 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
 8476 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
 8477 "Unknown broadcast load type");
8478
 8479 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8480 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8481 return SDValue();
8482
8483 SDValue Ptr =
8484 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8485 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8486 SDValue Ops[] = {Mem->getChain(), Ptr};
8487 SDValue BcstLd = DAG.getMemIntrinsicNode(
8488 Opcode, DL, Tys, Ops, MemVT,
8489 DAG.getMachineFunction().getMachineMemOperand(
8490 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8491 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8492 return BcstLd;
8493}
8494
8495/// Returns the scalar element that will make up the i'th
8496/// element of the result of the vector shuffle.
8497static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8498 SelectionDAG &DAG, unsigned Depth) {
8499 if (Depth >= SelectionDAG::MaxRecursionDepth)
8500 return SDValue(); // Limit search depth.
8501
8502 EVT VT = Op.getValueType();
8503 unsigned Opcode = Op.getOpcode();
8504 unsigned NumElems = VT.getVectorNumElements();
8505
8506 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8507 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8508 int Elt = SV->getMaskElt(Index);
8509
8510 if (Elt < 0)
8511 return DAG.getUNDEF(VT.getVectorElementType());
8512
8513 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8514 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8515 }
8516
8517 // Recurse into target specific vector shuffles to find scalars.
8518 if (isTargetShuffle(Opcode)) {
8519 MVT ShufVT = VT.getSimpleVT();
8520 MVT ShufSVT = ShufVT.getVectorElementType();
8521 int NumElems = (int)ShufVT.getVectorNumElements();
8522 SmallVector<int, 16> ShuffleMask;
8523 SmallVector<SDValue, 16> ShuffleOps;
8524 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8525 ShuffleMask))
8526 return SDValue();
8527
8528 int Elt = ShuffleMask[Index];
8529 if (Elt == SM_SentinelZero)
8530 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8531 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8532 if (Elt == SM_SentinelUndef)
8533 return DAG.getUNDEF(ShufSVT);
8534
 8535 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8536 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8537 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8538 }
8539
8540 // Recurse into insert_subvector base/sub vector to find scalars.
8541 if (Opcode == ISD::INSERT_SUBVECTOR) {
8542 SDValue Vec = Op.getOperand(0);
8543 SDValue Sub = Op.getOperand(1);
8544 uint64_t SubIdx = Op.getConstantOperandVal(2);
8545 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8546
8547 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8548 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8549 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8550 }
8551
8552 // Recurse into concat_vectors sub vector to find scalars.
8553 if (Opcode == ISD::CONCAT_VECTORS) {
8554 EVT SubVT = Op.getOperand(0).getValueType();
8555 unsigned NumSubElts = SubVT.getVectorNumElements();
8556 uint64_t SubIdx = Index / NumSubElts;
8557 uint64_t SubElt = Index % NumSubElts;
8558 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8559 }
8560
8561 // Recurse into extract_subvector src vector to find scalars.
8562 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8563 SDValue Src = Op.getOperand(0);
8564 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8565 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8566 }
8567
8568 // We only peek through bitcasts of the same vector width.
8569 if (Opcode == ISD::BITCAST) {
8570 SDValue Src = Op.getOperand(0);
8571 EVT SrcVT = Src.getValueType();
8572 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8573 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8574 return SDValue();
8575 }
8576
8577 // Actual nodes that may contain scalar elements
8578
8579 // For insert_vector_elt - either return the index matching scalar or recurse
8580 // into the base vector.
8581 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8582 isa<ConstantSDNode>(Op.getOperand(2))) {
8583 if (Op.getConstantOperandAPInt(2) == Index)
8584 return Op.getOperand(1);
8585 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8586 }
8587
8588 if (Opcode == ISD::SCALAR_TO_VECTOR)
8589 return (Index == 0) ? Op.getOperand(0)
8590 : DAG.getUNDEF(VT.getVectorElementType());
8591
8592 if (Opcode == ISD::BUILD_VECTOR)
8593 return Op.getOperand(Index);
8594
8595 return SDValue();
8596}
8597
8598// Use PINSRB/PINSRW/PINSRD to create a build vector.
8599static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8600 unsigned NumNonZero, unsigned NumZero,
8601 SelectionDAG &DAG,
8602 const X86Subtarget &Subtarget) {
8603 MVT VT = Op.getSimpleValueType();
8604 unsigned NumElts = VT.getVectorNumElements();
 8605 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
 8606 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
 8607 "Illegal vector insertion");
8608
8609 SDLoc dl(Op);
8610 SDValue V;
8611 bool First = true;
8612
8613 for (unsigned i = 0; i < NumElts; ++i) {
8614 bool IsNonZero = NonZeroMask[i];
8615 if (!IsNonZero)
8616 continue;
8617
8618 // If the build vector contains zeros or our first insertion is not the
8619 // first index then insert into zero vector to break any register
8620 // dependency else use SCALAR_TO_VECTOR.
8621 if (First) {
8622 First = false;
8623 if (NumZero || 0 != i)
8624 V = getZeroVector(VT, Subtarget, DAG, dl);
8625 else {
 8626 assert(0 == i && "Expected insertion into zero-index");
8627 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8628 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8629 V = DAG.getBitcast(VT, V);
8630 continue;
8631 }
8632 }
8633 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8634 DAG.getIntPtrConstant(i, dl));
8635 }
8636
8637 return V;
8638}
8639
8640/// Custom lower build_vector of v16i8.
8641static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8642 unsigned NumNonZero, unsigned NumZero,
8643 SelectionDAG &DAG,
8644 const X86Subtarget &Subtarget) {
8645 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8646 return SDValue();
8647
8648 // SSE4.1 - use PINSRB to insert each byte directly.
8649 if (Subtarget.hasSSE41())
8650 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8651 Subtarget);
8652
8653 SDLoc dl(Op);
8654 SDValue V;
8655
8656 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8657 for (unsigned i = 0; i < 16; i += 2) {
8658 bool ThisIsNonZero = NonZeroMask[i];
8659 bool NextIsNonZero = NonZeroMask[i + 1];
8660 if (!ThisIsNonZero && !NextIsNonZero)
8661 continue;
8662
8663 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
8664 SDValue Elt;
8665 if (ThisIsNonZero) {
8666 if (NumZero || NextIsNonZero)
8667 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8668 else
8669 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8670 }
8671
8672 if (NextIsNonZero) {
8673 SDValue NextElt = Op.getOperand(i + 1);
8674 if (i == 0 && NumZero)
8675 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8676 else
8677 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8678 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8679 DAG.getConstant(8, dl, MVT::i8));
8680 if (ThisIsNonZero)
8681 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8682 else
8683 Elt = NextElt;
8684 }
8685
8686 // If our first insertion is not the first index or zeros are needed, then
8687 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8688 // elements undefined).
8689 if (!V) {
8690 if (i != 0 || NumZero)
8691 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8692 else {
8693 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8694 V = DAG.getBitcast(MVT::v8i16, V);
8695 continue;
8696 }
8697 }
8698 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8699 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8700 DAG.getIntPtrConstant(i / 2, dl));
8701 }
8702
8703 return DAG.getBitcast(MVT::v16i8, V);
8704}
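
On pre-SSE4.1 targets the v16i8 path above combines each pair of adjacent bytes into one 16-bit word so that a single PINSRW inserts both; the word is built as the low byte ORed with the high byte shifted left by 8. A trivial sketch of that scalar merge (illustrative only, not the DAG form):

#include <cstdint>
#include <cstdio>

// Two adjacent build_vector bytes become one 16-bit word: low | (high << 8).
uint16_t mergeBytePair(uint8_t Lo, uint8_t Hi) {
  return (uint16_t)(Lo | ((uint32_t)Hi << 8));
}

int main() {
  // Bytes 0x34 (element i) and 0x12 (element i+1) become the word 0x1234.
  std::printf("0x%04x\n", mergeBytePair(0x34, 0x12));
  return 0;
}
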
8705
8706/// Custom lower build_vector of v8i16.
8707static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8708 unsigned NumNonZero, unsigned NumZero,
8709 SelectionDAG &DAG,
8710 const X86Subtarget &Subtarget) {
8711 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8712 return SDValue();
8713
8714 // Use PINSRW to insert each byte directly.
8715 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8716 Subtarget);
8717}
8718
8719/// Custom lower build_vector of v4i32 or v4f32.
8720static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8721 const X86Subtarget &Subtarget) {
8722 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8723 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8724 // Because we're creating a less complicated build vector here, we may enable
8725 // further folding of the MOVDDUP via shuffle transforms.
8726 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8727 Op.getOperand(0) == Op.getOperand(2) &&
8728 Op.getOperand(1) == Op.getOperand(3) &&
8729 Op.getOperand(0) != Op.getOperand(1)) {
8730 SDLoc DL(Op);
8731 MVT VT = Op.getSimpleValueType();
8732 MVT EltVT = VT.getVectorElementType();
8733 // Create a new build vector with the first 2 elements followed by undef
8734 // padding, bitcast to v2f64, duplicate, and bitcast back.
8735 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8736 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8737 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8738 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8739 return DAG.getBitcast(VT, Dup);
8740 }
8741
8742 // Find all zeroable elements.
8743 std::bitset<4> Zeroable, Undefs;
8744 for (int i = 0; i < 4; ++i) {
8745 SDValue Elt = Op.getOperand(i);
8746 Undefs[i] = Elt.isUndef();
8747 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8748 }
 8749 assert(Zeroable.size() - Zeroable.count() > 1 &&
 8750 "We expect at least two non-zero elements!");
8751
8752 // We only know how to deal with build_vector nodes where elements are either
8753 // zeroable or extract_vector_elt with constant index.
8754 SDValue FirstNonZero;
8755 unsigned FirstNonZeroIdx;
8756 for (unsigned i = 0; i < 4; ++i) {
8757 if (Zeroable[i])
8758 continue;
8759 SDValue Elt = Op.getOperand(i);
8760 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8761 !isa<ConstantSDNode>(Elt.getOperand(1)))
8762 return SDValue();
8763 // Make sure that this node is extracting from a 128-bit vector.
8764 MVT VT = Elt.getOperand(0).getSimpleValueType();
8765 if (!VT.is128BitVector())
8766 return SDValue();
8767 if (!FirstNonZero.getNode()) {
8768 FirstNonZero = Elt;
8769 FirstNonZeroIdx = i;
8770 }
8771 }
8772
 8773 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8774 SDValue V1 = FirstNonZero.getOperand(0);
8775 MVT VT = V1.getSimpleValueType();
8776
8777 // See if this build_vector can be lowered as a blend with zero.
8778 SDValue Elt;
8779 unsigned EltMaskIdx, EltIdx;
8780 int Mask[4];
8781 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8782 if (Zeroable[EltIdx]) {
8783 // The zero vector will be on the right hand side.
8784 Mask[EltIdx] = EltIdx+4;
8785 continue;
8786 }
8787
8788 Elt = Op->getOperand(EltIdx);
8789 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
8790 EltMaskIdx = Elt.getConstantOperandVal(1);
8791 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8792 break;
8793 Mask[EltIdx] = EltIdx;
8794 }
8795
8796 if (EltIdx == 4) {
8797 // Let the shuffle legalizer deal with blend operations.
8798 SDValue VZeroOrUndef = (Zeroable == Undefs)
8799 ? DAG.getUNDEF(VT)
8800 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8801 if (V1.getSimpleValueType() != VT)
8802 V1 = DAG.getBitcast(VT, V1);
8803 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8804 }
8805
8806 // See if we can lower this build_vector to a INSERTPS.
8807 if (!Subtarget.hasSSE41())
8808 return SDValue();
8809
8810 SDValue V2 = Elt.getOperand(0);
8811 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8812 V1 = SDValue();
8813
8814 bool CanFold = true;
8815 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8816 if (Zeroable[i])
8817 continue;
8818
8819 SDValue Current = Op->getOperand(i);
8820 SDValue SrcVector = Current->getOperand(0);
8821 if (!V1.getNode())
8822 V1 = SrcVector;
8823 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8824 }
8825
8826 if (!CanFold)
8827 return SDValue();
8828
 8829 assert(V1.getNode() && "Expected at least two non-zero elements!");
8830 if (V1.getSimpleValueType() != MVT::v4f32)
8831 V1 = DAG.getBitcast(MVT::v4f32, V1);
8832 if (V2.getSimpleValueType() != MVT::v4f32)
8833 V2 = DAG.getBitcast(MVT::v4f32, V2);
8834
8835 // Ok, we can emit an INSERTPS instruction.
8836 unsigned ZMask = Zeroable.to_ulong();
8837
8838 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
 8839 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8840 SDLoc DL(Op);
8841 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8842 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8843 return DAG.getBitcast(VT, Result);
8844}
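
The INSERTPS immediate built above packs three fields: bits [7:6] select the source element of the second operand, bits [5:4] the destination lane, and bits [3:0] the result lanes to zero. A small sketch of that encoding (function name invented for the example):

#include <cstdint>
#include <cstdio>

// INSERTPS imm8 layout: [7:6] source element of V2, [5:4] destination lane,
// [3:0] zero mask applied to the result lanes.
uint8_t insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZeroLanes) {
  return (uint8_t)((SrcIdx << 6) | (DstIdx << 4) | (ZeroLanes & 0xF));
}

int main() {
  // Take element 2 of V2, write it into lane 1 of V1, and zero lane 3: 0x98.
  std::printf("0x%02x\n", insertPSImm(2, 1, 0x8));
  return 0;
}
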
8845
8846/// Return a vector logical shift node.
8847static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8848 SelectionDAG &DAG, const TargetLowering &TLI,
8849 const SDLoc &dl) {
 8850 assert(VT.is128BitVector() && "Unknown type for VShift");
8851 MVT ShVT = MVT::v16i8;
8852 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8853 SrcOp = DAG.getBitcast(ShVT, SrcOp);
 8854 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8855 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8856 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8857}
8858
8859static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8860 SelectionDAG &DAG) {
8861
8862 // Check if the scalar load can be widened into a vector load. And if
8863 // the address is "base + cst" see if the cst can be "absorbed" into
8864 // the shuffle mask.
8865 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8866 SDValue Ptr = LD->getBasePtr();
8867 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8868 return SDValue();
8869 EVT PVT = LD->getValueType(0);
8870 if (PVT != MVT::i32 && PVT != MVT::f32)
8871 return SDValue();
8872
8873 int FI = -1;
8874 int64_t Offset = 0;
8875 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8876 FI = FINode->getIndex();
8877 Offset = 0;
8878 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8879 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8880 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8881 Offset = Ptr.getConstantOperandVal(1);
8882 Ptr = Ptr.getOperand(0);
8883 } else {
8884 return SDValue();
8885 }
8886
8887 // FIXME: 256-bit vector instructions don't require a strict alignment,
8888 // improve this code to support it better.
8889 Align RequiredAlign(VT.getSizeInBits() / 8);
8890 SDValue Chain = LD->getChain();
8891 // Make sure the stack object alignment is at least 16 or 32.
8892 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8893 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8894 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8895 if (MFI.isFixedObjectIndex(FI)) {
8896 // Can't change the alignment. FIXME: It's possible to compute
8897 // the exact stack offset and reference FI + adjust offset instead.
8898 // If someone *really* cares about this. That's the way to implement it.
8899 return SDValue();
8900 } else {
8901 MFI.setObjectAlignment(FI, RequiredAlign);
8902 }
8903 }
8904
 8905 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8906 // Ptr + (Offset & ~15).
8907 if (Offset < 0)
8908 return SDValue();
8909 if ((Offset % RequiredAlign.value()) & 3)
8910 return SDValue();
8911 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8912 if (StartOffset) {
8913 SDLoc DL(Ptr);
8914 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8915 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8916 }
8917
8918 int EltNo = (Offset - StartOffset) >> 2;
8919 unsigned NumElems = VT.getVectorNumElements();
8920
8921 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8922 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8923 LD->getPointerInfo().getWithOffset(StartOffset));
8924
8925 SmallVector<int, 8> Mask(NumElems, EltNo);
8926
8927 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8928 }
8929
8930 return SDValue();
8931}
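
LowerAsSplatVectorLoad rounds the scalar's offset down to the vector alignment, loads a whole vector there, and splats the element the scalar landed in. A sketch of just that offset arithmetic under those assumptions (invented names; assumes 4-byte elements as in the i32/f32 case above):

#include <cstdint>
#include <cstdio>

// Widen a 4-byte scalar load at base + Offset into an aligned vector load at
// base + StartOffset and splat element EltNo of that vector.
struct SplatPlan { int64_t StartOffset; int EltNo; };

SplatPlan planSplatLoad(int64_t Offset, unsigned RequiredAlign) {
  SplatPlan P;
  P.StartOffset = Offset & ~int64_t(RequiredAlign - 1); // round down to alignment
  P.EltNo = (int)((Offset - P.StartOffset) >> 2);       // 4-byte elements
  return P;
}

int main() {
  // A 16-byte-aligned v4f32 load covering a scalar at offset 40: load at 32,
  // then splat element 2.
  SplatPlan P = planSplatLoad(40, 16);
  std::printf("StartOffset=%lld EltNo=%d\n", (long long)P.StartOffset, P.EltNo);
  return 0;
}
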
8932
 8933 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8934static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8935 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8936 auto *BaseLd = cast<LoadSDNode>(Elt);
8937 if (!BaseLd->isSimple())
8938 return false;
8939 Ld = BaseLd;
8940 ByteOffset = 0;
8941 return true;
8942 }
8943
8944 switch (Elt.getOpcode()) {
8945 case ISD::BITCAST:
8946 case ISD::TRUNCATE:
8947 case ISD::SCALAR_TO_VECTOR:
8948 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8949 case ISD::SRL:
8950 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8951 uint64_t Amt = AmtC->getZExtValue();
8952 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8953 ByteOffset += Amt / 8;
8954 return true;
8955 }
8956 }
8957 break;
8958 case ISD::EXTRACT_VECTOR_ELT:
8959 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8960 SDValue Src = Elt.getOperand(0);
8961 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8962 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8963 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8964 findEltLoadSrc(Src, Ld, ByteOffset)) {
8965 uint64_t Idx = IdxC->getZExtValue();
8966 ByteOffset += Idx * (SrcSizeInBits / 8);
8967 return true;
8968 }
8969 }
8970 break;
8971 }
8972
8973 return false;
8974}
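
findEltLoadSrc accumulates a byte offset while peeking through wrappers: a whole-byte SRL adds Amt/8 bytes and an extract of element Idx adds Idx times the element's byte size (little-endian). A tiny standalone model of that accumulation (invented names):

#include <cstdint>
#include <cstdio>

// (srl X, Amt) with Amt a multiple of 8 reads Amt/8 bytes further into the load.
int64_t byteOffsetForSrl(int64_t Inner, uint64_t ShiftAmtBits) {
  return (ShiftAmtBits % 8) == 0 ? Inner + (int64_t)(ShiftAmtBits / 8) : -1;
}

// Extracting element Idx of a vector of SrcBits-bit elements skips Idx*SrcBits/8 bytes.
int64_t byteOffsetForExtract(int64_t Inner, uint64_t Idx, unsigned SrcBits) {
  return Inner + (int64_t)(Idx * (SrcBits / 8));
}

int main() {
  // trunc(srl(extract_elt(v2i64 load, 1), 32)) reads bytes starting 12 into the load.
  int64_t Off = byteOffsetForExtract(0, /*Idx=*/1, /*SrcBits=*/64); // +8 bytes
  Off = byteOffsetForSrl(Off, /*ShiftAmtBits=*/32);                 // +4 bytes
  std::printf("%lld\n", (long long)Off);                            // prints 12
  return 0;
}
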
8975
8976/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8977/// elements can be replaced by a single large load which has the same value as
8978/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8979///
8980/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8981static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8982 const SDLoc &DL, SelectionDAG &DAG,
8983 const X86Subtarget &Subtarget,
8984 bool IsAfterLegalize) {
8985 if ((VT.getScalarSizeInBits() % 8) != 0)
8986 return SDValue();
8987
8988 unsigned NumElems = Elts.size();
8989
8990 int LastLoadedElt = -1;
8991 APInt LoadMask = APInt::getZero(NumElems);
8992 APInt ZeroMask = APInt::getZero(NumElems);
8993 APInt UndefMask = APInt::getZero(NumElems);
8994
8995 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8996 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8997
8998 // For each element in the initializer, see if we've found a load, zero or an
8999 // undef.
9000 for (unsigned i = 0; i < NumElems; ++i) {
9001 SDValue Elt = peekThroughBitcasts(Elts[i]);
9002 if (!Elt.getNode())
9003 return SDValue();
9004 if (Elt.isUndef()) {
9005 UndefMask.setBit(i);
9006 continue;
9007 }
9008 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9009 ZeroMask.setBit(i);
9010 continue;
9011 }
9012
9013 // Each loaded element must be the correct fractional portion of the
9014 // requested vector load.
9015 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9016 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9017 return SDValue();
9018
9019 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9020 return SDValue();
9021 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9022 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9023 return SDValue();
9024
9025 LoadMask.setBit(i);
9026 LastLoadedElt = i;
9027 }
 9028 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
 9029 LoadMask.countPopulation()) == NumElems &&
 9030 "Incomplete element masks");
9031
9032 // Handle Special Cases - all undef or undef/zero.
9033 if (UndefMask.countPopulation() == NumElems)
9034 return DAG.getUNDEF(VT);
9035 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
9036 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9037 : DAG.getConstantFP(0.0, DL, VT);
9038
9039 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9040 int FirstLoadedElt = LoadMask.countTrailingZeros();
9041 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9042 EVT EltBaseVT = EltBase.getValueType();
 9043 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
 9044 "Register/Memory size mismatch");
9045 LoadSDNode *LDBase = Loads[FirstLoadedElt];
 9046 assert(LDBase && "Did not find base load for merging consecutive loads");
9047 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9048 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9049 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9050 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
 9051 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9052
9053 // TODO: Support offsetting the base load.
9054 if (ByteOffsets[FirstLoadedElt] != 0)
9055 return SDValue();
9056
9057 // Check to see if the element's load is consecutive to the base load
9058 // or offset from a previous (already checked) load.
9059 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9060 LoadSDNode *Ld = Loads[EltIdx];
9061 int64_t ByteOffset = ByteOffsets[EltIdx];
9062 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9063 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9064 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9065 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9066 }
9067 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9068 EltIdx - FirstLoadedElt);
9069 };
9070
9071 // Consecutive loads can contain UNDEFS but not ZERO elements.
 9072 // Consecutive loads with UNDEF and ZERO elements require an additional
 9073 // shuffle stage to clear the ZERO elements.
9074 bool IsConsecutiveLoad = true;
9075 bool IsConsecutiveLoadWithZeros = true;
9076 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9077 if (LoadMask[i]) {
9078 if (!CheckConsecutiveLoad(LDBase, i)) {
9079 IsConsecutiveLoad = false;
9080 IsConsecutiveLoadWithZeros = false;
9081 break;
9082 }
9083 } else if (ZeroMask[i]) {
9084 IsConsecutiveLoad = false;
9085 }
9086 }
9087
9088 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9089 auto MMOFlags = LDBase->getMemOperand()->getFlags();
 9090 assert(LDBase->isSimple() &&
 9091 "Cannot merge volatile or atomic loads.");
9092 SDValue NewLd =
9093 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9094 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9095 MMOFlags);
9096 for (auto *LD : Loads)
9097 if (LD)
9098 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9099 return NewLd;
9100 };
9101
9102 // Check if the base load is entirely dereferenceable.
9103 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9104 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9105
9106 // LOAD - all consecutive load/undefs (must start/end with a load or be
9107 // entirely dereferenceable). If we have found an entire vector of loads and
9108 // undefs, then return a large load of the entire vector width starting at the
9109 // base pointer. If the vector contains zeros, then attempt to shuffle those
9110 // elements.
9111 if (FirstLoadedElt == 0 &&
9112 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9113 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9114 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9115 return SDValue();
9116
9117 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9118 // will lower to regular temporal loads and use the cache.
9119 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
9120 VT.is256BitVector() && !Subtarget.hasInt256())
9121 return SDValue();
9122
9123 if (NumElems == 1)
9124 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9125
9126 if (!ZeroMask)
9127 return CreateLoad(VT, LDBase);
9128
9129 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9130 // vector and a zero vector to clear out the zero elements.
9131 if (!IsAfterLegalize && VT.isVector()) {
9132 unsigned NumMaskElts = VT.getVectorNumElements();
9133 if ((NumMaskElts % NumElems) == 0) {
9134 unsigned Scale = NumMaskElts / NumElems;
9135 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9136 for (unsigned i = 0; i < NumElems; ++i) {
9137 if (UndefMask[i])
9138 continue;
9139 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9140 for (unsigned j = 0; j != Scale; ++j)
9141 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9142 }
9143 SDValue V = CreateLoad(VT, LDBase);
9144 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9145 : DAG.getConstantFP(0.0, DL, VT);
9146 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9147 }
9148 }
9149 }
9150
9151 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9152 if (VT.is256BitVector() || VT.is512BitVector()) {
9153 unsigned HalfNumElems = NumElems / 2;
9154 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9155 EVT HalfVT =
9156 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9157 SDValue HalfLD =
9158 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9159 DAG, Subtarget, IsAfterLegalize);
9160 if (HalfLD)
9161 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9162 HalfLD, DAG.getIntPtrConstant(0, DL));
9163 }
9164 }
9165
9166 // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
9167 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9168 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9169 LoadSizeInBits == 64) &&
9170 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9171 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9172 : MVT::getIntegerVT(LoadSizeInBits);
9173 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9174 // Allow v4f32 on SSE1 only targets.
9175 // FIXME: Add more isel patterns so we can just use VT directly.
9176 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9177 VecVT = MVT::v4f32;
9178 if (TLI.isTypeLegal(VecVT)) {
9179 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9180 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9181 SDValue ResNode = DAG.getMemIntrinsicNode(
9182 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9183 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9184 for (auto *LD : Loads)
9185 if (LD)
9186 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9187 return DAG.getBitcast(VT, ResNode);
9188 }
9189 }
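
The X86ISD::VZEXT_LOAD node built here behaves like movq/movd: only the low 16/32/64 bits are loaded and the remaining lanes are implicitly zero. A rough numeric model of the 64-bit case in plain C++ (not DAG code):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint64_t Memory = 0x1122334455667788ULL;    // the consecutively loaded bits
  uint64_t Vec128[2] = {0, 0};                      // v2i64 destination, upper lane zero
  std::memcpy(&Vec128[0], &Memory, sizeof(Memory)); // models movq xmm, [mem]
  std::printf("lane0=0x%016llX lane1=0x%016llX\n",
              (unsigned long long)Vec128[0], (unsigned long long)Vec128[1]);
  return 0;
}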
9190
9191 // BROADCAST - match the smallest possible repetition pattern, load that
9192 // scalar/subvector element and then broadcast to the entire vector.
9193 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9194 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9195 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9196 unsigned RepeatSize = SubElems * BaseSizeInBits;
9197 unsigned ScalarSize = std::min(RepeatSize, 64u);
9198 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9199 continue;
9200
9201 // Don't attempt a 1:N subvector broadcast - it should be caught by
9202 // combineConcatVectorOps, else will cause infinite loops.
9203 if (RepeatSize > ScalarSize && SubElems == 1)
9204 continue;
9205
9206 bool Match = true;
9207 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9208 for (unsigned i = 0; i != NumElems && Match; ++i) {
9209 if (!LoadMask[i])
9210 continue;
9211 SDValue Elt = peekThroughBitcasts(Elts[i]);
9212 if (RepeatedLoads[i % SubElems].isUndef())
9213 RepeatedLoads[i % SubElems] = Elt;
9214 else
9215 Match &= (RepeatedLoads[i % SubElems] == Elt);
9216 }
9217
9218 // We must have loads at both ends of the repetition.
9219 Match &= !RepeatedLoads.front().isUndef();
9220 Match &= !RepeatedLoads.back().isUndef();
9221 if (!Match)
9222 continue;
9223
9224 EVT RepeatVT =
9225 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9226 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9227 : EVT::getFloatingPointVT(ScalarSize);
9228 if (RepeatSize > ScalarSize)
9229 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9230 RepeatSize / ScalarSize);
9231 EVT BroadcastVT =
9232 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9233 VT.getSizeInBits() / ScalarSize);
9234 if (TLI.isTypeLegal(BroadcastVT)) {
9235 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9236 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9237 SDValue Broadcast = RepeatLoad;
9238 if (RepeatSize > ScalarSize) {
9239 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9240 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9241 } else {
9242 if (!Subtarget.hasAVX2() &&
9243 !X86::mayFoldLoadIntoBroadcastFromMem(
9244 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9245 Subtarget,
9246 /*AssumeSingleUse=*/true))
9247 return SDValue();
9248 Broadcast =
9249 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9250 }
9251 return DAG.getBitcast(VT, Broadcast);
9252 }
9253 }
9254 }
9255 }
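
The loop above looks for the smallest power-of-two sub-pattern that the element list repeats. A standalone sketch of that search (smallestRepeat is a hypothetical helper; 0 stands in for an undef/unloaded element, and the real code additionally clamps the scalar size and checks type legality):

#include <cstdio>
#include <vector>

// Try power-of-two sub-pattern lengths and check every element against its
// slot modulo the pattern length; a pattern also needs loads at both ends.
static unsigned smallestRepeat(const std::vector<int> &Elts) {
  for (unsigned SubElems = 1; SubElems < Elts.size(); SubElems *= 2) {
    std::vector<int> Repeated(SubElems, 0);
    bool Match = true;
    for (unsigned i = 0; i != Elts.size() && Match; ++i) {
      if (!Elts[i])
        continue;                       // skip undef slots
      if (!Repeated[i % SubElems])
        Repeated[i % SubElems] = Elts[i];
      else
        Match &= Repeated[i % SubElems] == Elts[i];
    }
    Match &= Repeated.front() != 0 && Repeated.back() != 0;
    if (Match)
      return SubElems;
  }
  return (unsigned)Elts.size();
}

int main() {
  // <A, B, A, B, A, B, A, B>  ->  a 2-element pattern, broadcastable.
  std::printf("%u\n", smallestRepeat({1, 2, 1, 2, 1, 2, 1, 2})); // prints 2
  return 0;
}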
9256
9257 return SDValue();
9258}
9259
9260 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
9261// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9262// are consecutive, non-overlapping, and in the right order.
9263static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9264 SelectionDAG &DAG,
9265 const X86Subtarget &Subtarget,
9266 bool IsAfterLegalize) {
9267 SmallVector<SDValue, 64> Elts;
9268 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9269 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9270 Elts.push_back(Elt);
9271 continue;
9272 }
9273 return SDValue();
9274 }
9275 assert(Elts.size() == VT.getVectorNumElements());
9276 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9277 IsAfterLegalize);
9278}
9279
9280static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9281 unsigned SplatBitSize, LLVMContext &C) {
9282 unsigned ScalarSize = VT.getScalarSizeInBits();
9283 unsigned NumElm = SplatBitSize / ScalarSize;
9284
9285 SmallVector<Constant *, 32> ConstantVec;
9286 for (unsigned i = 0; i < NumElm; i++) {
9287 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9288 Constant *Const;
9289 if (VT.isFloatingPoint()) {
9290 if (ScalarSize == 16) {
9291 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9292 } else if (ScalarSize == 32) {
9293 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9294 } else {
9295 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9296 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9297 }
9298 } else
9299 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9300 ConstantVec.push_back(Const);
9301 }
9302 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9303}
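
getConstantVector slices the wide splat value into scalar-sized chunks, low bits first, via extractBits. A minimal plain-integer illustration of that slicing (no APInt), assuming a 64-bit pattern split into two i32 constants:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t SplatValue = 0x00000010000000FFULL; // repeated 64-bit pattern
  const unsigned ScalarSize = 32;                    // target element width
  for (unsigned i = 0; i != 64 / ScalarSize; ++i) {
    // Mirrors SplatValue.extractBits(ScalarSize, ScalarSize * i).
    uint32_t Elt = uint32_t(SplatValue >> (ScalarSize * i));
    std::printf("element %u = 0x%08X\n", i, Elt);    // 0x000000FF then 0x00000010
  }
  return 0;
}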
9304
9305static bool isFoldableUseOfShuffle(SDNode *N) {
9306 for (auto *U : N->uses()) {
9307 unsigned Opc = U->getOpcode();
9308 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9309 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9310 return false;
9311 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9312 return false;
9313 if (isTargetShuffle(Opc))
9314 return true;
9315 if (Opc == ISD::BITCAST) // Ignore bitcasts
9316 return isFoldableUseOfShuffle(U);
9317 if (N->hasOneUse()) {
9318 // TODO: There may be some general way to know if an SDNode can
9319 // be folded. We now only know whether an MI is foldable.
9320 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9321 return false;
9322 return true;
9323 }
9324 }
9325 return false;
9326}
9327
9328/// Attempt to use the vbroadcast instruction to generate a splat value
9329/// from a splat BUILD_VECTOR which uses:
9330/// a. A single scalar load, or a constant.
9331/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9332///
9333/// The VBROADCAST node is returned when a pattern is found,
9334/// or SDValue() otherwise.
9335static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9336 const X86Subtarget &Subtarget,
9337 SelectionDAG &DAG) {
9338 // VBROADCAST requires AVX.
9339 // TODO: Splats could be generated for non-AVX CPUs using SSE
9340 // instructions, but there's less potential gain for only 128-bit vectors.
9341 if (!Subtarget.hasAVX())
9342 return SDValue();
9343
9344 MVT VT = BVOp->getSimpleValueType(0);
9345 unsigned NumElts = VT.getVectorNumElements();
9346 SDLoc dl(BVOp);
9347
9348 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9349 "Unsupported vector type for broadcast.");
9350
9351 // See if the build vector is a repeating sequence of scalars (inc. splat).
9352 SDValue Ld;
9353 BitVector UndefElements;
9354 SmallVector<SDValue, 16> Sequence;
9355 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9356 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9357 if (Sequence.size() == 1)
9358 Ld = Sequence[0];
9359 }
9360
9361 // Attempt to use VBROADCASTM
9362 // From this pattern:
9363 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9364 // b. t1 = (build_vector t0 t0)
9365 //
9366 // Create (VBROADCASTM v2i1 X)
9367 if (!Sequence.empty() && Subtarget.hasCDI()) {
9368 // If not a splat, are the upper sequence values zeroable?
9369 unsigned SeqLen = Sequence.size();
9370 bool UpperZeroOrUndef =
9371 SeqLen == 1 ||
9372 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
9373 return !V || V.isUndef() || isNullConstant(V);
9374 });
9375 SDValue Op0 = Sequence[0];
9376 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9377 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9378 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9379 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9380 ? Op0.getOperand(0)
9381 : Op0.getOperand(0).getOperand(0);
9382 MVT MaskVT = BOperand.getSimpleValueType();
9383 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9384 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9385 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9386 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9387 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9388 unsigned Scale = 512 / VT.getSizeInBits();
9389 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9390 }
9391 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9392 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9393 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9394 return DAG.getBitcast(VT, Bcst);
9395 }
9396 }
9397 }
9398
9399 unsigned NumUndefElts = UndefElements.count();
9400 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9401 APInt SplatValue, Undef;
9402 unsigned SplatBitSize;
9403 bool HasUndef;
9404 // Check if this is a repeated constant pattern suitable for broadcasting.
9405 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9406 SplatBitSize > VT.getScalarSizeInBits() &&
9407 SplatBitSize < VT.getSizeInBits()) {
9408 // Avoid replacing with broadcast when it's a use of a shuffle
9409 // instruction to preserve the present custom lowering of shuffles.
9410 if (isFoldableUseOfShuffle(BVOp))
9411 return SDValue();
9412 // replace BUILD_VECTOR with broadcast of the repeated constants.
9413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9414 LLVMContext *Ctx = DAG.getContext();
9415 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9416 if (Subtarget.hasAVX()) {
9417 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9418 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9419 // Splatted value can fit in one INTEGER constant in constant pool.
9420 // Load the constant and broadcast it.
9421 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9422 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9423 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9424 SDValue CP = DAG.getConstantPool(C, PVT);
9425 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9426
9427 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9428 SDVTList Tys =
9429 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9430 SDValue Ops[] = {DAG.getEntryNode(), CP};
9431 MachinePointerInfo MPI =
9432 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9433 SDValue Brdcst = DAG.getMemIntrinsicNode(
9434 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9435 MachineMemOperand::MOLoad);
9436 return DAG.getBitcast(VT, Brdcst);
9437 }
9438 if (SplatBitSize > 64) {
9439 // Load the vector of constants and broadcast it.
9440 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9441 *Ctx);
9442 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9443 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9444 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9445 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9446 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9447 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9448 MachinePointerInfo MPI =
9449 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9450 return DAG.getMemIntrinsicNode(
9451 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9452 MachineMemOperand::MOLoad);
9453 }
9454 }
9455 }
9456
9457 // If we are moving a scalar into a vector (Ld must be set and all elements
9458 // but 1 are undef) and that operation is not obviously supported by
9459 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9460 // That's better than general shuffling and may eliminate a load to GPR and
9461 // move from scalar to vector register.
9462 if (!Ld || NumElts - NumUndefElts != 1)
9463 return SDValue();
9464 unsigned ScalarSize = Ld.getValueSizeInBits();
9465 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9466 return SDValue();
9467 }
9468
9469 bool ConstSplatVal =
9470 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9471 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9472
9473 // TODO: Handle broadcasts of non-constant sequences.
9474
9475 // Make sure that all of the users of a non-constant load are from the
9476 // BUILD_VECTOR node.
9477 // FIXME: Is the use count needed for non-constant, non-load case?
9478 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9479 return SDValue();
9480
9481 unsigned ScalarSize = Ld.getValueSizeInBits();
9482 bool IsGE256 = (VT.getSizeInBits() >= 256);
9483
9484 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9485 // instruction to save 8 or more bytes of constant pool data.
9486 // TODO: If multiple splats are generated to load the same constant,
9487 // it may be detrimental to overall size. There needs to be a way to detect
9488 // that condition to know if this is truly a size win.
9489 bool OptForSize = DAG.shouldOptForSize();
9490
9491 // Handle broadcasting a single constant scalar from the constant pool
9492 // into a vector.
9493 // On Sandybridge (no AVX2), it is still better to load a constant vector
9494 // from the constant pool and not to broadcast it from a scalar.
9495 // But override that restriction when optimizing for size.
9496 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9497 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9498 EVT CVT = Ld.getValueType();
9499 assert(!CVT.isVector() && "Must not broadcast a vector type");
9500
9501 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9502 // For size optimization, also splat v2f64 and v2i64, and for size opt
9503 // with AVX2, also splat i8 and i16.
9504 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9505 if (ScalarSize == 32 ||
9506 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9507 (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
9508 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9509 const Constant *C = nullptr;
9510 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9511 C = CI->getConstantIntValue();
9512 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9513 C = CF->getConstantFPValue();
9514
9515 assert(C && "Invalid constant type");
9516
9517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9518 SDValue CP =
9519 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9520 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9521
9522 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9523 SDValue Ops[] = {DAG.getEntryNode(), CP};
9524 MachinePointerInfo MPI =
9525 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9526 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9527 MPI, Alignment, MachineMemOperand::MOLoad);
9528 }
9529 }
9530
9531 // Handle AVX2 in-register broadcasts.
9532 if (!IsLoad && Subtarget.hasInt256() &&
9533 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9534 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9535
9536 // The scalar source must be a normal load.
9537 if (!IsLoad)
9538 return SDValue();
9539
9540 // Make sure the non-chain result is only used by this build vector.
9541 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9542 return SDValue();
9543
9544 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9545 (Subtarget.hasVLX() && ScalarSize == 64)) {
9546 auto *LN = cast<LoadSDNode>(Ld);
9547 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9548 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9549 SDValue BCast =
9550 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9551 LN->getMemoryVT(), LN->getMemOperand());
9552 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9553 return BCast;
9554 }
9555
9556 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9557 // match double, since there is no vbroadcastsd xmm instruction.
9558 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9559 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9560 auto *LN = cast<LoadSDNode>(Ld);
9561 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9562 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9563 SDValue BCast =
9564 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9565 LN->getMemoryVT(), LN->getMemOperand());
9566 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9567 return BCast;
9568 }
9569
9570 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9571 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9572
9573 // Unsupported broadcast.
9574 return SDValue();
9575}
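
In the constant-splat path above, the repeated pattern is loaded once from the constant pool and broadcast Repeat = VT.getSizeInBits() / SplatBitSize times, then bitcast back to VT. A small numeric sketch under those assumptions (plain C++, not DAG code):

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned VTSizeInBits = 256;          // e.g. a v8i32 build vector
  const unsigned SplatBitSize = 64;           // width of the repeated constant
  const uint64_t C = 0x0000000200000001ULL;   // encodes the i32 pair <1, 2>
  const unsigned Repeat = VTSizeInBits / SplatBitSize;
  uint64_t Lanes[4] = {};                     // Repeat == 4 broadcast copies
  for (unsigned i = 0; i != Repeat; ++i)
    Lanes[i] = C;                             // models a VBROADCAST_LOAD of an i64
  for (uint64_t L : Lanes)
    std::printf("0x%016llX\n", (unsigned long long)L);
  return 0;
}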
9576
9577/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9578/// underlying vector and index.
9579///
9580/// Modifies \p ExtractedFromVec to the real vector and returns the real
9581/// index.
9582static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9583 SDValue ExtIdx) {
9584 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9585 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9586 return Idx;
9587
9588 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9589 // lowered this:
9590 // (extract_vector_elt (v8f32 %1), Constant<6>)
9591 // to:
9592 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9593 // (extract_subvector (v8f32 %0), Constant<4>),
9594 // undef)
9595 // Constant<0>)
9596 // In this case the vector is the extract_subvector expression and the index
9597 // is 2, as specified by the shuffle.
9598 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9599 SDValue ShuffleVec = SVOp->getOperand(0);
9600 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9601 assert(ShuffleVecVT.getVectorElementType() ==
9602 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9603
9604 int ShuffleIdx = SVOp->getMaskElt(Idx);
9605 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9606 ExtractedFromVec = ShuffleVec;
9607 return ShuffleIdx;
9608 }
9609 return Idx;
9610}
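
A standalone model of the index remapping performed here: extracting element Idx from a shuffled vector is the same as extracting element Mask[Idx] from the shuffle's first source, provided Mask[Idx] addresses that source (hypothetical example values):

#include <cstdio>

int main() {
  const int Source[4] = {10, 20, 30, 40};
  const int Mask[4] = {2, 0, 3, 1};       // vector_shuffle<2,0,3,1> of Source
  int Shuffled[4];
  for (int i = 0; i != 4; ++i)
    Shuffled[i] = Source[Mask[i]];
  const int Idx = 0;                      // extract_vector_elt(Shuffled, 0)
  std::printf("%d == %d\n", Shuffled[Idx], Source[Mask[Idx]]); // 30 == 30
  return 0;
}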
9611
9612static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9613 MVT VT = Op.getSimpleValueType();
9614
9615 // Skip if insert_vec_elt is not supported.
9616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9617 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9618 return SDValue();
9619
9620 SDLoc DL(Op);
9621 unsigned NumElems = Op.getNumOperands();
9622
9623 SDValue VecIn1;
9624 SDValue VecIn2;
9625 SmallVector<unsigned, 4> InsertIndices;
9626 SmallVector<int, 8> Mask(NumElems, -1);
9627
9628 for (unsigned i = 0; i != NumElems; ++i) {
9629 unsigned Opc = Op.getOperand(i).getOpcode();
9630
9631 if (Opc == ISD::UNDEF)
9632 continue;
9633
9634 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9635 // Quit if more than 1 element needs inserting.
9636 if (InsertIndices.size() > 1)
9637 return SDValue();
9638
9639 InsertIndices.push_back(i);
9640 continue;
9641 }
9642
9643 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9644 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9645
9646 // Quit if non-constant index.
9647 if (!isa<ConstantSDNode>(ExtIdx))
9648 return SDValue();
9649 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9650
9651 // Quit if extracted from vector of different type.
9652 if (ExtractedFromVec.getValueType() != VT)
9653 return SDValue();
9654
9655 if (!VecIn1.getNode())
9656 VecIn1 = ExtractedFromVec;
9657 else if (VecIn1 != ExtractedFromVec) {
9658 if (!VecIn2.getNode())
9659 VecIn2 = ExtractedFromVec;
9660 else if (VecIn2 != ExtractedFromVec)
9661 // Quit if more than 2 vectors to shuffle
9662 return SDValue();
9663 }
9664
9665 if (ExtractedFromVec == VecIn1)
9666 Mask[i] = Idx;
9667 else if (ExtractedFromVec == VecIn2)
9668 Mask[i] = Idx + NumElems;
9669 }
9670
9671 if (!VecIn1.getNode())
9672 return SDValue();
9673
9674 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9675 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9676
9677 for (unsigned Idx : InsertIndices)
9678 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9679 DAG.getIntPtrConstant(Idx, DL));
9680
9681 return NV;
9682}
9683
9684// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9685static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9686 const X86Subtarget &Subtarget) {
9687
9688 MVT VT = Op.getSimpleValueType();
9689 assert((VT.getVectorElementType() == MVT::i1) &&
9690 "Unexpected type in LowerBUILD_VECTORvXi1!");
9691
9692 SDLoc dl(Op);
9693 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9694 ISD::isBuildVectorAllOnes(Op.getNode()))
9695 return Op;
9696
9697 uint64_t Immediate = 0;
9698 SmallVector<unsigned, 16> NonConstIdx;
9699 bool IsSplat = true;
9700 bool HasConstElts = false;
9701 int SplatIdx = -1;
9702 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9703 SDValue In = Op.getOperand(idx);
9704 if (In.isUndef())
9705 continue;
9706 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9707 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9708 HasConstElts = true;
9709 } else {
9710 NonConstIdx.push_back(idx);
9711 }
9712 if (SplatIdx < 0)
9713 SplatIdx = idx;
9714 else if (In != Op.getOperand(SplatIdx))
9715 IsSplat = false;
9716 }
9717
9718 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
9719 if (IsSplat) {
9720 // The build_vector allows the scalar element to be larger than the vector
9721 // element type. We need to mask it to use as a condition unless we know
9722 // the upper bits are zero.
9723 // FIXME: Use computeKnownBits instead of checking specific opcode?
9724 SDValue Cond = Op.getOperand(SplatIdx);
9725 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9726 if (Cond.getOpcode() != ISD::SETCC)
9727 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9728 DAG.getConstant(1, dl, MVT::i8));
9729
9730 // Perform the select in the scalar domain so we can use cmov.
9731 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9732 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9733 DAG.getAllOnesConstant(dl, MVT::i32),
9734 DAG.getConstant(0, dl, MVT::i32));
9735 Select = DAG.getBitcast(MVT::v32i1, Select);
9736 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9737 } else {
9738 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9739 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9740 DAG.getAllOnesConstant(dl, ImmVT),
9741 DAG.getConstant(0, dl, ImmVT));
9742 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9743 Select = DAG.getBitcast(VecVT, Select);
9744 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9745 DAG.getIntPtrConstant(0, dl));
9746 }
9747 }
9748
9749 // insert elements one by one
9750 SDValue DstVec;
9751 if (HasConstElts) {
9752 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9753 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9754 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9755 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9756 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9757 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9758 } else {
9759 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9760 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9761 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9762 DstVec = DAG.getBitcast(VecVT, Imm);
9763 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9764 DAG.getIntPtrConstant(0, dl));
9765 }
9766 } else
9767 DstVec = DAG.getUNDEF(VT);
9768
9769 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9770 unsigned InsertIdx = NonConstIdx[i];
9771 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9772 Op.getOperand(InsertIdx),
9773 DAG.getIntPtrConstant(InsertIdx, dl));
9774 }
9775 return DstVec;
9776}
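
The constant part of a vXi1 build vector is packed into a scalar immediate, one bit per element, before being bitcast to the mask type. A minimal standalone sketch of that packing (packBoolBuildVector is a hypothetical helper; -1 stands in for an undef element):

#include <cstdint>
#include <cstdio>
#include <vector>

// Mirror of the loop above: bit idx of the immediate holds element idx,
// undef elements contribute nothing.
static uint64_t packBoolBuildVector(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (unsigned Idx = 0; Idx < Elts.size(); ++Idx)
    if (Elts[Idx] >= 0)
      Immediate |= uint64_t(Elts[Idx] & 0x1) << Idx;
  return Immediate;
}

int main() {
  // v8i1 <1,0,1,1, undef,0,0,1>  ->  0b10001101 = 0x8D
  std::printf("0x%llX\n",
              (unsigned long long)packBoolBuildVector({1, 0, 1, 1, -1, 0, 0, 1}));
  return 0;
}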
9777
9778 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9779 switch (Opcode) {
9780 case X86ISD::PACKSS:
9781 case X86ISD::PACKUS:
9782 case X86ISD::FHADD:
9783 case X86ISD::FHSUB:
9784 case X86ISD::HADD:
9785 case X86ISD::HSUB:
9786 return true;
9787 }
9788 return false;
9789}
9790
9791/// This is a helper function of LowerToHorizontalOp().
9792 /// This function checks that the input build_vector \p N implements a
9793/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9794/// may not match the layout of an x86 256-bit horizontal instruction.
9795/// In other words, if this returns true, then some extraction/insertion will
9796/// be required to produce a valid horizontal instruction.
9797///
9798/// Parameter \p Opcode defines the kind of horizontal operation to match.
9799/// For example, if \p Opcode is equal to ISD::ADD, then this function
9800/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9801/// is equal to ISD::SUB, then this function checks if this is a horizontal
9802/// arithmetic sub.
9803///
9804/// This function only analyzes elements of \p N whose indices are
9805/// in range [BaseIdx, LastIdx).
9806///
9807/// TODO: This function was originally used to match both real and fake partial
9808/// horizontal operations, but the index-matching logic is incorrect for that.
9809/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9810/// code because it is only used for partial h-op matching now?
9811static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9812 SelectionDAG &DAG,
9813 unsigned BaseIdx, unsigned LastIdx,
9814 SDValue &V0, SDValue &V1) {
9815 EVT VT = N->getValueType(0);
9816 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9817 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9818 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9819 "Invalid Vector in input!");
9820
9821 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9822 bool CanFold = true;
9823 unsigned ExpectedVExtractIdx = BaseIdx;
9824 unsigned NumElts = LastIdx - BaseIdx;
9825 V0 = DAG.getUNDEF(VT);
9826 V1 = DAG.getUNDEF(VT);
9827
9828 // Check if N implements a horizontal binop.
9829 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9830 SDValue Op = N->getOperand(i + BaseIdx);
9831
9832 // Skip UNDEFs.
9833 if (Op->isUndef()) {
9834 // Update the expected vector extract index.
9835 if (i * 2 == NumElts)
9836 ExpectedVExtractIdx = BaseIdx;
9837 ExpectedVExtractIdx += 2;
9838 continue;
9839 }
9840
9841 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9842
9843 if (!CanFold)
9844 break;
9845
9846 SDValue Op0 = Op.getOperand(0);
9847 SDValue Op1 = Op.getOperand(1);
9848
9849 // Try to match the following pattern:
9850 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9851 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9852 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9853 Op0.getOperand(0) == Op1.getOperand(0) &&
9854 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9855 isa<ConstantSDNode>(Op1.getOperand(1)));
9856 if (!CanFold)
9857 break;
9858
9859 unsigned I0 = Op0.getConstantOperandVal(1);
9860 unsigned I1 = Op1.getConstantOperandVal(1);
9861
9862 if (i * 2 < NumElts) {
9863 if (V0.isUndef()) {
9864 V0 = Op0.getOperand(0);
9865 if (V0.getValueType() != VT)
9866 return false;
9867 }
9868 } else {
9869 if (V1.isUndef()) {
9870 V1 = Op0.getOperand(0);
9871 if (V1.getValueType() != VT)
9872 return false;
9873 }
9874 if (i * 2 == NumElts)
9875 ExpectedVExtractIdx = BaseIdx;
9876 }
9877
9878 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9879 if (I0 == ExpectedVExtractIdx)
9880 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9881 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9882 // Try to match the following dag sequence:
9883 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9884 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9885 } else
9886 CanFold = false;
9887
9888 ExpectedVExtractIdx += 2;
9889 }
9890
9891 return CanFold;
9892}
9893
9894/// Emit a sequence of two 128-bit horizontal add/sub followed by
9895/// a concat_vector.
9896///
9897/// This is a helper function of LowerToHorizontalOp().
9898/// This function expects two 256-bit vectors called V0 and V1.
9899/// At first, each vector is split into two separate 128-bit vectors.
9900/// Then, the resulting 128-bit vectors are used to implement two
9901/// horizontal binary operations.
9902///
9903/// The kind of horizontal binary operation is defined by \p X86Opcode.
9904///
9905 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
9906 /// the two new horizontal binops.
9907 /// When Mode is set, the first horizontal binop dag node takes as input
9908 /// the lower 128 bits of V0 and the upper 128 bits of V0. The second
9909 /// horizontal binop dag node takes as input the lower 128 bits of V1
9910 /// and the upper 128 bits of V1.
9911 /// Example:
9912 /// HADD V0_LO, V0_HI
9913 /// HADD V1_LO, V1_HI
9914 ///
9915 /// Otherwise, the first horizontal binop dag node takes as input the lower
9916 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
9917 /// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
9918 /// Example:
9919 /// HADD V0_LO, V1_LO
9920 /// HADD V0_HI, V1_HI
9921///
9922/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9923/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9924/// the upper 128-bits of the result.
9925static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9926 const SDLoc &DL, SelectionDAG &DAG,
9927 unsigned X86Opcode, bool Mode,
9928 bool isUndefLO, bool isUndefHI) {
9929 MVT VT = V0.getSimpleValueType();
9930 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9931 "Invalid nodes in input!");
9932
9933 unsigned NumElts = VT.getVectorNumElements();
9934 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9935 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9936 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9937 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9938 MVT NewVT = V0_LO.getSimpleValueType();
9939
9940 SDValue LO = DAG.getUNDEF(NewVT);
9941 SDValue HI = DAG.getUNDEF(NewVT);
9942
9943 if (Mode) {
9944 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9945 if (!isUndefLO && !V0->isUndef())
9946 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9947 if (!isUndefHI && !V1->isUndef())
9948 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9949 } else {
9950 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9951 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9952 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9953
9954 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9955 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9956 }
9957
9958 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9959}
9960
9961/// Returns true iff \p BV builds a vector with the result equivalent to
9962 /// the result of an ADDSUB/SUBADD operation.
9963/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9964/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9965/// \p Opnd0 and \p Opnd1.
9966static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9967 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9968 SDValue &Opnd0, SDValue &Opnd1,
9969 unsigned &NumExtracts,
9970 bool &IsSubAdd) {
9971
9972 MVT VT = BV->getSimpleValueType(0);
9973 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9974 return false;
9975
9976 unsigned NumElts = VT.getVectorNumElements();
9977 SDValue InVec0 = DAG.getUNDEF(VT);
9978 SDValue InVec1 = DAG.getUNDEF(VT);
9979
9980 NumExtracts = 0;
9981
9982 // Odd-numbered elements in the input build vector are obtained from
9983 // adding/subtracting two integer/float elements.
9984 // Even-numbered elements in the input build vector are obtained from
9985 // subtracting/adding two integer/float elements.
9986 unsigned Opc[2] = {0, 0};
9987 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9988 SDValue Op = BV->getOperand(i);
9989
9990 // Skip 'undef' values.
9991 unsigned Opcode = Op.getOpcode();
9992 if (Opcode == ISD::UNDEF)
9993 continue;
9994
9995 // Early exit if we found an unexpected opcode.
9996 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9997 return false;
9998
9999 SDValue Op0 = Op.getOperand(0);
10000 SDValue Op1 = Op.getOperand(1);
10001
10002 // Try to match the following pattern:
10003 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10004 // Early exit if we cannot match that sequence.
10005 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10006 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10007 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10008 Op0.getOperand(1) != Op1.getOperand(1))
10009 return false;
10010
10011 unsigned I0 = Op0.getConstantOperandVal(1);
10012 if (I0 != i)
10013 return false;
10014
10015 // We found a valid add/sub node; make sure it's the same opcode as previous
10016 // elements for this parity.
10017 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10018 return false;
10019 Opc[i % 2] = Opcode;
10020
10021 // Update InVec0 and InVec1.
10022 if (InVec0.isUndef()) {
10023 InVec0 = Op0.getOperand(0);
10024 if (InVec0.getSimpleValueType() != VT)
10025 return false;
10026 }
10027 if (InVec1.isUndef()) {
10028 InVec1 = Op1.getOperand(0);
10029 if (InVec1.getSimpleValueType() != VT)
10030 return false;
10031 }
10032
10033 // Make sure that the operands of each add/sub node always
10034 // come from the same pair of vectors.
10035 if (InVec0 != Op0.getOperand(0)) {
10036 if (Opcode == ISD::FSUB)
10037 return false;
10038
10039 // FADD is commutable. Try to commute the operands
10040 // and then test again.
10041 std::swap(Op0, Op1);
10042 if (InVec0 != Op0.getOperand(0))
10043 return false;
10044 }
10045
10046 if (InVec1 != Op1.getOperand(0))
10047 return false;
10048
10049 // Increment the number of extractions done.
10050 ++NumExtracts;
10051 }
10052
10053 // Ensure we have found an opcode for both parities and that they are
10054 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10055 // inputs are undef.
10056 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10057 InVec0.isUndef() || InVec1.isUndef())
10058 return false;
10059
10060 IsSubAdd = Opc[0] == ISD::FADD;
10061
10062 Opnd0 = InVec0;
10063 Opnd1 = InVec1;
10064 return true;
10065}
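
The pattern matched here is the vaddsubps/vaddsubpd lane layout: even-indexed lanes subtract, odd-indexed lanes add, with both operands of every lane taken from the same element index of the two source vectors. A tiny numeric model (plain C++):

#include <cstdio>

int main() {
  const float A[4] = {1.f, 2.f, 3.f, 4.f};
  const float B[4] = {10.f, 20.f, 30.f, 40.f};
  float AddSub[4];
  for (int i = 0; i != 4; ++i)
    AddSub[i] = (i % 2 == 0) ? A[i] - B[i]   // even lane: fsub
                             : A[i] + B[i];  // odd lane:  fadd
  for (float V : AddSub)
    std::printf("%g ", V);                   // -9 22 -27 44, the ADDSUB result
  return 0;
}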
10066
10067 /// Returns true if it is possible to fold MUL and an idiom that has already been
10068/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10069/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10070/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10071///
10072/// Prior to calling this function it should be known that there is some
10073/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10074/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10075/// before replacement of such SDNode with ADDSUB operation. Thus the number
10076/// of \p Opnd0 uses is expected to be equal to 2.
10077/// For example, this function may be called for the following IR:
10078/// %AB = fmul fast <2 x double> %A, %B
10079/// %Sub = fsub fast <2 x double> %AB, %C
10080/// %Add = fadd fast <2 x double> %AB, %C
10081/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10082/// <2 x i32> <i32 0, i32 3>
10083/// There is a def for %Addsub here, which potentially can be replaced by
10084/// X86ISD::ADDSUB operation:
10085/// %Addsub = X86ISD::ADDSUB %AB, %C
10086/// and such ADDSUB can further be replaced with FMADDSUB:
10087/// %Addsub = FMADDSUB %A, %B, %C.
10088///
10089/// The main reason why this method is called before the replacement of the
10090/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10091/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10092/// FMADDSUB is.
10093static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10094 SelectionDAG &DAG,
10095 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10096 unsigned ExpectedUses) {
10097 if (Opnd0.getOpcode() != ISD::FMUL ||
10098 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10099 return false;
10100
10101 // FIXME: These checks must match the similar ones in
10102 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10103 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10104 // or MUL + ADDSUB to FMADDSUB.
10105 const TargetOptions &Options = DAG.getTarget().Options;
10106 bool AllowFusion =
10107 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10108 if (!AllowFusion)
10109 return false;
10110
10111 Opnd2 = Opnd1;
10112 Opnd1 = Opnd0.getOperand(1);
10113 Opnd0 = Opnd0.getOperand(0);
10114
10115 return true;
10116}
10117
10118/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10119/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10120/// X86ISD::FMSUBADD node.
10121static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10122 const X86Subtarget &Subtarget,
10123 SelectionDAG &DAG) {
10124 SDValue Opnd0, Opnd1;
10125 unsigned NumExtracts;
10126 bool IsSubAdd;
10127 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10128 IsSubAdd))
10129 return SDValue();
10130
10131 MVT VT = BV->getSimpleValueType(0);
10132 SDLoc DL(BV);
10133
10134 // Try to generate X86ISD::FMADDSUB node here.
10135 SDValue Opnd2;
10136 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10137 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10138 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10139 }
10140
10141 // We only support ADDSUB.
10142 if (IsSubAdd)
10143 return SDValue();
10144
10145 // There are no known X86 targets with 512-bit ADDSUB instructions!
10146 // Convert to blend(fsub,fadd).
10147 if (VT.is512BitVector()) {
10148 SmallVector<int> Mask;
10149 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10150 Mask.push_back(I);
10151 Mask.push_back(I + E + 1);
10152 }
10153 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10154 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10155 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10156 }
10157
10158 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10159}
10160
10161static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10162 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10163 // Initialize outputs to known values.
10164 MVT VT = BV->getSimpleValueType(0);
10165 HOpcode = ISD::DELETED_NODE;
10166 V0 = DAG.getUNDEF(VT);
10167 V1 = DAG.getUNDEF(VT);
10168
10169 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10170 // half of the result is calculated independently from the 128-bit halves of
10171 // the inputs, so that makes the index-checking logic below more complicated.
10172 unsigned NumElts = VT.getVectorNumElements();
10173 unsigned GenericOpcode = ISD::DELETED_NODE;
10174 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10175 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10176 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10177 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10178 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10179 // Ignore undef elements.
10180 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10181 if (Op.isUndef())
10182 continue;
10183
10184 // If there's an opcode mismatch, we're done.
10185 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10186 return false;
10187
10188 // Initialize horizontal opcode.
10189 if (HOpcode == ISD::DELETED_NODE) {
10190 GenericOpcode = Op.getOpcode();
10191 switch (GenericOpcode) {
10192 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10193 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10194 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10195 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10196 default: return false;
10197 }
10198 }
10199
10200 SDValue Op0 = Op.getOperand(0);
10201 SDValue Op1 = Op.getOperand(1);
10202 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10203 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10204 Op0.getOperand(0) != Op1.getOperand(0) ||
10205 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10206 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10207 return false;
10208
10209 // The source vector is chosen based on which 64-bit half of the
10210 // destination vector is being calculated.
10211 if (j < NumEltsIn64Bits) {
10212 if (V0.isUndef())
10213 V0 = Op0.getOperand(0);
10214 } else {
10215 if (V1.isUndef())
10216 V1 = Op0.getOperand(0);
10217 }
10218
10219 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10220 if (SourceVec != Op0.getOperand(0))
10221 return false;
10222
10223 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10224 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10225 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10226 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10227 (j % NumEltsIn64Bits) * 2;
10228 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10229 continue;
10230
10231 // If this is not a commutative op, this does not match.
10232 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10233 return false;
10234
10235 // Addition is commutative, so try swapping the extract indexes.
10236 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10237 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10238 continue;
10239
10240 // Extract indexes do not match horizontal requirement.
10241 return false;
10242 }
10243 }
10244 // We matched. Opcode and operands are returned by reference as arguments.
10245 return true;
10246}
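
The expected-index rule used above can be spelled out for a 256-bit v8i32 horizontal add: within each 128-bit chunk, the low half of the results reads from V0 and the high half from V1, and each result element consumes extract indices (Expected, Expected + 1). A standalone sketch that prints the full mapping:

#include <cstdio>

int main() {
  const unsigned NumEltsIn128Bits = 4; // v8i32: 4 x i32 per 128-bit chunk
  const unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != 2; ++i)               // two 128-bit chunks
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned Expected = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = (j < NumEltsIn64Bits) ? "V0" : "V1";
      std::printf("result[%u] = %s[%u] + %s[%u]\n",
                  i * NumEltsIn128Bits + j, Src, Expected, Src, Expected + 1);
    }
  return 0;
}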
10247
10248static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10249 SelectionDAG &DAG, unsigned HOpcode,
10250 SDValue V0, SDValue V1) {
10251 // If either input vector is not the same size as the build vector,
10252 // extract/insert the low bits to the correct size.
10253 // This is free (examples: zmm --> xmm, xmm --> ymm).
10254 MVT VT = BV->getSimpleValueType(0);
10255 unsigned Width = VT.getSizeInBits();
10256 if (V0.getValueSizeInBits() > Width)
10257 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10258 else if (V0.getValueSizeInBits() < Width)
10259 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10260
10261 if (V1.getValueSizeInBits() > Width)
10262 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10263 else if (V1.getValueSizeInBits() < Width)
10264 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10265
10266 unsigned NumElts = VT.getVectorNumElements();
10267 APInt DemandedElts = APInt::getAllOnes(NumElts);
10268 for (unsigned i = 0; i != NumElts; ++i)
10269 if (BV->getOperand(i).isUndef())
10270 DemandedElts.clearBit(i);
10271
10272 // If we don't need the upper xmm, then perform as a xmm hop.
10273 unsigned HalfNumElts = NumElts / 2;
10274 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10275 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10276 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10277 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10278 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10279 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10280 }
10281
10282 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10283}
10284
10285/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10286static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10287 const X86Subtarget &Subtarget,
10288 SelectionDAG &DAG) {
10289 // We need at least 2 non-undef elements to make this worthwhile by default.
10290 unsigned NumNonUndefs =
10291 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10292 if (NumNonUndefs < 2)
10293 return SDValue();
10294
10295 // There are 4 sets of horizontal math operations distinguished by type:
10296 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10297 // subtarget feature. Try to match those "native" patterns first.
10298 MVT VT = BV->getSimpleValueType(0);
10299 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10300 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10301 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10302 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10303 unsigned HOpcode;
10304 SDValue V0, V1;
10305 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10306 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10307 }
10308
10309 // Try harder to match 256-bit ops by using extract/concat.
10310 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10311 return SDValue();
10312
10313 // Count the number of UNDEF operands in the input build_vector.
10314 unsigned NumElts = VT.getVectorNumElements();
10315 unsigned Half = NumElts / 2;
10316 unsigned NumUndefsLO = 0;
10317 unsigned NumUndefsHI = 0;
10318 for (unsigned i = 0, e = Half; i != e; ++i)
10319 if (BV->getOperand(i)->isUndef())
10320 NumUndefsLO++;
10321
10322 for (unsigned i = Half, e = NumElts; i != e; ++i)
10323 if (BV->getOperand(i)->isUndef())
10324 NumUndefsHI++;
10325
10326 SDLoc DL(BV);
10327 SDValue InVec0, InVec1;
10328 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10329 SDValue InVec2, InVec3;
10330 unsigned X86Opcode;
10331 bool CanFold = true;
10332
10333 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10334 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10335 InVec3) &&
10336 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10337 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10338 X86Opcode = X86ISD::HADD;
10339 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10340 InVec1) &&
10341 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10342 InVec3) &&
10343 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10344 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10345 X86Opcode = X86ISD::HSUB;
10346 else
10347 CanFold = false;
10348
10349 if (CanFold) {
10350 // Do not try to expand this build_vector into a pair of horizontal
10351 // add/sub if we can emit a pair of scalar add/sub.
10352 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10353 return SDValue();
10354
10355 // Convert this build_vector into a pair of horizontal binops followed by
10356 // a concat vector. We must adjust the outputs from the partial horizontal
10357 // matching calls above to account for undefined vector halves.
10358 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10359 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10360 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10361 bool isUndefLO = NumUndefsLO == Half;
10362 bool isUndefHI = NumUndefsHI == Half;
10363 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10364 isUndefHI);
10365 }
10366 }
10367
10368 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10369 VT == MVT::v16i16) {
10370 unsigned X86Opcode;
10371 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10372 X86Opcode = X86ISD::HADD;
10373 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10374 InVec1))
10375 X86Opcode = X86ISD::HSUB;
10376 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10377 InVec1))
10378 X86Opcode = X86ISD::FHADD;
10379 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10380 InVec1))
10381 X86Opcode = X86ISD::FHSUB;
10382 else
10383 return SDValue();
10384
10385 // Don't try to expand this build_vector into a pair of horizontal add/sub
10386 // if we can simply emit a pair of scalar add/sub.
10387 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10388 return SDValue();
10389
10390 // Convert this build_vector into two horizontal add/sub followed by
10391 // a concat vector.
10392 bool isUndefLO = NumUndefsLO == Half;
10393 bool isUndefHI = NumUndefsHI == Half;
10394 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10395 isUndefLO, isUndefHI);
10396 }
10397
10398 return SDValue();
10399}
10400
10401static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10402 SelectionDAG &DAG);
10403
10404/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10405/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10406 /// just apply the bit operation to the vectors.
10407 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
10408 /// from this, but enough scalar bit operations are created from the later
10409/// legalization + scalarization stages to need basic support.
10410static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10411 const X86Subtarget &Subtarget,
10412 SelectionDAG &DAG) {
10413 SDLoc DL(Op);
10414 MVT VT = Op->getSimpleValueType(0);
10415 unsigned NumElems = VT.getVectorNumElements();
10416 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10417
10418 // Check that all elements have the same opcode.
10419 // TODO: Should we allow UNDEFS and if so how many?
10420 unsigned Opcode = Op->getOperand(0).getOpcode();
10421 for (unsigned i = 1; i < NumElems; ++i)
10422 if (Opcode != Op->getOperand(i).getOpcode())
10423 return SDValue();
10424
10425 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10426 bool IsShift = false;
10427 switch (Opcode) {
10428 default:
10429 return SDValue();
10430 case ISD::SHL:
10431 case ISD::SRL:
10432 case ISD::SRA:
10433 IsShift = true;
10434 break;
10435 case ISD::AND:
10436 case ISD::XOR:
10437 case ISD::OR:
10438 // Don't do this if the buildvector is a splat - we'd replace one
10439 // constant with an entire vector.
10440 if (Op->getSplatValue())
10441 return SDValue();
10442 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10443 return SDValue();
10444 break;
10445 }
10446
10447 SmallVector<SDValue, 4> LHSElts, RHSElts;
10448 for (SDValue Elt : Op->ops()) {
10449 SDValue LHS = Elt.getOperand(0);
10450 SDValue RHS = Elt.getOperand(1);
10451
10452 // We expect the canonicalized RHS operand to be the constant.
10453 if (!isa<ConstantSDNode>(RHS))
10454 return SDValue();
10455
10456 // Extend shift amounts.
10457 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10458 if (!IsShift)
10459 return SDValue();
10460 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10461 }
10462
10463 LHSElts.push_back(LHS);
10464 RHSElts.push_back(RHS);
10465 }
10466
10467 // Limit to shifts by uniform immediates.
10468 // TODO: Only accept vXi8/vXi64 special cases?
10469 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10470 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10471 return SDValue();
10472
10473 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10474 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10475 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10476
10477 if (!IsShift)
10478 return Res;
10479
10480 // Immediately lower the shift to ensure the constant build vector doesn't
10481 // get converted to a constant pool before the shift is lowered.
10482 return LowerShift(Res, Subtarget, DAG);
10483}
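// Worked example of the transform above (illustrative): a v4i32 build_vector
// whose elements are all XORs against constants,
//   (build_vector (xor x0, 1), (xor x1, 2), (xor x2, 4), (xor x3, 8))
// becomes a single vector XOR of two build_vectors,
//   (xor (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))
// provided the original build_vector is not a splat and XOR is legal or
// promotable for v4i32. For shifts the amounts must additionally be uniform,
// and the result is handed straight to LowerShift so the constant operand is
// not turned into a constant-pool load first.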
10484
10485/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10486/// functionality to do this, so it's all zeros, all ones, or some derivation
10487/// that is cheap to calculate.
10488static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10489 const X86Subtarget &Subtarget) {
10490 SDLoc DL(Op);
10491 MVT VT = Op.getSimpleValueType();
10492
10493 // Vectors containing all zeros can be matched by pxor and xorps.
10494 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10495 return Op;
10496
10497 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10498 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10499 // vpcmpeqd on 256-bit vectors.
10500 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10501 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10502 return Op;
10503
10504 return getOnesVector(VT, DAG, DL);
10505 }
10506
10507 return SDValue();
10508}
10509
10510/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10511/// from a vector of source values and a vector of extraction indices.
10512/// The vectors might be manipulated to match the type of the permute op.
10513static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10514 SDLoc &DL, SelectionDAG &DAG,
10515 const X86Subtarget &Subtarget) {
10516 MVT ShuffleVT = VT;
10517 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10518 unsigned NumElts = VT.getVectorNumElements();
10519 unsigned SizeInBits = VT.getSizeInBits();
10520
10521 // Adjust IndicesVec to match VT size.
10522   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10523          "Illegal variable permute mask size");
10524 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10525 // Narrow/widen the indices vector to the correct size.
10526 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10527 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10528 NumElts * VT.getScalarSizeInBits());
10529 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10530 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10531 SDLoc(IndicesVec), SizeInBits);
10532 // Zero-extend the index elements within the vector.
10533 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10534 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10535 IndicesVT, IndicesVec);
10536 }
10537 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10538
10539   // Handle a SrcVec that doesn't match the VT size.
10540 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10541 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10542 // Handle larger SrcVec by treating it as a larger permute.
10543 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10544 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10545 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10546 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10547 Subtarget, DAG, SDLoc(IndicesVec));
10548 SDValue NewSrcVec =
10549 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10550 if (NewSrcVec)
10551 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10552 return SDValue();
10553 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10554 // Widen smaller SrcVec to match VT.
10555 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10556 } else
10557 return SDValue();
10558 }
10559
10560 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10561     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10562 EVT SrcVT = Idx.getValueType();
10563 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10564 uint64_t IndexScale = 0;
10565 uint64_t IndexOffset = 0;
10566
10567 // If we're scaling a smaller permute op, then we need to repeat the
10568 // indices, scaling and offsetting them as well.
10569 // e.g. v4i32 -> v16i8 (Scale = 4)
10570 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10571 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10572 for (uint64_t i = 0; i != Scale; ++i) {
10573 IndexScale |= Scale << (i * NumDstBits);
10574 IndexOffset |= i << (i * NumDstBits);
10575 }
10576
10577 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10578 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10579 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10580 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10581 return Idx;
10582 };
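// Worked example for ScaleIndices (illustrative): with Scale = 4 and
// NumDstBits = 8 (the v4i32 -> v16i8 case from the comment above),
//   IndexScale  = 0x04040404 and IndexOffset = 0x03020100,
// so a v4i32 index element holding 2 becomes
//   2 * 0x04040404 + 0x03020100 = 0x0B0A0908,
// i.e. the four byte indices 8, 9, 10, 11 that make up source element 2 when
// the vector is viewed as v16i8.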
10583
10584 unsigned Opcode = 0;
10585 switch (VT.SimpleTy) {
10586 default:
10587 break;
10588 case MVT::v16i8:
10589 if (Subtarget.hasSSSE3())
10590 Opcode = X86ISD::PSHUFB;
10591 break;
10592 case MVT::v8i16:
10593 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10594 Opcode = X86ISD::VPERMV;
10595 else if (Subtarget.hasSSSE3()) {
10596 Opcode = X86ISD::PSHUFB;
10597 ShuffleVT = MVT::v16i8;
10598 }
10599 break;
10600 case MVT::v4f32:
10601 case MVT::v4i32:
10602 if (Subtarget.hasAVX()) {
10603 Opcode = X86ISD::VPERMILPV;
10604 ShuffleVT = MVT::v4f32;
10605 } else if (Subtarget.hasSSSE3()) {
10606 Opcode = X86ISD::PSHUFB;
10607 ShuffleVT = MVT::v16i8;
10608 }
10609 break;
10610 case MVT::v2f64:
10611 case MVT::v2i64:
10612 if (Subtarget.hasAVX()) {
10613 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10614 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10615 Opcode = X86ISD::VPERMILPV;
10616 ShuffleVT = MVT::v2f64;
10617 } else if (Subtarget.hasSSE41()) {
10618 // SSE41 can compare v2i64 - select between indices 0 and 1.
10619 return DAG.getSelectCC(
10620 DL, IndicesVec,
10621 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10622 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10623 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10624 ISD::CondCode::SETEQ);
10625 }
10626 break;
10627 case MVT::v32i8:
10628 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10629 Opcode = X86ISD::VPERMV;
10630 else if (Subtarget.hasXOP()) {
10631 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10632 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10633 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10634 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10635 return DAG.getNode(
10636 ISD::CONCAT_VECTORS, DL, VT,
10637 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10638 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10639 } else if (Subtarget.hasAVX()) {
10640 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10641 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10642 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10643 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10644 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10645 ArrayRef<SDValue> Ops) {
10646 // Permute Lo and Hi and then select based on index range.
10647 // This works as SHUFB uses bits[3:0] to permute elements and we don't
10648 // care about the bit[7] as its just an index vector.
10649 SDValue Idx = Ops[2];
10650 EVT VT = Idx.getValueType();
10651 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10652 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10653 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10654 ISD::CondCode::SETGT);
10655 };
10656 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10657 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10658 PSHUFBBuilder);
10659 }
10660 break;
10661 case MVT::v16i16:
10662 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10663 Opcode = X86ISD::VPERMV;
10664 else if (Subtarget.hasAVX()) {
10665 // Scale to v32i8 and perform as v32i8.
10666 IndicesVec = ScaleIndices(IndicesVec, 2);
10667 return DAG.getBitcast(
10668 VT, createVariablePermute(
10669 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10670 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10671 }
10672 break;
10673 case MVT::v8f32:
10674 case MVT::v8i32:
10675 if (Subtarget.hasAVX2())
10676 Opcode = X86ISD::VPERMV;
10677 else if (Subtarget.hasAVX()) {
10678 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10679 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10680 {0, 1, 2, 3, 0, 1, 2, 3});
10681 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10682 {4, 5, 6, 7, 4, 5, 6, 7});
10683 if (Subtarget.hasXOP())
10684 return DAG.getBitcast(
10685 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10686 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10687 // Permute Lo and Hi and then select based on index range.
10688 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10689 SDValue Res = DAG.getSelectCC(
10690 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10691 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10692 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10693 ISD::CondCode::SETGT);
10694 return DAG.getBitcast(VT, Res);
10695 }
10696 break;
10697 case MVT::v4i64:
10698 case MVT::v4f64:
10699 if (Subtarget.hasAVX512()) {
10700 if (!Subtarget.hasVLX()) {
10701 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10702 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10703 SDLoc(SrcVec));
10704 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10705 DAG, SDLoc(IndicesVec));
10706 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10707 DAG, Subtarget);
10708 return extract256BitVector(Res, 0, DAG, DL);
10709 }
10710 Opcode = X86ISD::VPERMV;
10711 } else if (Subtarget.hasAVX()) {
10712 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10713 SDValue LoLo =
10714 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10715 SDValue HiHi =
10716 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10717 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10718 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10719 if (Subtarget.hasXOP())
10720 return DAG.getBitcast(
10721 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10722 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10723 // Permute Lo and Hi and then select based on index range.
10724 // This works as VPERMILPD only uses index bit[1] to permute elements.
10725 SDValue Res = DAG.getSelectCC(
10726 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10727 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10728 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10729 ISD::CondCode::SETGT);
10730 return DAG.getBitcast(VT, Res);
10731 }
10732 break;
10733 case MVT::v64i8:
10734 if (Subtarget.hasVBMI())
10735 Opcode = X86ISD::VPERMV;
10736 break;
10737 case MVT::v32i16:
10738 if (Subtarget.hasBWI())
10739 Opcode = X86ISD::VPERMV;
10740 break;
10741 case MVT::v16f32:
10742 case MVT::v16i32:
10743 case MVT::v8f64:
10744 case MVT::v8i64:
10745 if (Subtarget.hasAVX512())
10746 Opcode = X86ISD::VPERMV;
10747 break;
10748 }
10749 if (!Opcode)
10750 return SDValue();
10751
10752   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10753          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10754          "Illegal variable permute shuffle type");
10755
10756 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10757 if (Scale > 1)
10758 IndicesVec = ScaleIndices(IndicesVec, Scale);
10759
10760 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10761 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10762
10763 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10764 SDValue Res = Opcode == X86ISD::VPERMV
10765 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10766 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10767 return DAG.getBitcast(VT, Res);
10768}
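// Worked example (illustrative): for VT = MVT::v8i16 on a target with SSSE3
// but without VLX+BWI, the switch above selects Opcode = X86ISD::PSHUFB with
// ShuffleVT = MVT::v16i8. Scale is then 16 / 8 = 2, so each i16 index i is
// rewritten by ScaleIndices into the byte-index pair (2*i, 2*i + 1), the
// source and indices are bitcast to v16i8, and a single PSHUFB performs the
// variable permute before the result is bitcast back to v8i16.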
10769
10770// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10771// reasoned to be a permutation of a vector by indices in a non-constant vector.
10772// (build_vector (extract_elt V, (extract_elt I, 0)),
10773// (extract_elt V, (extract_elt I, 1)),
10774// ...
10775// ->
10776// (vpermv I, V)
10777//
10778// TODO: Handle undefs
10779// TODO: Utilize pshufb and zero mask blending to support more efficient
10780// construction of vectors with constant-0 elements.
10781static SDValue
10782LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10783 const X86Subtarget &Subtarget) {
10784 SDValue SrcVec, IndicesVec;
10785 // Check for a match of the permute source vector and permute index elements.
10786 // This is done by checking that the i-th build_vector operand is of the form:
10787 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10788 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10789 SDValue Op = V.getOperand(Idx);
10790 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10791 return SDValue();
10792
10793 // If this is the first extract encountered in V, set the source vector,
10794 // otherwise verify the extract is from the previously defined source
10795 // vector.
10796 if (!SrcVec)
10797 SrcVec = Op.getOperand(0);
10798 else if (SrcVec != Op.getOperand(0))
10799 return SDValue();
10800 SDValue ExtractedIndex = Op->getOperand(1);
10801 // Peek through extends.
10802 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10803 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10804 ExtractedIndex = ExtractedIndex.getOperand(0);
10805 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10806 return SDValue();
10807
10808 // If this is the first extract from the index vector candidate, set the
10809 // indices vector, otherwise verify the extract is from the previously
10810 // defined indices vector.
10811 if (!IndicesVec)
10812 IndicesVec = ExtractedIndex.getOperand(0);
10813 else if (IndicesVec != ExtractedIndex.getOperand(0))
10814 return SDValue();
10815
10816 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10817 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10818 return SDValue();
10819 }
10820
10821 SDLoc DL(V);
10822 MVT VT = V.getSimpleValueType();
10823 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10824}
10825
10826SDValue
10827X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10828 SDLoc dl(Op);
10829
10830 MVT VT = Op.getSimpleValueType();
10831 MVT EltVT = VT.getVectorElementType();
10832 unsigned NumElems = Op.getNumOperands();
10833
10834 // Generate vectors for predicate vectors.
10835 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10836 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10837
10838 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10839 return VectorConstant;
10840
10841 unsigned EVTBits = EltVT.getSizeInBits();
10842 APInt UndefMask = APInt::getZero(NumElems);
10843 APInt ZeroMask = APInt::getZero(NumElems);
10844 APInt NonZeroMask = APInt::getZero(NumElems);
10845 bool IsAllConstants = true;
10846 SmallSet<SDValue, 8> Values;
10847 unsigned NumConstants = NumElems;
10848 for (unsigned i = 0; i < NumElems; ++i) {
10849 SDValue Elt = Op.getOperand(i);
10850 if (Elt.isUndef()) {
10851 UndefMask.setBit(i);
10852 continue;
10853 }
10854 Values.insert(Elt);
10855 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10856 IsAllConstants = false;
10857 NumConstants--;
10858 }
10859 if (X86::isZeroNode(Elt)) {
10860 ZeroMask.setBit(i);
10861 } else {
10862 NonZeroMask.setBit(i);
10863 }
10864 }
10865
10866 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10867 if (NonZeroMask == 0) {
10868     assert(UndefMask.isAllOnes() && "Fully undef mask expected");
10869 return DAG.getUNDEF(VT);
10870 }
10871
10872 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10873
10874 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10875 // lowering to a smaller build vector and padding with undef/zero.
10876 if ((VT.is256BitVector() || VT.is512BitVector()) &&
10877 !isFoldableUseOfShuffle(BV)) {
10878 unsigned UpperElems = NumElems / 2;
10879 APInt UndefOrZeroMask = UndefMask | ZeroMask;
10880 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10881 if (NumUpperUndefsOrZeros >= UpperElems) {
10882 if (VT.is512BitVector() &&
10883 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10884 UpperElems = NumElems - (NumElems / 4);
10885 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10886 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10887 SDValue NewBV =
10888 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10889 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10890 }
10891 }
10892
10893 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10894 return AddSub;
10895 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10896 return HorizontalOp;
10897 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10898 return Broadcast;
10899 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10900 return BitOp;
10901
10902 unsigned NumZero = ZeroMask.countPopulation();
10903 unsigned NumNonZero = NonZeroMask.countPopulation();
10904
10905 // If we are inserting one variable into a vector of non-zero constants, try
10906 // to avoid loading each constant element as a scalar. Load the constants as a
10907 // vector and then insert the variable scalar element. If insertion is not
10908 // supported, fall back to a shuffle to get the scalar blended with the
10909 // constants. Insertion into a zero vector is handled as a special-case
10910 // somewhere below here.
10911 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10912 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10913 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10914 // Create an all-constant vector. The variable element in the old
10915 // build vector is replaced by undef in the constant vector. Save the
10916 // variable scalar element and its index for use in the insertelement.
10917 LLVMContext &Context = *DAG.getContext();
10918 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10919 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10920 SDValue VarElt;
10921 SDValue InsIndex;
10922 for (unsigned i = 0; i != NumElems; ++i) {
10923 SDValue Elt = Op.getOperand(i);
10924 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10925 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10926 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10927 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10928 else if (!Elt.isUndef()) {
10929       assert(!VarElt.getNode() && !InsIndex.getNode() &&
10930              "Expected one variable element in this vector");
10931 VarElt = Elt;
10932 InsIndex = DAG.getVectorIdxConstant(i, dl);
10933 }
10934 }
10935 Constant *CV = ConstantVector::get(ConstVecOps);
10936 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10937
10938 // The constants we just created may not be legal (eg, floating point). We
10939     // must lower the vector right here because we cannot guarantee that we'll
10940 // legalize it before loading it. This is also why we could not just create
10941 // a new build vector here. If the build vector contains illegal constants,
10942 // it could get split back up into a series of insert elements.
10943 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10944 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10945 MachineFunction &MF = DAG.getMachineFunction();
10946 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10947 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10948 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10949 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10950 if (InsertC < NumEltsInLow128Bits)
10951 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10952
10953 // There's no good way to insert into the high elements of a >128-bit
10954 // vector, so use shuffles to avoid an extract/insert sequence.
10955     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10956     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10957 SmallVector<int, 8> ShuffleMask;
10958 unsigned NumElts = VT.getVectorNumElements();
10959 for (unsigned i = 0; i != NumElts; ++i)
10960 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10961 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10962 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10963 }
10964
10965 // Special case for single non-zero, non-undef, element.
10966 if (NumNonZero == 1) {
10967 unsigned Idx = NonZeroMask.countTrailingZeros();
10968 SDValue Item = Op.getOperand(Idx);
10969
10970 // If we have a constant or non-constant insertion into the low element of
10971 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10972 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10973 // depending on what the source datatype is.
10974 if (Idx == 0) {
10975 if (NumZero == 0)
10976 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10977
10978 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
10979 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
10980 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
10981         assert((VT.is128BitVector() || VT.is256BitVector() ||
10982                 VT.is512BitVector()) &&
10983                "Expected an SSE value type!");
10984 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10985 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
10986 // zero vector.
10987 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10988 }
10989
10990 // We can't directly insert an i8 or i16 into a vector, so zero extend
10991 // it to i32 first.
10992 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10993 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10994 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10995 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10996 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10997 return DAG.getBitcast(VT, Item);
10998 }
10999 }
11000
11001 // Is it a vector logical left shift?
11002 if (NumElems == 2 && Idx == 1 &&
11003 X86::isZeroNode(Op.getOperand(0)) &&
11004 !X86::isZeroNode(Op.getOperand(1))) {
11005 unsigned NumBits = VT.getSizeInBits();
11006 return getVShift(true, VT,
11007 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11008 VT, Op.getOperand(1)),
11009 NumBits/2, DAG, *this, dl);
11010 }
11011
11012 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11013 return SDValue();
11014
11015 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11016 // is a non-constant being inserted into an element other than the low one,
11017 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11018 // movd/movss) to move this into the low element, then shuffle it into
11019 // place.
11020 if (EVTBits == 32) {
11021 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11022 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11023 }
11024 }
11025
11026 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11027 if (Values.size() == 1) {
11028 if (EVTBits == 32) {
11029 // Instead of a shuffle like this:
11030 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11031 // Check if it's possible to issue this instead.
11032       // shuffle (vload ptr), undef, <1, 1, 1, 1>
11033 unsigned Idx = NonZeroMask.countTrailingZeros();
11034 SDValue Item = Op.getOperand(Idx);
11035 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11036 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11037 }
11038 return SDValue();
11039 }
11040
11041 // A vector full of immediates; various special cases are already
11042 // handled, so this is best done with a single constant-pool load.
11043 if (IsAllConstants)
11044 return SDValue();
11045
11046 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11047 return V;
11048
11049 // See if we can use a vector load to get all of the elements.
11050 {
11051 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11052 if (SDValue LD =
11053 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11054 return LD;
11055 }
11056
11057 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11058 // build_vector and broadcast it.
11059 // TODO: We could probably generalize this more.
11060 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11061 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11062 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11063 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11064 // Make sure all the even/odd operands match.
11065 for (unsigned i = 2; i != NumElems; ++i)
11066 if (Ops[i % 2] != Op.getOperand(i))
11067 return false;
11068 return true;
11069 };
11070 if (CanSplat(Op, NumElems, Ops)) {
11071 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11072 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11073 // Create a new build vector and cast to v2i64/v2f64.
11074 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11075 DAG.getBuildVector(NarrowVT, dl, Ops));
11076 // Broadcast from v2i64/v2f64 and cast to final VT.
11077 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11078 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11079 NewBV));
11080 }
11081 }
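// Worked example for the pair-splat path above (illustrative): a v8i32
// build_vector of the form <a, b, a, b, a, b, a, b> is rebuilt as the
// narrower v4i32 <a, b, undef, undef>, bitcast to v2i64, broadcast with
// X86ISD::VBROADCAST to v4i64, and finally bitcast back to v8i32.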
11082
11083 // For AVX-length vectors, build the individual 128-bit pieces and use
11084 // shuffles to put them in place.
11085 if (VT.getSizeInBits() > 128) {
11086 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11087
11088 // Build both the lower and upper subvector.
11089 SDValue Lower =
11090 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11091 SDValue Upper = DAG.getBuildVector(
11092 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11093
11094 // Recreate the wider vector with the lower and upper part.
11095 return concatSubVectors(Lower, Upper, DAG, dl);
11096 }
11097
11098 // Let legalizer expand 2-wide build_vectors.
11099 if (EVTBits == 64) {
11100 if (NumNonZero == 1) {
11101 // One half is zero or undef.
11102 unsigned Idx = NonZeroMask.countTrailingZeros();
11103 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11104 Op.getOperand(Idx));
11105 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11106 }
11107 return SDValue();
11108 }
11109
11110 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11111 if (EVTBits == 8 && NumElems == 16)
11112 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11113 DAG, Subtarget))
11114 return V;
11115
11116 if (EltVT == MVT::i16 && NumElems == 8)
11117 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11118 DAG, Subtarget))
11119 return V;
11120
11121 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11122 if (EVTBits == 32 && NumElems == 4)
11123 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11124 return V;
11125
11126 // If element VT is == 32 bits, turn it into a number of shuffles.
11127 if (NumElems == 4 && NumZero > 0) {
11128 SmallVector<SDValue, 8> Ops(NumElems);
11129 for (unsigned i = 0; i < 4; ++i) {
11130 bool isZero = !NonZeroMask[i];
11131 if (isZero)
11132 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11133 else
11134 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11135 }
11136
11137 for (unsigned i = 0; i < 2; ++i) {
11138 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11139       default: llvm_unreachable("Unexpected NonZero count");
11140 case 0:
11141 Ops[i] = Ops[i*2]; // Must be a zero vector.
11142 break;
11143 case 1:
11144 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11145 break;
11146 case 2:
11147 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11148 break;
11149 case 3:
11150 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11151 break;
11152 }
11153 }
11154
11155 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11156 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11157 int MaskVec[] = {
11158 Reverse1 ? 1 : 0,
11159 Reverse1 ? 0 : 1,
11160 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11161 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11162 };
11163 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11164 }
11165
11166   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11167
11168   // Check for a build vector built from mostly a shuffle plus a few inserts.
11169 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11170 return Sh;
11171
11172 // For SSE 4.1, use insertps to put the high elements into the low element.
11173 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11174 SDValue Result;
11175 if (!Op.getOperand(0).isUndef())
11176 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11177 else
11178 Result = DAG.getUNDEF(VT);
11179
11180 for (unsigned i = 1; i < NumElems; ++i) {
11181 if (Op.getOperand(i).isUndef()) continue;
11182 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11183 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11184 }
11185 return Result;
11186 }
11187
11188   // Otherwise, expand into a number of unpckl*; start by extending each of
11189 // our (non-undef) elements to the full vector width with the element in the
11190 // bottom slot of the vector (which generates no code for SSE).
11191 SmallVector<SDValue, 8> Ops(NumElems);
11192 for (unsigned i = 0; i < NumElems; ++i) {
11193 if (!Op.getOperand(i).isUndef())
11194 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11195 else
11196 Ops[i] = DAG.getUNDEF(VT);
11197 }
11198
11199 // Next, we iteratively mix elements, e.g. for v4f32:
11200 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11201 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11202 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11203 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11204 // Generate scaled UNPCKL shuffle mask.
11205 SmallVector<int, 16> Mask;
11206 for(unsigned i = 0; i != Scale; ++i)
11207 Mask.push_back(i);
11208 for (unsigned i = 0; i != Scale; ++i)
11209 Mask.push_back(NumElems+i);
11210 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11211
11212 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11213 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11214 }
11215 return Ops[0];
11216}
11217
11218// 256-bit AVX can use the vinsertf128 instruction
11219// to create 256-bit vectors from two other 128-bit ones.
11220// TODO: Detect subvector broadcast here instead of DAG combine?
11221static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11222 const X86Subtarget &Subtarget) {
11223 SDLoc dl(Op);
11224 MVT ResVT = Op.getSimpleValueType();
11225
11226   assert((ResVT.is256BitVector() ||
11227           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11228
11229 unsigned NumOperands = Op.getNumOperands();
11230 unsigned NumZero = 0;
11231 unsigned NumNonZero = 0;
11232 unsigned NonZeros = 0;
11233 for (unsigned i = 0; i != NumOperands; ++i) {
11234 SDValue SubVec = Op.getOperand(i);
11235 if (SubVec.isUndef())
11236 continue;
11237 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11238 ++NumZero;
11239 else {
11240       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11241 NonZeros |= 1 << i;
11242 ++NumNonZero;
11243 }
11244 }
11245
11246 // If we have more than 2 non-zeros, build each half separately.
11247 if (NumNonZero > 2) {
11248 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11249 ArrayRef<SDUse> Ops = Op->ops();
11250 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11251 Ops.slice(0, NumOperands/2));
11252 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11253 Ops.slice(NumOperands/2));
11254 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11255 }
11256
11257 // Otherwise, build it up through insert_subvectors.
11258 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11259 : DAG.getUNDEF(ResVT);
11260
11261 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11262 unsigned NumSubElems = SubVT.getVectorNumElements();
11263 for (unsigned i = 0; i != NumOperands; ++i) {
11264 if ((NonZeros & (1 << i)) == 0)
11265 continue;
11266
11267 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11268 Op.getOperand(i),
11269 DAG.getIntPtrConstant(i * NumSubElems, dl));
11270 }
11271
11272 return Vec;
11273}
11274
11275// Returns true if the given node is a type promotion (by concatenating i1
11276// zeros) of the result of a node that already zeros all upper bits of
11277// k-register.
11278// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11279static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11280 const X86Subtarget &Subtarget,
11281 SelectionDAG & DAG) {
11282 SDLoc dl(Op);
11283 MVT ResVT = Op.getSimpleValueType();
11284 unsigned NumOperands = Op.getNumOperands();
11285
11286   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11287          "Unexpected number of operands in CONCAT_VECTORS");
11288
11289 uint64_t Zeros = 0;
11290 uint64_t NonZeros = 0;
11291 for (unsigned i = 0; i != NumOperands; ++i) {
11292 SDValue SubVec = Op.getOperand(i);
11293 if (SubVec.isUndef())
11294 continue;
11295     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11296 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11297 Zeros |= (uint64_t)1 << i;
11298 else
11299 NonZeros |= (uint64_t)1 << i;
11300 }
11301
11302 unsigned NumElems = ResVT.getVectorNumElements();
11303
11304   // If we are inserting a non-zero vector and there are zeros in the LSBs and
11305   // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11306   // insert_subvector will give us two kshifts.
11307 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11308 Log2_64(NonZeros) != NumOperands - 1) {
11309 MVT ShiftVT = ResVT;
11310 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11311 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11312 unsigned Idx = Log2_64(NonZeros);
11313 SDValue SubVec = Op.getOperand(Idx);
11314 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11315 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11316 DAG.getUNDEF(ShiftVT), SubVec,
11317 DAG.getIntPtrConstant(0, dl));
11318 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11319 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11320 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11321 DAG.getIntPtrConstant(0, dl));
11322 }
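// Worked example for the KSHIFTL path above (illustrative): concatenating
// four v2i1 operands (zero, X, undef, undef) into a v8i1 gives Zeros = 0b0001
// and NonZeros = 0b0010, so X is inserted at index 0 of the (possibly wider)
// shift type, shifted left by Idx * SubVecNumElts = 1 * 2 = 2 with KSHIFTL,
// and the low v8i1 is extracted - a single shift instead of the two kshifts
// the generic insert_subvector lowering would produce.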
11323
11324 // If there are zero or one non-zeros we can handle this very simply.
11325 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11326 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11327 if (!NonZeros)
11328 return Vec;
11329 unsigned Idx = Log2_64(NonZeros);
11330 SDValue SubVec = Op.getOperand(Idx);
11331 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11332 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11333 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11334 }
11335
11336 if (NumOperands > 2) {
11337 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11338 ArrayRef<SDUse> Ops = Op->ops();
11339 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11340 Ops.slice(0, NumOperands/2));
11341 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11342 Ops.slice(NumOperands/2));
11343 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11344 }
11345
11346   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
11347
11348 if (ResVT.getVectorNumElements() >= 16)
11349 return Op; // The operation is legal with KUNPCK
11350
11351 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11352 DAG.getUNDEF(ResVT), Op.getOperand(0),
11353 DAG.getIntPtrConstant(0, dl));
11354 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11355 DAG.getIntPtrConstant(NumElems/2, dl));
11356}
11357
11358static SDValue LowerCONCAT_VECTORS(SDValue Op,
11359 const X86Subtarget &Subtarget,
11360 SelectionDAG &DAG) {
11361 MVT VT = Op.getSimpleValueType();
11362 if (VT.getVectorElementType() == MVT::i1)
11363 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11364
11365   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11366          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11367           Op.getNumOperands() == 4)));
11368
11369 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11370 // from two other 128-bit ones.
11371
11372   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
11373 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11374}
11375
11376//===----------------------------------------------------------------------===//
11377// Vector shuffle lowering
11378//
11379// This is an experimental code path for lowering vector shuffles on x86. It is
11380// designed to handle arbitrary vector shuffles and blends, gracefully
11381// degrading performance as necessary. It works hard to recognize idiomatic
11382// shuffles and lower them to optimal instruction patterns without leaving
11383// a framework that allows reasonably efficient handling of all vector shuffle
11384// patterns.
11385//===----------------------------------------------------------------------===//
11386
11387/// Tiny helper function to identify a no-op mask.
11388///
11389/// This is a somewhat boring predicate function. It checks whether the mask
11390/// array input, which is assumed to be a single-input shuffle mask of the kind
11391/// used by the X86 shuffle instructions (not a fully general
11392/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11393/// in-place shuffle are 'no-op's.
11394static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11395 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11396     assert(Mask[i] >= -1 && "Out of bound mask element!");
11397 if (Mask[i] >= 0 && Mask[i] != i)
11398 return false;
11399 }
11400 return true;
11401}
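// Illustrative examples: the mask <-1, 1, 2, -1> is a no-op (only undefs and
// in-place elements), while <1, 0, 2, 3> is not, because element 0 is taken
// from index 1.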
11402
11403/// Test whether there are elements crossing LaneSizeInBits lanes in this
11404/// shuffle mask.
11405///
11406/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11407/// and we routinely test for these.
11408static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11409 unsigned ScalarSizeInBits,
11410 ArrayRef<int> Mask) {
11411   assert(LaneSizeInBits && ScalarSizeInBits &&
11412          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11413          "Illegal shuffle lane size");
11414 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11415 int Size = Mask.size();
11416 for (int i = 0; i < Size; ++i)
11417 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11418 return true;
11419 return false;
11420}
11421
11422/// Test whether there are elements crossing 128-bit lanes in this
11423/// shuffle mask.
11424static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11425 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11426}
11427
11428/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11429 /// from multiple lanes - this is different from isLaneCrossingShuffleMask, to
11430/// better support 'repeated mask + lane permute' style shuffles.
11431static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11432 unsigned ScalarSizeInBits,
11433 ArrayRef<int> Mask) {
11434   assert(LaneSizeInBits && ScalarSizeInBits &&
11435          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11436          "Illegal shuffle lane size");
11437 int NumElts = Mask.size();
11438 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11439 int NumLanes = NumElts / NumEltsPerLane;
11440 if (NumLanes > 1) {
11441 for (int i = 0; i != NumLanes; ++i) {
11442 int SrcLane = -1;
11443 for (int j = 0; j != NumEltsPerLane; ++j) {
11444 int M = Mask[(i * NumEltsPerLane) + j];
11445 if (M < 0)
11446 continue;
11447 int Lane = (M % NumElts) / NumEltsPerLane;
11448 if (SrcLane >= 0 && SrcLane != Lane)
11449 return true;
11450 SrcLane = Lane;
11451 }
11452 }
11453 }
11454 return false;
11455}
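// Illustrative contrast with isLaneCrossingShuffleMask (assuming 128-bit
// lanes and 32-bit elements): the mask <4, 5, 6, 7, 0, 1, 2, 3> swaps whole
// lanes, so it is lane-crossing but not multi-lane, since each destination
// lane reads from a single source lane. The mask <0, 4, 1, 5, 2, 6, 3, 7>
// mixes both source lanes inside each destination lane, so
// isMultiLaneShuffleMask returns true for it.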
11456
11457/// Test whether a shuffle mask is equivalent within each sub-lane.
11458///
11459/// This checks a shuffle mask to see if it is performing the same
11460/// lane-relative shuffle in each sub-lane. This trivially implies
11461/// that it is also not lane-crossing. It may however involve a blend from the
11462/// same lane of a second vector.
11463///
11464/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11465/// non-trivial to compute in the face of undef lanes. The representation is
11466/// suitable for use with existing 128-bit shuffles as entries from the second
11467/// vector have been remapped to [LaneSize, 2*LaneSize).
11468static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11469 ArrayRef<int> Mask,
11470 SmallVectorImpl<int> &RepeatedMask) {
11471 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11472 RepeatedMask.assign(LaneSize, -1);
11473 int Size = Mask.size();
11474 for (int i = 0; i < Size; ++i) {
11475     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11476 if (Mask[i] < 0)
11477 continue;
11478 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11479 // This entry crosses lanes, so there is no way to model this shuffle.
11480 return false;
11481
11482 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11483 // Adjust second vector indices to start at LaneSize instead of Size.
11484 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11485 : Mask[i] % LaneSize + LaneSize;
11486 if (RepeatedMask[i % LaneSize] < 0)
11487 // This is the first non-undef entry in this slot of a 128-bit lane.
11488 RepeatedMask[i % LaneSize] = LocalM;
11489 else if (RepeatedMask[i % LaneSize] != LocalM)
11490 // Found a mismatch with the repeated mask.
11491 return false;
11492 }
11493 return true;
11494}
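// Worked example (illustrative): for a v8f32 shuffle with 128-bit lanes
// (LaneSize = 4), the mask <0, 0, 2, 2, 4, 4, 6, 6> repeats as <0, 0, 2, 2>,
// and the two-input mask <0, 8, 1, 9, 4, 12, 5, 13> repeats as <0, 4, 1, 5>
// after the second-vector indices are remapped into [LaneSize, 2*LaneSize).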
11495
11496/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11497static bool
11498is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11499 SmallVectorImpl<int> &RepeatedMask) {
11500 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11501}
11502
11503static bool
11504is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11505 SmallVector<int, 32> RepeatedMask;
11506 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11507}
11508
11509/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11510static bool
11511is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11512 SmallVectorImpl<int> &RepeatedMask) {
11513 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11514}
11515
11516/// Test whether a target shuffle mask is equivalent within each sub-lane.
11517/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11518static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11519 unsigned EltSizeInBits,
11520 ArrayRef<int> Mask,
11521 SmallVectorImpl<int> &RepeatedMask) {
11522 int LaneSize = LaneSizeInBits / EltSizeInBits;
11523 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11524 int Size = Mask.size();
11525 for (int i = 0; i < Size; ++i) {
11526     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11527 if (Mask[i] == SM_SentinelUndef)
11528 continue;
11529 if (Mask[i] == SM_SentinelZero) {
11530 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11531 return false;
11532 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11533 continue;
11534 }
11535 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11536 // This entry crosses lanes, so there is no way to model this shuffle.
11537 return false;
11538
11539 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11540 // later vector indices to start at multiples of LaneSize instead of Size.
11541 int LaneM = Mask[i] / Size;
11542 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11543 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11544 // This is the first non-undef entry in this slot of a 128-bit lane.
11545 RepeatedMask[i % LaneSize] = LocalM;
11546 else if (RepeatedMask[i % LaneSize] != LocalM)
11547 // Found a mismatch with the repeated mask.
11548 return false;
11549 }
11550 return true;
11551}
11552
11553/// Test whether a target shuffle mask is equivalent within each sub-lane.
11554/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11555static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11556 ArrayRef<int> Mask,
11557 SmallVectorImpl<int> &RepeatedMask) {
11558 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11559 Mask, RepeatedMask);
11560}
11561
11562/// Checks whether the vector elements referenced by two shuffle masks are
11563/// equivalent.
11564static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11565 int Idx, int ExpectedIdx) {
11566 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11567 ExpectedIdx < MaskSize && "Out of range element index");
11568 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11569 return false;
11570
11571 switch (Op.getOpcode()) {
11572 case ISD::BUILD_VECTOR:
11573 // If the values are build vectors, we can look through them to find
11574 // equivalent inputs that make the shuffles equivalent.
11575 // TODO: Handle MaskSize != Op.getNumOperands()?
11576 if (MaskSize == (int)Op.getNumOperands() &&
11577 MaskSize == (int)ExpectedOp.getNumOperands())
11578 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11579 break;
11580 case X86ISD::VBROADCAST:
11581 case X86ISD::VBROADCAST_LOAD:
11582 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11583 return (Op == ExpectedOp &&
11584 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11585 case X86ISD::HADD:
11586 case X86ISD::HSUB:
11587 case X86ISD::FHADD:
11588 case X86ISD::FHSUB:
11589 case X86ISD::PACKSS:
11590 case X86ISD::PACKUS:
11591 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11592 // TODO: Handle MaskSize != NumElts?
11593 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11594 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11595 MVT VT = Op.getSimpleValueType();
11596 int NumElts = VT.getVectorNumElements();
11597 if (MaskSize == NumElts) {
11598 int NumLanes = VT.getSizeInBits() / 128;
11599 int NumEltsPerLane = NumElts / NumLanes;
11600 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11601 bool SameLane =
11602 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11603 bool SameElt =
11604 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11605 return SameLane && SameElt;
11606 }
11607 }
11608 break;
11609 }
11610
11611 return false;
11612}
11613
11614/// Checks whether a shuffle mask is equivalent to an explicit list of
11615/// arguments.
11616///
11617/// This is a fast way to test a shuffle mask against a fixed pattern:
11618///
11619/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11620///
11621/// It returns true if the mask is exactly as wide as the argument list, and
11622/// each element of the mask is either -1 (signifying undef) or the value given
11623/// in the argument.
11624static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11625 SDValue V1 = SDValue(),
11626 SDValue V2 = SDValue()) {
11627 int Size = Mask.size();
11628 if (Size != (int)ExpectedMask.size())
11629 return false;
11630
11631 for (int i = 0; i < Size; ++i) {
11632 assert(Mask[i] >= -1 && "Out of bound mask element!");
11633 int MaskIdx = Mask[i];
11634 int ExpectedIdx = ExpectedMask[i];
11635 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11636 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11637 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11638 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11639 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11640 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11641 return false;
11642 }
11643 }
11644 return true;
11645}
11646
11647/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11648///
11649/// The masks must be exactly the same width.
11650///
11651/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11652/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11653///
11654/// SM_SentinelZero is accepted as a valid negative index but must match in
11655/// both.
11656static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11657 ArrayRef<int> ExpectedMask,
11658 SDValue V1 = SDValue(),
11659 SDValue V2 = SDValue()) {
11660 int Size = Mask.size();
11661 if (Size != (int)ExpectedMask.size())
11662 return false;
11663 assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11664 "Illegal target shuffle mask");
11665
11666 // Check for out-of-range target shuffle mask indices.
11667 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11668 return false;
11669
11670 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11671 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11672 V1 = SDValue();
11673 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11674 V2 = SDValue();
11675
11676 for (int i = 0; i < Size; ++i) {
11677 int MaskIdx = Mask[i];
11678 int ExpectedIdx = ExpectedMask[i];
11679 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11680 continue;
11681 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11682 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11683 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11684 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11685 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11686 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11687 continue;
11688 }
11689 // TODO - handle SM_Sentinel equivalences.
11690 return false;
11691 }
11692 return true;
11693}
11694
11695// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
11696static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11697 SDValue Cond, bool IsBLENDV = false) {
11698 EVT CondVT = Cond.getValueType();
11699 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11700 unsigned NumElts = CondVT.getVectorNumElements();
11701
11702 APInt UndefElts;
11703 SmallVector<APInt, 32> EltBits;
11704 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11705 true, false))
11706 return false;
11707
11708 Mask.resize(NumElts, SM_SentinelUndef);
11709
11710 for (int i = 0; i != (int)NumElts; ++i) {
11711 Mask[i] = i;
11712 // Arbitrarily choose from the 2nd operand if the select condition element
11713 // is undef.
11714 // TODO: Can we do better by matching patterns such as even/odd?
11715 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
11716 (IsBLENDV && EltBits[i].isNonNegative()))
11717 Mask[i] += NumElts;
11718 }
11719
11720 return true;
11721}
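// Illustrative sketch (standalone, hypothetical helper name maskFromSelect):
// how a constant VSELECT condition turns into a shuffle mask under the
// convention above. Lanes whose condition is zero (or undef) pick from the
// second operand, so a condition of <true,false,true,false> on a 4-element
// vector becomes the blend mask <0,5,2,7>.
#include <cstdio>
#include <vector>

// CondIsTrue[i] == true models an all-ones condition lane; false models a
// zero (or, arbitrarily, undef) lane.
static std::vector<int> maskFromSelect(const std::vector<bool> &CondIsTrue) {
  int NumElts = (int)CondIsTrue.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = CondIsTrue[i] ? i : i + NumElts;
  return Mask;
}

int main() {
  for (int M : maskFromSelect({true, false, true, false}))
    std::printf("%d ", M); // prints "0 5 2 7"
  std::printf("\n");
}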
11722
11723// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11724// instructions.
11725static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11726 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11727 return false;
11728
11729 SmallVector<int, 8> Unpcklwd;
11730 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11731 /* Unary = */ false);
11732 SmallVector<int, 8> Unpckhwd;
11733 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11734 /* Unary = */ false);
11735 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11736 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11737 return IsUnpackwdMask;
11738}
11739
11740static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11741 // Create 128-bit vector type based on mask size.
11742 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11743 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11744
11745 // We can't assume a canonical shuffle mask, so try the commuted version too.
11746 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11747 ShuffleVectorSDNode::commuteMask(CommutedMask);
11748
11749 // Match any of unary/binary or low/high.
11750 for (unsigned i = 0; i != 4; ++i) {
11751 SmallVector<int, 16> UnpackMask;
11752 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11753 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11754 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11755 return true;
11756 }
11757 return false;
11758}
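// Illustrative sketch (standalone; makeUnpackMask is a hypothetical stand-in
// for createUnpackShuffleMask, restricted to a single 128-bit register): the
// interleave patterns that PUNPCKL/PUNPCKH implement, which the matchers above
// compare against. For v8i16, lo/binary is <0,8,1,9,2,10,3,11> and hi/binary
// is <4,12,5,13,6,14,7,15>.
#include <cstdio>
#include <vector>

static std::vector<int> makeUnpackMask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Half = NumElts / 2;
  int Start = Lo ? 0 : Half;
  for (int i = 0; i != Half; ++i) {
    Mask.push_back(Start + i);                         // element from V1
    Mask.push_back(Start + i + (Unary ? 0 : NumElts)); // from V2 (or V1 if unary)
  }
  return Mask;
}

int main() {
  for (int M : makeUnpackMask(8, /*Lo=*/true, /*Unary=*/false))
    std::printf("%d ", M); // prints "0 8 1 9 2 10 3 11"
  std::printf("\n");
}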
11759
11760/// Return true if a shuffle mask chooses elements identically in its top and
11761/// bottom halves. For example, any splat mask has the same top and bottom
11762/// halves. If an element is undefined in only one half of the mask, the halves
11763/// are not considered identical.
11764static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11765 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11766 unsigned HalfSize = Mask.size() / 2;
11767 for (unsigned i = 0; i != HalfSize; ++i) {
11768 if (Mask[i] != Mask[i + HalfSize])
11769 return false;
11770 }
11771 return true;
11772}
11773
11774/// Get a 4-lane 8-bit shuffle immediate for a mask.
11775///
11776/// This helper function produces an 8-bit shuffle immediate corresponding to
11777/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11778/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11779/// example.
11780///
11781/// NB: We rely heavily on "undef" masks preserving the input lane.
11782static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11783 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11784 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11785 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11786 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11787 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11788
11789 // If the mask only uses one non-undef element, then fully 'splat' it to
11790 // improve later broadcast matching.
11791 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11792 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11793
11794 int FirstElt = Mask[FirstIndex];
11795 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11796 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11797
11798 unsigned Imm = 0;
11799 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11800 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11801 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11802 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11803 return Imm;
11804}
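// Illustrative sketch (standalone re-creation, not the in-tree helper): the
// 2-bits-per-lane immediate encoding above, including the splat special case.
// For {1,3,2,0} the immediate is 0b00101101 (0x2D); for {-1,2,-1,-1} every
// lane is splatted to 2, giving 0b10101010 (0xAA).
#include <cassert>
#include <cstdio>

static unsigned encodeV4ShuffleImm(const int Mask[4]) {
  int FirstElt = -1;
  bool Splat = true;
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] < 0)
      continue;
    if (FirstElt < 0)
      FirstElt = Mask[i];
    else if (Mask[i] != FirstElt)
      Splat = false;
  }
  assert(FirstElt >= 0 && "all-undef mask");
  if (Splat)
    return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)(Mask[i] < 0 ? i : Mask[i]) << (2 * i); // undef keeps lane i
  return Imm;
}

int main() {
  const int A[4] = {1, 3, 2, 0}, B[4] = {-1, 2, -1, -1};
  std::printf("0x%02X 0x%02X\n", encodeV4ShuffleImm(A), encodeV4ShuffleImm(B));
  // prints "0x2D 0xAA"
}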
11805
11806static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11807 SelectionDAG &DAG) {
11808 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11809}
11810
11811// The shuffle result has the following form:
11812// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
11813// Each Zeroable element corresponds to a particular Mask element,
11814// as described in the computeZeroableShuffleElements function.
11815//
11816// The function looks for a sub-mask whose nonzero elements are in
11817// increasing order. If such a sub-mask exists, the function returns true.
11818static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11819 ArrayRef<int> Mask, const EVT &VectorType,
11820 bool &IsZeroSideLeft) {
11821 int NextElement = -1;
11822 // Check if the Mask's nonzero elements are in increasing order.
11823 for (int i = 0, e = Mask.size(); i < e; i++) {
11824 // Checks that the mask's zero elements are built from only zeros.
11825 assert(Mask[i] >= -1 && "Out of bound mask element!");
11826 if (Mask[i] < 0)
11827 return false;
11828 if (Zeroable[i])
11829 continue;
11830 // Find the lowest non zero element
11831 if (NextElement < 0) {
11832 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11833 IsZeroSideLeft = NextElement != 0;
11834 }
11835 // Exit if the mask's non zero elements are not in increasing order.
11836 if (NextElement != Mask[i])
11837 return false;
11838 NextElement++;
11839 }
11840 return true;
11841}
11842
11843/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11844static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11845 ArrayRef<int> Mask, SDValue V1,
11846 SDValue V2, const APInt &Zeroable,
11847 const X86Subtarget &Subtarget,
11848 SelectionDAG &DAG) {
11849 int Size = Mask.size();
11850 int LaneSize = 128 / VT.getScalarSizeInBits();
11851 const int NumBytes = VT.getSizeInBits() / 8;
11852 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11853
11854 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11855 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11856 (Subtarget.hasBWI() && VT.is512BitVector()));
11857
11858 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11859 // Sign bit set in i8 mask means zero element.
11860 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11861
11862 SDValue V;
11863 for (int i = 0; i < NumBytes; ++i) {
11864 int M = Mask[i / NumEltBytes];
11865 if (M < 0) {
11866 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11867 continue;
11868 }
11869 if (Zeroable[i / NumEltBytes]) {
11870 PSHUFBMask[i] = ZeroMask;
11871 continue;
11872 }
11873
11874 // We can only use a single input of V1 or V2.
11875 SDValue SrcV = (M >= Size ? V2 : V1);
11876 if (V && V != SrcV)
11877 return SDValue();
11878 V = SrcV;
11879 M %= Size;
11880
11881 // PSHUFB can't cross lanes, ensure this doesn't happen.
11882 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11883 return SDValue();
11884
11885 M = M % LaneSize;
11886 M = M * NumEltBytes + (i % NumEltBytes);
11887 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11888 }
11889 assert(V && "Failed to find a source input");
11890
11891 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11892 return DAG.getBitcast(
11893 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11894 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11895}
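// Illustrative sketch (standalone; pshufbBytes is a hypothetical name): how
// the per-byte PSHUFB control is derived from an element-level mask for a
// single 128-bit input. Negative mask entries stand in for zeroable elements
// here and get the 0x80 sign-bit byte. For a v8i16 mask <0,0,1,1,2,2,3,3> the
// byte control is 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7.
#include <cstdio>
#include <vector>

static std::vector<unsigned char> pshufbBytes(const std::vector<int> &Mask,
                                              int NumEltBytes) {
  std::vector<unsigned char> Bytes;
  for (int i = 0, e = (int)Mask.size() * NumEltBytes; i != e; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      Bytes.push_back(0x80); // sign bit set -> PSHUFB writes a zero byte
      continue;
    }
    Bytes.push_back((unsigned char)(M * NumEltBytes + i % NumEltBytes));
  }
  return Bytes;
}

int main() {
  for (unsigned char B : pshufbBytes({0, 0, 1, 1, 2, 2, 3, 3}, 2))
    std::printf("%d ", B);
  std::printf("\n");
}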
11896
11897static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11898 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11899 const SDLoc &dl);
11900
11901// X86 has a dedicated shuffle form that can be lowered to VEXPAND.
11902static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11903 const APInt &Zeroable,
11904 ArrayRef<int> Mask, SDValue &V1,
11905 SDValue &V2, SelectionDAG &DAG,
11906 const X86Subtarget &Subtarget) {
11907 bool IsLeftZeroSide = true;
11908 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11909 IsLeftZeroSide))
11910 return SDValue();
11911 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11912 MVT IntegerType =
11913 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11914 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11915 unsigned NumElts = VT.getVectorNumElements();
11916 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11917 "Unexpected number of vector elements");
11918 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11919 Subtarget, DAG, DL);
11920 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11921 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11922 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11923}
11924
11925static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11926 unsigned &UnpackOpcode, bool IsUnary,
11927 ArrayRef<int> TargetMask, const SDLoc &DL,
11928 SelectionDAG &DAG,
11929 const X86Subtarget &Subtarget) {
11930 int NumElts = VT.getVectorNumElements();
11931
11932 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11933 for (int i = 0; i != NumElts; i += 2) {
11934 int M1 = TargetMask[i + 0];
11935 int M2 = TargetMask[i + 1];
11936 Undef1 &= (SM_SentinelUndef == M1);
11937 Undef2 &= (SM_SentinelUndef == M2);
11938 Zero1 &= isUndefOrZero(M1);
11939 Zero2 &= isUndefOrZero(M2);
11940 }
11941 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11942 "Zeroable shuffle detected");
11943
11944 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11945 SmallVector<int, 64> Unpckl, Unpckh;
11946 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11947 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11948 (IsUnary ? V1 : V2))) {
11949 UnpackOpcode = X86ISD::UNPCKL;
11950 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11951 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11952 return true;
11953 }
11954
11955 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11956 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11957 (IsUnary ? V1 : V2))) {
11958 UnpackOpcode = X86ISD::UNPCKH;
11959 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11960 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11961 return true;
11962 }
11963
11964 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
11965 if (IsUnary && (Zero1 || Zero2)) {
11966 // Don't bother if we can blend instead.
11967 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11968 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11969 return false;
11970
11971 bool MatchLo = true, MatchHi = true;
11972 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11973 int M = TargetMask[i];
11974
11975 // Ignore if the input is known to be zero or the index is undef.
11976 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11977 (M == SM_SentinelUndef))
11978 continue;
11979
11980 MatchLo &= (M == Unpckl[i]);
11981 MatchHi &= (M == Unpckh[i]);
11982 }
11983
11984 if (MatchLo || MatchHi) {
11985 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11986 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11987 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11988 return true;
11989 }
11990 }
11991
11992 // If a binary shuffle, commute and try again.
11993 if (!IsUnary) {
11994 ShuffleVectorSDNode::commuteMask(Unpckl);
11995 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11996 UnpackOpcode = X86ISD::UNPCKL;
11997 std::swap(V1, V2);
11998 return true;
11999 }
12000
12001 ShuffleVectorSDNode::commuteMask(Unpckh);
12002 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
12003 UnpackOpcode = X86ISD::UNPCKH;
12004 std::swap(V1, V2);
12005 return true;
12006 }
12007 }
12008
12009 return false;
12010}
12011
12012// X86 has dedicated unpack instructions that can handle specific blend
12013// operations: UNPCKH and UNPCKL.
12014static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12015 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12016 SelectionDAG &DAG) {
12017 SmallVector<int, 8> Unpckl;
12018 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12019 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12020 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12021
12022 SmallVector<int, 8> Unpckh;
12023 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12024 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12025 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12026
12027 // Commute and try again.
12028 ShuffleVectorSDNode::commuteMask(Unpckl);
12029 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12030 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12031
12032 ShuffleVectorSDNode::commuteMask(Unpckh);
12033 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12034 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12035
12036 return SDValue();
12037}
12038
12039/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12040/// followed by unpack 256-bit.
12041static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12042 ArrayRef<int> Mask, SDValue V1,
12043 SDValue V2, SelectionDAG &DAG) {
12044 SmallVector<int, 32> Unpckl, Unpckh;
12045 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12046 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12047
12048 unsigned UnpackOpcode;
12049 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12050 UnpackOpcode = X86ISD::UNPCKL;
12051 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12052 UnpackOpcode = X86ISD::UNPCKH;
12053 else
12054 return SDValue();
12055
12056 // This is a "natural" unpack operation (rather than the 128-bit sectored
12057 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12058 // input in order to use the x86 instruction.
12059 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12060 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12061 V1 = DAG.getBitcast(VT, V1);
12062 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12063}
12064
12065// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12066// source into the lower elements and zeroing the upper elements.
12067static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12068 ArrayRef<int> Mask, const APInt &Zeroable,
12069 const X86Subtarget &Subtarget) {
12070 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12071 return false;
12072
12073 unsigned NumElts = Mask.size();
12074 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12075 unsigned MaxScale = 64 / EltSizeInBits;
12076
12077 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12078 unsigned SrcEltBits = EltSizeInBits * Scale;
12079 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12080 continue;
12081 unsigned NumSrcElts = NumElts / Scale;
12082 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12083 continue;
12084 unsigned UpperElts = NumElts - NumSrcElts;
12085 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12086 continue;
12087 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12088 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12089 DstVT = MVT::getIntegerVT(EltSizeInBits);
12090 if ((NumSrcElts * EltSizeInBits) >= 128) {
12091 // ISD::TRUNCATE
12092 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12093 } else {
12094 // X86ISD::VTRUNC
12095 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12096 }
12097 return true;
12098 }
12099
12100 return false;
12101}
12102
12103// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12104// element padding to the final DstVT.
12105static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12106 const X86Subtarget &Subtarget,
12107 SelectionDAG &DAG, bool ZeroUppers) {
12108 MVT SrcVT = Src.getSimpleValueType();
12109 MVT DstSVT = DstVT.getScalarType();
12110 unsigned NumDstElts = DstVT.getVectorNumElements();
12111 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12112 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12113
12114 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12115 return SDValue();
12116
12117 // Perform a direct ISD::TRUNCATE if possible.
12118 if (NumSrcElts == NumDstElts)
12119 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12120
12121 if (NumSrcElts > NumDstElts) {
12122 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12123 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12124 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12125 }
12126
12127 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12128 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12129 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12130 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12131 DstVT.getSizeInBits());
12132 }
12133
12134 // Non-VLX targets must truncate from a 512-bit type, so we need to
12135 // widen, truncate and then possibly extract the original subvector.
12136 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12137 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12138 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12139 }
12140
12141 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12142 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12143 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12144 if (DstVT != TruncVT)
12145 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12146 DstVT.getSizeInBits());
12147 return Trunc;
12148}
12149
12150// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12151//
12152// An example is the following:
12153//
12154// t0: ch = EntryToken
12155// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12156// t25: v4i32 = truncate t2
12157// t41: v8i16 = bitcast t25
12158// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12159// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12160// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12161// t18: v2i64 = bitcast t51
12162//
12163// One can just use a single vpmovdw instruction; without avx512vl we need to
12164// use the zmm variant and extract the lower subvector, padding with zeroes.
12165// TODO: Merge with lowerShuffleAsVTRUNC.
12166static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12167 SDValue V2, ArrayRef<int> Mask,
12168 const APInt &Zeroable,
12169 const X86Subtarget &Subtarget,
12170 SelectionDAG &DAG) {
12171 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12172 if (!Subtarget.hasAVX512())
12173 return SDValue();
12174
12175 unsigned NumElts = VT.getVectorNumElements();
12176 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12177 unsigned MaxScale = 64 / EltSizeInBits;
12178 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12179 unsigned NumSrcElts = NumElts / Scale;
12180 unsigned UpperElts = NumElts - NumSrcElts;
12181 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12182 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12183 continue;
12184
12185 SDValue Src = V1;
12186 if (!Src.hasOneUse())
12187 return SDValue();
12188
12189 Src = peekThroughOneUseBitcasts(Src);
12190 if (Src.getOpcode() != ISD::TRUNCATE ||
12191 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
12192 return SDValue();
12193 Src = Src.getOperand(0);
12194
12195 // VPMOVWB is only available with avx512bw.
12196 MVT SrcVT = Src.getSimpleValueType();
12197 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
12198 !Subtarget.hasBWI())
12199 return SDValue();
12200
12201 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12202 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12203 }
12204
12205 return SDValue();
12206}
12207
12208// Attempt to match binary shuffle patterns as a truncate.
12209static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12210 SDValue V2, ArrayRef<int> Mask,
12211 const APInt &Zeroable,
12212 const X86Subtarget &Subtarget,
12213 SelectionDAG &DAG) {
12214 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12215 "Unexpected VTRUNC type");
12216 if (!Subtarget.hasAVX512())
12217 return SDValue();
12218
12219 unsigned NumElts = VT.getVectorNumElements();
12220 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12221 unsigned MaxScale = 64 / EltSizeInBits;
12222 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12223 // TODO: Support non-BWI VPMOVWB truncations?
12224 unsigned SrcEltBits = EltSizeInBits * Scale;
12225 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12226 continue;
12227
12228 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
12229 // Bail if the V2 elements are undef.
12230 unsigned NumHalfSrcElts = NumElts / Scale;
12231 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12232 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12233 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12234 continue;
12235
12236 // The elements beyond the truncation must be undef/zero.
12237 unsigned UpperElts = NumElts - NumSrcElts;
12238 if (UpperElts > 0 &&
12239 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12240 continue;
12241 bool UndefUppers =
12242 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12243
12244 // As we're using both sources, we need to concat them together
12245 // and truncate from the double-sized src.
12246 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12247 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12248
12249 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12250 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12251 Src = DAG.getBitcast(SrcVT, Src);
12252 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12253 }
12254
12255 return SDValue();
12256}
12257
12258/// Check whether a compaction lowering can be done by dropping even/odd
12259/// elements and compute how many times even/odd elements must be dropped.
12260///
12261/// This handles shuffles which take every Nth element where N is a power of
12262/// two. Example shuffle masks:
12263///
12264/// (even)
12265/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12266/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12267/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12268/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12269/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12270/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12271///
12272/// (odd)
12273/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12274/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12275///
12276/// Any of these lanes can of course be undef.
12277///
12278/// This routine only supports N <= 3.
12279/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12280/// for larger N.
12281///
12282/// \returns N above, or the number of times even/odd elements must be dropped
12283/// if there is such a number. Otherwise returns zero.
12284static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12285 bool IsSingleInput) {
12286 // The modulus for the shuffle vector entries is based on whether this is
12287 // a single input or not.
12288 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12289 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12290 "We should only be called with masks with a power-of-2 size!");
12291
12292 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12293 int Offset = MatchEven ? 0 : 1;
12294
12295 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12296 // and 2^3 simultaneously. This is because we may have ambiguity with
12297 // partially undef inputs.
12298 bool ViableForN[3] = {true, true, true};
12299
12300 for (int i = 0, e = Mask.size(); i < e; ++i) {
12301 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12302 // want.
12303 if (Mask[i] < 0)
12304 continue;
12305
12306 bool IsAnyViable = false;
12307 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12308 if (ViableForN[j]) {
12309 uint64_t N = j + 1;
12310
12311 // The shuffle mask must be equal to (i * 2^N) % M.
12312 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12313 IsAnyViable = true;
12314 else
12315 ViableForN[j] = false;
12316 }
12317 // Early exit if we exhaust the possible powers of two.
12318 if (!IsAnyViable)
12319 break;
12320 }
12321
12322 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12323 if (ViableForN[j])
12324 return j + 1;
12325
12326 // Return 0 as there is no viable power of two.
12327 return 0;
12328}
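// Illustrative sketch (standalone; droppingFactor is a hypothetical name): the
// "(i << N) & ModMask" viability test described above, checked against the
// documented N = 1 even mask for a single input.
#include <cstdint>
#include <cstdio>
#include <vector>

static int droppingFactor(const std::vector<int> &Mask, bool MatchEven,
                          bool IsSingleInput) {
  uint64_t ModMask = Mask.size() * (IsSingleInput ? 1 : 2) - 1;
  int Offset = MatchEven ? 0 : 1;
  bool Viable[3] = {true, true, true}; // N = 1, 2, 3
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes are optimistically ignored
    for (unsigned j = 0; j != 3; ++j)
      if ((uint64_t)(Mask[i] - Offset) != (((uint64_t)i << (j + 1)) & ModMask))
        Viable[j] = false;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0;
}

int main() {
  std::vector<int> M = {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
  std::printf("%d\n", droppingFactor(M, /*MatchEven=*/true,
                                     /*IsSingleInput=*/true)); // prints "1"
}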
12329
12330// X86 has dedicated pack instructions that can handle specific truncation
12331// operations: PACKSS and PACKUS.
12332// Checks for compaction shuffle masks if MaxStages > 1.
12333// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12334static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12335 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12336 const SelectionDAG &DAG,
12337 const X86Subtarget &Subtarget,
12338 unsigned MaxStages = 1) {
12339 unsigned NumElts = VT.getVectorNumElements();
12340 unsigned BitSize = VT.getScalarSizeInBits();
12341 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12342 "Illegal maximum compaction");
12343
12344 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12345 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12346 unsigned NumPackedBits = NumSrcBits - BitSize;
12347 N1 = peekThroughBitcasts(N1);
12348 N2 = peekThroughBitcasts(N2);
12349 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12350 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12351 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12352 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12353 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12354 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12355 return false;
12356 if (Subtarget.hasSSE41() || BitSize == 8) {
12357 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12358 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12359 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12360 V1 = N1;
12361 V2 = N2;
12362 SrcVT = PackVT;
12363 PackOpcode = X86ISD::PACKUS;
12364 return true;
12365 }
12366 }
12367 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12368 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12369 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12370 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12371 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12372 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12373 V1 = N1;
12374 V2 = N2;
12375 SrcVT = PackVT;
12376 PackOpcode = X86ISD::PACKSS;
12377 return true;
12378 }
12379 return false;
12380 };
12381
12382 // Attempt to match against wider and wider compaction patterns.
12383 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12384 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12385 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12386
12387 // Try binary shuffle.
12388 SmallVector<int, 32> BinaryMask;
12389 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12390 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
12391 if (MatchPACK(V1, V2, PackVT))
12392 return true;
12393
12394 // Try unary shuffle.
12395 SmallVector<int, 32> UnaryMask;
12396 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12397 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
12398 if (MatchPACK(V1, V1, PackVT))
12399 return true;
12400 }
12401
12402 return false;
12403}
12404
12405static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12406 SDValue V1, SDValue V2, SelectionDAG &DAG,
12407 const X86Subtarget &Subtarget) {
12408 MVT PackVT;
12409 unsigned PackOpcode;
12410 unsigned SizeBits = VT.getSizeInBits();
12411 unsigned EltBits = VT.getScalarSizeInBits();
12412 unsigned MaxStages = Log2_32(64 / EltBits);
12413 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12414 Subtarget, MaxStages))
12415 return SDValue();
12416
12417 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12418 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12419
12420 // Don't lower multi-stage packs on AVX512, truncation is better.
12421 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12422 return SDValue();
12423
12424 // Pack to the largest type possible:
12425 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12426 unsigned MaxPackBits = 16;
12427 if (CurrentEltBits > 16 &&
12428 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12429 MaxPackBits = 32;
12430
12431 // Repeatedly pack down to the target size.
12432 SDValue Res;
12433 for (unsigned i = 0; i != NumStages; ++i) {
12434 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12435 unsigned NumSrcElts = SizeBits / SrcEltBits;
12436 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12437 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12438 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12439 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12440 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12441 DAG.getBitcast(SrcVT, V2));
12442 V1 = V2 = Res;
12443 CurrentEltBits /= 2;
12444 }
12445 assert(Res && Res.getValueType() == VT &&
12446 "Failed to lower compaction shuffle");
12447 return Res;
12448}
12449
12450/// Try to emit a bitmask instruction for a shuffle.
12451///
12452/// This handles cases where we can model a blend exactly as a bitmask due to
12453/// one of the inputs being zeroable.
12454static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12455 SDValue V2, ArrayRef<int> Mask,
12456 const APInt &Zeroable,
12457 const X86Subtarget &Subtarget,
12458 SelectionDAG &DAG) {
12459 MVT MaskVT = VT;
12460 MVT EltVT = VT.getVectorElementType();
12461 SDValue Zero, AllOnes;
12462 // Use f64 if i64 isn't legal.
12463 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12464 EltVT = MVT::f64;
12465 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12466 }
12467
12468 MVT LogicVT = VT;
12469 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12470 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12471 APFloat AllOnesValue =
12472 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12473 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12474 LogicVT =
12475 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12476 } else {
12477 Zero = DAG.getConstant(0, DL, EltVT);
12478 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12479 }
12480
12481 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12482 SDValue V;
12483 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12484 if (Zeroable[i])
12485 continue;
12486 if (Mask[i] % Size != i)
12487 return SDValue(); // Not a blend.
12488 if (!V)
12489 V = Mask[i] < Size ? V1 : V2;
12490 else if (V != (Mask[i] < Size ? V1 : V2))
12491 return SDValue(); // Can only let one input through the mask.
12492
12493 VMaskOps[i] = AllOnes;
12494 }
12495 if (!V)
12496 return SDValue(); // No non-zeroable elements!
12497
12498 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12499 VMask = DAG.getBitcast(LogicVT, VMask);
12500 V = DAG.getBitcast(LogicVT, V);
12501 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12502 return DAG.getBitcast(VT, And);
12503}
12504
12505/// Try to emit a blend instruction for a shuffle using bit math.
12506///
12507/// This is used as a fallback approach when first class blend instructions are
12508/// unavailable. Currently it is only suitable for integer vectors, but could
12509/// be generalized for floating point vectors if desirable.
12510static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12511 SDValue V2, ArrayRef<int> Mask,
12512 SelectionDAG &DAG) {
12513 assert(VT.isInteger() && "Only supports integer vector types!");
12514 MVT EltVT = VT.getVectorElementType();
12515 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12516 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12517 SmallVector<SDValue, 16> MaskOps;
12518 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12519 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12520 return SDValue(); // Shuffled input!
12521 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12522 }
12523
12524 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12525 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12526 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12527 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12528}
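// Illustrative sketch (standalone): the bit-math blend above reduced to a
// scalar, showing the (V1 & M) | (~M & V2) identity that the AND/ANDNP/OR
// sequence implements per element. With M = all-ones the lane comes from V1,
// with M = 0 it comes from V2.
#include <cstdint>
#include <cstdio>

static uint32_t bitBlendLane(uint32_t V1, uint32_t V2, uint32_t M) {
  return (V1 & M) | (~M & V2); // X86ISD::ANDNP computes ~M & V2
}

int main() {
  std::printf("%u %u\n", bitBlendLane(7, 9, 0xFFFFFFFFu), // 7 (from V1)
              bitBlendLane(7, 9, 0));                     // 9 (from V2)
}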
12529
12530static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12531 SDValue PreservedSrc,
12532 const X86Subtarget &Subtarget,
12533 SelectionDAG &DAG);
12534
12535static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12536 MutableArrayRef<int> Mask,
12537 const APInt &Zeroable, bool &ForceV1Zero,
12538 bool &ForceV2Zero, uint64_t &BlendMask) {
12539 bool V1IsZeroOrUndef =
12540 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12541 bool V2IsZeroOrUndef =
12542 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12543
12544 BlendMask = 0;
12545 ForceV1Zero = false, ForceV2Zero = false;
12546 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12547
12548 // Attempt to generate the binary blend mask. If an input is zero then
12549 // we can use any lane.
12550 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12551 int M = Mask[i];
12552 if (M == SM_SentinelUndef)
12553 continue;
12554 if (M == i ||
12555 (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12556 Mask[i] = i;
12557 continue;
12558 }
12559 if (M == (i + Size) ||
12560 (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12561 BlendMask |= 1ull << i;
12562 Mask[i] = i + Size;
12563 continue;
12564 }
12565 if (Zeroable[i]) {
12566 if (V1IsZeroOrUndef) {
12567 ForceV1Zero = true;
12568 Mask[i] = i;
12569 continue;
12570 }
12571 if (V2IsZeroOrUndef) {
12572 ForceV2Zero = true;
12573 BlendMask |= 1ull << i;
12574 Mask[i] = i + Size;
12575 continue;
12576 }
12577 }
12578 return false;
12579 }
12580 return true;
12581}
12582
12583static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12584 int Scale) {
12585 uint64_t ScaledMask = 0;
12586 for (int i = 0; i != Size; ++i)
12587 if (BlendMask & (1ull << i))
12588 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12589 return ScaledMask;
12590}
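// Illustrative sketch (standalone re-creation): scaling a blend mask when a
// wide-element blend is re-expressed on narrower elements. Each selected bit
// expands into Scale adjacent bits, so 0b0101 over 4 elements with Scale = 2
// becomes 0b00110011 (0x33).
#include <cstdint>
#include <cstdio>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  std::printf("0x%llX\n",
              (unsigned long long)scaleBlendMask(0b0101, 4, 2)); // prints "0x33"
}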
12591
12592/// Try to emit a blend instruction for a shuffle.
12593///
12594/// This doesn't do any checks for the availability of instructions for blending
12595/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12596/// be matched in the backend with the type given. What it does check for is
12597/// that the shuffle mask is a blend, or convertible into a blend with zero.
12598static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12599 SDValue V2, ArrayRef<int> Original,
12600 const APInt &Zeroable,
12601 const X86Subtarget &Subtarget,
12602 SelectionDAG &DAG) {
12603 uint64_t BlendMask = 0;
12604 bool ForceV1Zero = false, ForceV2Zero = false;
12605 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12606 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12607 BlendMask))
12608 return SDValue();
12609
12610 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12611 if (ForceV1Zero)
12612 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12613 if (ForceV2Zero)
12614 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12615
12616 unsigned NumElts = VT.getVectorNumElements();
12617
12618 switch (VT.SimpleTy) {
12619 case MVT::v4i64:
12620 case MVT::v8i32:
12621 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12622 LLVM_FALLTHROUGH;
12623 case MVT::v4f64:
12624 case MVT::v8f32:
12625 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12626 LLVM_FALLTHROUGH;
12627 case MVT::v2f64:
12628 case MVT::v2i64:
12629 case MVT::v4f32:
12630 case MVT::v4i32:
12631 case MVT::v8i16:
12632 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12633 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12634 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12635 case MVT::v16i16: {
12636 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12637 SmallVector<int, 8> RepeatedMask;
12638 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12639 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12640 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12641 BlendMask = 0;
12642 for (int i = 0; i < 8; ++i)
12643 if (RepeatedMask[i] >= 8)
12644 BlendMask |= 1ull << i;
12645 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12646 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12647 }
12648 // Use PBLENDW for lower/upper lanes and then blend lanes.
12649 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12650 // merge to VSELECT where useful.
12651 uint64_t LoMask = BlendMask & 0xFF;
12652 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12653 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12654 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12655 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12656 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12657 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12658 return DAG.getVectorShuffle(
12659 MVT::v16i16, DL, Lo, Hi,
12660 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12661 }
12662 LLVM_FALLTHROUGH;
12663 }
12664 case MVT::v32i8:
12665 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12666 LLVM_FALLTHROUGH;
12667 case MVT::v16i8: {
12668 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12669
12670 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12671 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12672 Subtarget, DAG))
12673 return Masked;
12674
12675 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12676 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12677 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12678 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12679 }
12680
12681 // If we have VPTERNLOG, we can use that as a bit blend.
12682 if (Subtarget.hasVLX())
12683 if (SDValue BitBlend =
12684 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12685 return BitBlend;
12686
12687 // Scale the blend by the number of bytes per element.
12688 int Scale = VT.getScalarSizeInBits() / 8;
12689
12690 // This form of blend is always done on bytes. Compute the byte vector
12691 // type.
12692 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12693
12694 // x86 allows load folding with blendvb from the 2nd source operand. But
12695 // we are still using LLVM select here (see comment below), so that's V1.
12696 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12697 // allow that load-folding possibility.
12698 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12699 ShuffleVectorSDNode::commuteMask(Mask);
12700 std::swap(V1, V2);
12701 }
12702
12703 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12704 // mix of LLVM's code generator and the x86 backend. We tell the code
12705 // generator that boolean values in the elements of an x86 vector register
12706 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12707 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12708 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12709 // of the element (the remaining are ignored) and 0 in that high bit would
12710 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12711 // the LLVM model for boolean values in vector elements gets the relevant
12712 // bit set, it is set backwards and over constrained relative to x86's
12713 // actual model.
12714 SmallVector<SDValue, 32> VSELECTMask;
12715 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12716 for (int j = 0; j < Scale; ++j)
12717 VSELECTMask.push_back(
12718 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12719 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12720 MVT::i8));
12721
12722 V1 = DAG.getBitcast(BlendVT, V1);
12723 V2 = DAG.getBitcast(BlendVT, V2);
12724 return DAG.getBitcast(
12725 VT,
12726 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12727 V1, V2));
12728 }
12729 case MVT::v16f32:
12730 case MVT::v8f64:
12731 case MVT::v8i64:
12732 case MVT::v16i32:
12733 case MVT::v32i16:
12734 case MVT::v64i8: {
12735 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12736 bool OptForSize = DAG.shouldOptForSize();
12737 if (!OptForSize) {
12738 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12739 Subtarget, DAG))
12740 return Masked;
12741 }
12742
12743 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12744 // masked move.
12745 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12746 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12747 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12748 }
12749 default:
12750 llvm_unreachable("Not a supported integer vector type!");
12751 }
12752}
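
A minimal standalone sketch (plain C++, not part of the analyzed source) of how the v16i16 PBLENDW immediate at lines 12641-12644 is derived from a repeated 8-element mask; the mask values below are hypothetical:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical repeated 8-element mask: positions 1, 4 and 7 read from V2 (index >= 8).
  int RepeatedMask[8] = {0, 9, 2, 3, 12, 5, 6, 15};
  uint64_t BlendMask = 0;
  for (int i = 0; i < 8; ++i)
    if (RepeatedMask[i] >= 8)
      BlendMask |= 1ull << i;
  // Bits 1, 4 and 7 are set, so the imm8 is 0x92; PBLENDW applies it to both 128-bit lanes.
  std::printf("PBLENDW imm8 = 0x%02llx\n", (unsigned long long)BlendMask);
  return 0;
}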
12753
12754/// Try to lower as a blend of elements from two inputs followed by
12755/// a single-input permutation.
12756///
12757/// This matches the pattern where we can blend elements from two inputs and
12758/// then reduce the shuffle to a single-input permutation.
12759static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12760 SDValue V1, SDValue V2,
12761 ArrayRef<int> Mask,
12762 SelectionDAG &DAG,
12763 bool ImmBlends = false) {
12764 // We build up the blend mask while checking whether a blend is a viable way
12765 // to reduce the shuffle.
12766 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12767 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12768
12769 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12770 if (Mask[i] < 0)
12771 continue;
12772
12773 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12774
12775 if (BlendMask[Mask[i] % Size] < 0)
12776 BlendMask[Mask[i] % Size] = Mask[i];
12777 else if (BlendMask[Mask[i] % Size] != Mask[i])
12778 return SDValue(); // Can't blend in the needed input!
12779
12780 PermuteMask[i] = Mask[i] % Size;
12781 }
12782
12783 // If only immediate blends, then bail if the blend mask can't be widened to
12784 // i16.
12785 unsigned EltSize = VT.getScalarSizeInBits();
12786 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12787 return SDValue();
12788
12789 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12790 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12791}
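
A standalone sketch (hypothetical 4-element mask, not LLVM code) of the blend+permute decomposition above: every demanded element is first blended into its Mask[i] % Size slot, then a single-input permute puts it in place:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {6, 1, 4, 3};   // 4-element shuffle over two inputs V1/V2
  int Size = (int)Mask.size();
  std::vector<int> BlendMask(Size, -1), PermuteMask(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;            // the element's "home" position after the blend
    if (BlendMask[Slot] < 0)
      BlendMask[Slot] = Mask[i];
    else if (BlendMask[Slot] != Mask[i]) {
      std::puts("can't blend in the needed input");
      return 1;
    }
    PermuteMask[i] = Slot;
  }
  for (int M : BlendMask) std::printf("%d ", M);   // 4 1 6 3
  std::puts("<- blend mask");
  for (int M : PermuteMask) std::printf("%d ", M); // 2 1 0 3
  std::puts("<- permute mask");
  return 0;
}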
12792
12793/// Try to lower as an unpack of elements from two inputs followed by
12794/// a single-input permutation.
12795///
12796/// This matches the pattern where we can unpack elements from two inputs and
12797/// then reduce the shuffle to a single-input (wider) permutation.
12798static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12799 SDValue V1, SDValue V2,
12800 ArrayRef<int> Mask,
12801 SelectionDAG &DAG) {
12802 int NumElts = Mask.size();
12803 int NumLanes = VT.getSizeInBits() / 128;
12804 int NumLaneElts = NumElts / NumLanes;
12805 int NumHalfLaneElts = NumLaneElts / 2;
12806
12807 bool MatchLo = true, MatchHi = true;
12808 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12809
12810 // Determine UNPCKL/UNPCKH type and operand order.
12811 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12812 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12813 int M = Mask[Lane + Elt];
12814 if (M < 0)
12815 continue;
12816
12817 SDValue &Op = Ops[Elt & 1];
12818 if (M < NumElts && (Op.isUndef() || Op == V1))
12819 Op = V1;
12820 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12821 Op = V2;
12822 else
12823 return SDValue();
12824
12825 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12826 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12827 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12828 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12829 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12830 if (!MatchLo && !MatchHi)
12831 return SDValue();
12832 }
12833 }
12834 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12835
12836 // Now check that each pair of elts comes from the same unpack pair
12837 // and set the permute mask based on each pair.
12838 // TODO - Investigate cases where we permute individual elements.
12839 SmallVector<int, 32> PermuteMask(NumElts, -1);
12840 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12841 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12842 int M0 = Mask[Lane + Elt + 0];
12843 int M1 = Mask[Lane + Elt + 1];
12844 if (0 <= M0 && 0 <= M1 &&
12845 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12846 return SDValue();
12847 if (0 <= M0)
12848 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12849 if (0 <= M1)
12850 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12851 }
12852 }
12853
12854 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12855 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12856 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12857}
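
A simplified standalone sketch (not LLVM code, matching checks omitted) of the permute mask built above, for a hypothetical single-lane v8i16 mask {1, 9, 0, 8, 2, 10, 3, 11}: UNPCKL interleaves the low halves of V1 and V2, and the permute then reorders the interleaved pairs:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {1, 9, 0, 8, 2, 10, 3, 11};
  int NumElts = 8, NumHalfLaneElts = 4;
  std::vector<int> PermuteMask(NumElts, -1);
  for (int Elt = 0; Elt != NumElts; Elt += 2) {
    int M0 = Mask[Elt + 0] % NumHalfLaneElts;  // position of the pair within the unpacked half
    int M1 = Mask[Elt + 1] % NumHalfLaneElts;
    PermuteMask[Elt + 0] = 2 * M0;             // even unpack slots hold V1 elements
    PermuteMask[Elt + 1] = 2 * M1 + 1;         // odd unpack slots hold V2 elements
  }
  for (int M : PermuteMask) std::printf("%d ", M); // 2 3 0 1 4 5 6 7
  std::puts("");
  return 0;
}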
12858
12859/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12860/// permuting the elements of the result in place.
12861static SDValue lowerShuffleAsByteRotateAndPermute(
12862 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12863 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12864 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12865 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12866 (VT.is512BitVector() && !Subtarget.hasBWI()))
12867 return SDValue();
12868
12869 // We don't currently support lane crossing permutes.
12870 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12871 return SDValue();
12872
12873 int Scale = VT.getScalarSizeInBits() / 8;
12874 int NumLanes = VT.getSizeInBits() / 128;
12875 int NumElts = VT.getVectorNumElements();
12876 int NumEltsPerLane = NumElts / NumLanes;
12877
12878 // Determine range of mask elts.
12879 bool Blend1 = true;
12880 bool Blend2 = true;
12881 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12882 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12883 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12884 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12885 int M = Mask[Lane + Elt];
12886 if (M < 0)
12887 continue;
12888 if (M < NumElts) {
12889 Blend1 &= (M == (Lane + Elt));
12890 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12891 M = M % NumEltsPerLane;
12892 Range1.first = std::min(Range1.first, M);
12893 Range1.second = std::max(Range1.second, M);
12894 } else {
12895 M -= NumElts;
12896 Blend2 &= (M == (Lane + Elt));
12897 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12898 M = M % NumEltsPerLane;
12899 Range2.first = std::min(Range2.first, M);
12900 Range2.second = std::max(Range2.second, M);
12901 }
12902 }
12903 }
12904
12905 // Bail if we don't need both elements.
12906 // TODO - it might be worth doing this for unary shuffles if the permute
12907 // can be widened.
12908 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12909 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12910 return SDValue();
12911
12912 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12913 return SDValue();
12914
12915 // Rotate the 2 ops so we can access both ranges, then permute the result.
12916 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12917 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12918 SDValue Rotate = DAG.getBitcast(
12919 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12920 DAG.getBitcast(ByteVT, Lo),
12921 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12922 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12923 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12924 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12925 int M = Mask[Lane + Elt];
12926 if (M < 0)
12927 continue;
12928 if (M < NumElts)
12929 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12930 else
12931 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12932 }
12933 }
12934 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12935 };
12936
12937 // Check if the ranges are small enough to rotate from either direction.
12938 if (Range2.second < Range1.first)
12939 return RotateAndPermute(V1, V2, Range1.first, 0);
12940 if (Range1.second < Range2.first)
12941 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12942 return SDValue();
12943}
12944
12945static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
12946 return isUndefOrEqual(Mask, 0);
12947}
12948
12949static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
12950 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
12951}
12952
12953/// Generic routine to decompose a shuffle and blend into independent
12954/// blends and permutes.
12955///
12956/// This matches the extremely common pattern for handling combined
12957/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12958/// operations. It will try to pick the best arrangement of shuffles and
12959/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12960static SDValue lowerShuffleAsDecomposedShuffleMerge(
12961 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12962 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12963 int NumElts = Mask.size();
12964 int NumLanes = VT.getSizeInBits() / 128;
12965 int NumEltsPerLane = NumElts / NumLanes;
12966
12967 // Shuffle the input elements into the desired positions in V1 and V2 and
12968 // unpack/blend them together.
12969 bool IsAlternating = true;
12970 SmallVector<int, 32> V1Mask(NumElts, -1);
12971 SmallVector<int, 32> V2Mask(NumElts, -1);
12972 SmallVector<int, 32> FinalMask(NumElts, -1);
12973 for (int i = 0; i < NumElts; ++i) {
12974 int M = Mask[i];
12975 if (M >= 0 && M < NumElts) {
12976 V1Mask[i] = M;
12977 FinalMask[i] = i;
12978 IsAlternating &= (i & 1) == 0;
12979 } else if (M >= NumElts) {
12980 V2Mask[i] = M - NumElts;
12981 FinalMask[i] = i + NumElts;
12982 IsAlternating &= (i & 1) == 1;
12983 }
12984 }
12985
12986 // If we effectively only demand the 0'th element of \p Input (though not
12987 // necessarily only in the 0'th position), then broadcast said input
12988 // and change \p InputMask to be a no-op (identity) mask.
12989 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
12990 &DAG](SDValue &Input,
12991 MutableArrayRef<int> InputMask) {
12992 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
12993 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
12994 !X86::mayFoldLoad(Input, Subtarget)))
12995 return;
12996 if (isNoopShuffleMask(InputMask))
12997 return;
12998 assert(isBroadcastShuffleMask(InputMask) &&
12999 "Expected to demand only the 0'th element.");
13000 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13001 for (auto I : enumerate(InputMask)) {
13002 int &InputMaskElt = I.value();
13003 if (InputMaskElt >= 0)
13004 InputMaskElt = I.index();
13005 }
13006 };
13007
13008 // Currently, we may need to produce one shuffle per input, and blend results.
13009 // It is possible that the shuffle for one of the inputs is already a no-op.
13010 // See if we can simplify non-no-op shuffles into broadcasts,
13011 // which we consider to be strictly better than an arbitrary shuffle.
13012 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13013 isNoopOrBroadcastShuffleMask(V2Mask)) {
13014 canonicalizeBroadcastableInput(V1, V1Mask);
13015 canonicalizeBroadcastableInput(V2, V2Mask);
13016 }
13017
13018 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13019 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13020 // the shuffle may be able to fold with a load or other benefit. However, when
13021 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13022 // pre-shuffle first is a better strategy.
13023 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13024 // Only prefer immediate blends to unpack/rotate.
13025 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13026 DAG, true))
13027 return BlendPerm;
13028 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
13029 DAG))
13030 return UnpackPerm;
13031 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13032 DL, VT, V1, V2, Mask, Subtarget, DAG))
13033 return RotatePerm;
13034 // Unpack/rotate failed - try again with variable blends.
13035 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13036 DAG))
13037 return BlendPerm;
13038 }
13039
13040 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13041 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13042 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13043 // than half the elements coming from each source.
13044 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13045 V1Mask.assign(NumElts, -1);
13046 V2Mask.assign(NumElts, -1);
13047 FinalMask.assign(NumElts, -1);
13048 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13049 for (int j = 0; j != NumEltsPerLane; ++j) {
13050 int M = Mask[i + j];
13051 if (M >= 0 && M < NumElts) {
13052 V1Mask[i + (j / 2)] = M;
13053 FinalMask[i + j] = i + (j / 2);
13054 } else if (M >= NumElts) {
13055 V2Mask[i + (j / 2)] = M - NumElts;
13056 FinalMask[i + j] = i + (j / 2) + NumElts;
13057 }
13058 }
13059 }
13060
13061 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13062 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13063 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13064}
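
A standalone sketch (hypothetical 4-element mask, not LLVM code) of the decomposition at the top of this routine: each input gets its own single-input shuffle mask, and FinalMask blends the two shuffled results:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {2, 5, 0, 7};
  int NumElts = (int)Mask.size();
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1), FinalMask(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;                  // element comes from V1
      FinalMask[i] = i;
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;        // element comes from V2
      FinalMask[i] = i + NumElts;
    }
  }
  auto Dump = [](const char *Name, const std::vector<int> &V) {
    std::printf("%s:", Name);
    for (int M : V) std::printf(" %d", M);
    std::puts("");
  };
  Dump("V1Mask", V1Mask);       // 2 -1 0 -1
  Dump("V2Mask", V2Mask);       // -1 1 -1 3
  Dump("FinalMask", FinalMask); // 0 5 2 7
  return 0;
}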
13065
13066/// Try to lower a vector shuffle as a bit rotation.
13067///
13068/// Look for a repeated rotation pattern in each sub group.
13069/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13070static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13071 int NumElts = Mask.size();
13072 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13073
13074 int RotateAmt = -1;
13075 for (int i = 0; i != NumElts; i += NumSubElts) {
13076 for (int j = 0; j != NumSubElts; ++j) {
13077 int M = Mask[i + j];
13078 if (M < 0)
13079 continue;
13080 if (!isInRange(M, i, i + NumSubElts))
13081 return -1;
13082 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13083 if (0 <= RotateAmt && Offset != RotateAmt)
13084 return -1;
13085 RotateAmt = Offset;
13086 }
13087 }
13088 return RotateAmt;
13089}
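
A standalone sketch (not LLVM code; the helper name and mask are illustrative) of the sub-group matcher above: an 8-element mask split into 4-element groups where every group rotates by the same amount matches, and the result is later scaled by the element width in bits:

#include <cstdio>
#include <vector>

static int MatchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)          // must stay inside its sub-group
        return -1;
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt) // all groups must agree
        return -1;
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  std::vector<int> Mask = {3, 0, 1, 2, 7, 4, 5, 6};
  std::printf("rotate amount = %d elements\n", MatchBitRotate(Mask, 4)); // 1
  return 0;
}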
13090
13091static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13092 const X86Subtarget &Subtarget,
13093 ArrayRef<int> Mask) {
13094 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13095 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13096
13097 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13098 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13099 int MaxSubElts = 64 / EltSizeInBits;
13100 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13101 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13102 if (RotateAmt < 0)
13103 continue;
13104
13105 int NumElts = Mask.size();
13106 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13107 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13108 return RotateAmt * EltSizeInBits;
13109 }
13110
13111 return -1;
13112}
13113
13114/// Lower shuffle using X86ISD::VROTLI rotations.
13115static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13116 ArrayRef<int> Mask,
13117 const X86Subtarget &Subtarget,
13118 SelectionDAG &DAG) {
13119 // Only XOP + AVX512 targets have bit rotation instructions.
13120 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13121 bool IsLegal =
13122 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13123 if (!IsLegal && Subtarget.hasSSE3())
13124 return SDValue();
13125
13126 MVT RotateVT;
13127 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13128 Subtarget, Mask);
13129 if (RotateAmt < 0)
13130 return SDValue();
13131
13132 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13133 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13134 // widen to vXi16 or more then the existing lowering will be better.
13135 if (!IsLegal) {
13136 if ((RotateAmt % 16) == 0)
13137 return SDValue();
13138 // TODO: Use getTargetVShiftByConstNode.
13139 unsigned ShlAmt = RotateAmt;
13140 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13141 V1 = DAG.getBitcast(RotateVT, V1);
13142 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13143 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13144 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13145 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13146 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13147 return DAG.getBitcast(VT, Rot);
13148 }
13149
13150 SDValue Rot =
13151 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13152 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13153 return DAG.getBitcast(VT, Rot);
13154}
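
A standalone sketch (illustrative values, not LLVM code) of the pre-SSSE3 fallback above, where the rotation of a widened element is emulated as OR(SHL, SRL):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Elt = 0xAABBCCDD;       // one widened 32-bit rotate element
  unsigned RotateAmt = 8;          // rotate left by 8 bits
  unsigned ShlAmt = RotateAmt;
  unsigned SrlAmt = 32 - RotateAmt;
  uint32_t Rot = (uint32_t)(Elt << ShlAmt) | (uint32_t)(Elt >> SrlAmt);
  std::printf("0x%08X rotl %u = 0x%08X\n", (unsigned)Elt, RotateAmt, (unsigned)Rot); // 0xBBCCDDAA
  return 0;
}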
13155
13156/// Try to match a vector shuffle as an element rotation.
13157///
13158 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13159static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13160 ArrayRef<int> Mask) {
13161 int NumElts = Mask.size();
13162
13163 // We need to detect various ways of spelling a rotation:
13164 // [11, 12, 13, 14, 15, 0, 1, 2]
13165 // [-1, 12, 13, 14, -1, -1, 1, -1]
13166 // [-1, -1, -1, -1, -1, -1, 1, 2]
13167 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13168 // [-1, 4, 5, 6, -1, -1, 9, -1]
13169 // [-1, 4, 5, 6, -1, -1, -1, -1]
13170 int Rotation = 0;
13171 SDValue Lo, Hi;
13172 for (int i = 0; i < NumElts; ++i) {
13173 int M = Mask[i];
13174 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13175 "Unexpected mask index.");
13176 if (M < 0)
13177 continue;
13178
13179 // Determine where a rotated vector would have started.
13180 int StartIdx = i - (M % NumElts);
13181 if (StartIdx == 0)
13182 // The identity rotation isn't interesting, stop.
13183 return -1;
13184
13185 // If we found the tail of a vector the rotation must be the missing
13186 // front. If we found the head of a vector, it must be how much of the
13187 // head.
13188 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13189
13190 if (Rotation == 0)
13191 Rotation = CandidateRotation;
13192 else if (Rotation != CandidateRotation)
13193 // The rotations don't match, so we can't match this mask.
13194 return -1;
13195
13196 // Compute which value this mask is pointing at.
13197 SDValue MaskV = M < NumElts ? V1 : V2;
13198
13199 // Compute which of the two target values this index should be assigned
13200 // to. This reflects whether the high elements are remaining or the low
13201 // elements are remaining.
13202 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13203
13204 // Either set up this value if we've not encountered it before, or check
13205 // that it remains consistent.
13206 if (!TargetV)
13207 TargetV = MaskV;
13208 else if (TargetV != MaskV)
13209 // This may be a rotation, but it pulls from the inputs in some
13210 // unsupported interleaving.
13211 return -1;
13212 }
13213
13214 // Check that we successfully analyzed the mask, and normalize the results.
13215 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13216 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13217 if (!Lo)
13218 Lo = Hi;
13219 else if (!Hi)
13220 Hi = Lo;
13221
13222 V1 = Lo;
13223 V2 = Hi;
13224
13225 return Rotation;
13226}
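
A standalone sketch (not LLVM code) of the matcher above, run on the example mask from its comment block, v8i16 [11, 12, 13, 14, 15, 0, 1, 2], which is a rotation by 3 elements:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {11, 12, 13, 14, 15, 0, 1, 2};
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts);   // where a rotated vector would have started
    if (StartIdx == 0) {
      std::puts("identity rotation - not interesting");
      return 0;
    }
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate) {
      std::puts("rotation amounts disagree");
      return 1;
    }
  }
  std::printf("rotation = %d elements\n", Rotation); // 3
  return 0;
}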
13227
13228/// Try to lower a vector shuffle as a byte rotation.
13229///
13230/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13231/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13232/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13233 /// try to generically lower a vector shuffle through such a pattern. It
13234/// does not check for the profitability of lowering either as PALIGNR or
13235/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13236/// This matches shuffle vectors that look like:
13237///
13238/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13239///
13240/// Essentially it concatenates V1 and V2, shifts right by some number of
13241/// elements, and takes the low elements as the result. Note that while this is
13242/// specified as a *right shift* because x86 is little-endian, it is a *left
13243/// rotate* of the vector lanes.
13244static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13245 ArrayRef<int> Mask) {
13246 // Don't accept any shuffles with zero elements.
13247 if (isAnyZero(Mask))
13248 return -1;
13249
13250 // PALIGNR works on 128-bit lanes.
13251 SmallVector<int, 16> RepeatedMask;
13252 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13253 return -1;
13254
13255 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13256 if (Rotation <= 0)
13257 return -1;
13258
13259 // PALIGNR rotates bytes, so we need to scale the
13260 // rotation based on how many bytes are in the vector lane.
13261 int NumElts = RepeatedMask.size();
13262 int Scale = 16 / NumElts;
13263 return Rotation * Scale;
13264}
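
A tiny standalone sketch of the scaling step above: PALIGNR rotates bytes, so the element rotation found for a repeated 128-bit lane is multiplied by the element width in bytes:

#include <cstdio>

int main() {
  int NumLaneElts = 8;            // a repeated v8i16 lane
  int Rotation = 3;               // element rotation from the matcher above
  int Scale = 16 / NumLaneElts;   // bytes per element
  std::printf("PALIGNR byte rotation = %d\n", Rotation * Scale); // 6
  return 0;
}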
13265
13266static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13267 SDValue V2, ArrayRef<int> Mask,
13268 const X86Subtarget &Subtarget,
13269 SelectionDAG &DAG) {
13270 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13271
13272 SDValue Lo = V1, Hi = V2;
13273 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13274 if (ByteRotation <= 0)
13275 return SDValue();
13276
13277 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13278 // PSLLDQ/PSRLDQ.
13279 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13280 Lo = DAG.getBitcast(ByteVT, Lo);
13281 Hi = DAG.getBitcast(ByteVT, Hi);
13282
13283 // SSSE3 targets can use the palignr instruction.
13284 if (Subtarget.hasSSSE3()) {
13285 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13286 "512-bit PALIGNR requires BWI instructions");
13287 return DAG.getBitcast(
13288 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13289 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13290 }
13291
13292 assert(VT.is128BitVector() &&
13293 "Rotate-based lowering only supports 128-bit lowering!");
13294 assert(Mask.size() <= 16 &&
13295 "Can shuffle at most 16 bytes in a 128-bit vector!");
13296 assert(ByteVT == MVT::v16i8 &&
13297 "SSE2 rotate lowering only needed for v16i8!");
13298
13299 // Default SSE2 implementation
13300 int LoByteShift = 16 - ByteRotation;
13301 int HiByteShift = ByteRotation;
13302
13303 SDValue LoShift =
13304 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13305 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13306 SDValue HiShift =
13307 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13308 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13309 return DAG.getBitcast(VT,
13310 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13311}
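
A standalone byte-array sketch (illustrative data, not LLVM code) of the SSE2 fallback above: the PALIGNR effect is rebuilt from a left byte shift of Lo, a right byte shift of Hi, and an OR:

#include <cstdio>

int main() {
  unsigned char Lo[16], Hi[16], Out[16] = {};
  for (int i = 0; i < 16; ++i) {
    Lo[i] = (unsigned char)i;           // bytes 0..15
    Hi[i] = (unsigned char)(16 + i);    // bytes 16..31
  }
  int ByteRotation = 6;
  int LoByteShift = 16 - ByteRotation;  // VSHLDQ amount
  int HiByteShift = ByteRotation;       // VSRLDQ amount
  for (int i = LoByteShift; i < 16; ++i)
    Out[i] = Lo[i - LoByteShift];       // PSLLDQ: bytes move to higher positions
  for (int i = 0; i + HiByteShift < 16; ++i)
    Out[i] |= Hi[i + HiByteShift];      // PSRLDQ then POR
  for (int i = 0; i < 16; ++i)
    std::printf("%d ", Out[i]);         // 22..31 followed by 0..5
  std::puts("");
  return 0;
}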
13312
13313/// Try to lower a vector shuffle as a dword/qword rotation.
13314///
13315 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13316 /// rotation of the concatenation of two vectors; this routine will
13317 /// try to generically lower a vector shuffle through such a pattern.
13318///
13319/// Essentially it concatenates V1 and V2, shifts right by some number of
13320/// elements, and takes the low elements as the result. Note that while this is
13321/// specified as a *right shift* because x86 is little-endian, it is a *left
13322/// rotate* of the vector lanes.
13323static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13324 SDValue V2, ArrayRef<int> Mask,
13325 const X86Subtarget &Subtarget,
13326 SelectionDAG &DAG) {
13327 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13328 "Only 32-bit and 64-bit elements are supported!");
13329
13330 // 128/256-bit vectors are only supported with VLX.
13331 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13332 && "VLX required for 128/256-bit vectors");
13333
13334 SDValue Lo = V1, Hi = V2;
13335 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13336 if (Rotation <= 0)
13337 return SDValue();
13338
13339 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13340 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13341}
13342
13343/// Try to lower a vector shuffle as a byte shift sequence.
13344static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13345 SDValue V2, ArrayRef<int> Mask,
13346 const APInt &Zeroable,
13347 const X86Subtarget &Subtarget,
13348 SelectionDAG &DAG) {
13349 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13350 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13351
13352 // We need a shuffle that has zeros at one/both ends and a sequential
13353 // shuffle from one source within.
13354 unsigned ZeroLo = Zeroable.countTrailingOnes();
13355 unsigned ZeroHi = Zeroable.countLeadingOnes();
13356 if (!ZeroLo && !ZeroHi)
13357 return SDValue();
13358
13359 unsigned NumElts = Mask.size();
13360 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13361 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13362 return SDValue();
13363
13364 unsigned Scale = VT.getScalarSizeInBits() / 8;
13365 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13366 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13367 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13368 return SDValue();
13369
13370 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13371 Res = DAG.getBitcast(MVT::v16i8, Res);
13372
13373 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13374 // inner sequential set of elements, possibly offset:
13375 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13376 // 01234567 --> 4567zzzz --> zzzzz456
13377 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13378 if (ZeroLo == 0) {
13379 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13380 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13381 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13382 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13383 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13384 } else if (ZeroHi == 0) {
13385 unsigned Shift = Mask[ZeroLo] % NumElts;
13386 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13387 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13388 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13389 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13390 } else if (!Subtarget.hasSSSE3()) {
13391 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13392 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13393 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13394 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13395 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13396 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13397 Shift += Mask[ZeroLo] % NumElts;
13398 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13399 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13400 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13401 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13402 } else
13403 return SDValue();
13404
13405 return DAG.getBitcast(VT, Res);
13406}
13407
13408/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13409///
13410/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13411/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13412/// matches elements from one of the input vectors shuffled to the left or
13413/// right with zeroable elements 'shifted in'. It handles both the strictly
13414/// bit-wise element shifts and the byte shift across an entire 128-bit double
13415/// quad word lane.
13416///
13417 /// PSLL : (little-endian) left bit shift.
13418/// [ zz, 0, zz, 2 ]
13419/// [ -1, 4, zz, -1 ]
13420/// PSRL : (little-endian) right bit shift.
13421/// [ 1, zz, 3, zz]
13422/// [ -1, -1, 7, zz]
13423/// PSLLDQ : (little-endian) left byte shift
13424/// [ zz, 0, 1, 2, 3, 4, 5, 6]
13425/// [ zz, zz, -1, -1, 2, 3, 4, -1]
13426/// [ zz, zz, zz, zz, zz, zz, -1, 1]
13427/// PSRLDQ : (little-endian) right byte shift
13428/// [ 5, 6, 7, zz, zz, zz, zz, zz]
13429/// [ -1, 5, 6, 7, zz, zz, zz, zz]
13430/// [ 1, 2, -1, -1, -1, -1, zz, zz]
13431static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13432 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13433 int MaskOffset, const APInt &Zeroable,
13434 const X86Subtarget &Subtarget) {
13435 int Size = Mask.size();
13436 unsigned SizeInBits = Size * ScalarSizeInBits;
13437
13438 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13439 for (int i = 0; i < Size; i += Scale)
13440 for (int j = 0; j < Shift; ++j)
13441 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13442 return false;
13443
13444 return true;
13445 };
13446
13447 auto MatchShift = [&](int Shift, int Scale, bool Left) {
13448 for (int i = 0; i != Size; i += Scale) {
13449 unsigned Pos = Left ? i + Shift : i;
13450 unsigned Low = Left ? i : i + Shift;
13451 unsigned Len = Scale - Shift;
13452 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13453 return -1;
13454 }
13455
13456 int ShiftEltBits = ScalarSizeInBits * Scale;
13457 bool ByteShift = ShiftEltBits > 64;
13458 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13459 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13460 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
13461
13462 // Normalize the scale for byte shifts to still produce an i64 element
13463 // type.
13464 Scale = ByteShift ? Scale / 2 : Scale;
13465
13466 // We need to round trip through the appropriate type for the shift.
13467 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
13468 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
13469 : MVT::getVectorVT(ShiftSVT, Size / Scale);
13470 return (int)ShiftAmt;
13471 };
13472
13473 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
13474 // keep doubling the size of the integer elements up to that. We can
13475 // then shift the elements of the integer vector by whole multiples of
13476 // their width within the elements of the larger integer vector. Test each
13477 // multiple to see if we can find a match with the moved element indices
13478 // and that the shifted in elements are all zeroable.
13479 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
13480 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
13481 for (int Shift = 1; Shift != Scale; ++Shift)
13482 for (bool Left : {true, false})
13483 if (CheckZeros(Shift, Scale, Left)) {
13484 int ShiftAmt = MatchShift(Shift, Scale, Left);
13485 if (0 < ShiftAmt)
13486 return ShiftAmt;
13487 }
13488
13489 // no match
13490 return -1;
13491}
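
A condensed standalone sketch (not LLVM code; -2 marks a zeroable slot) of one match attempt above, for the documented PSLL pattern [ zz, 0, zz, 2, zz, 4, zz, 6 ] on v8i16: pairs of 16-bit elements form 32-bit elements shifted left by one sub-element, i.e. VSHLI by 16 bits on v4i32:

#include <cstdio>
#include <vector>

int main() {
  const int Z = -2;                      // zeroable slot
  std::vector<int> Mask = {Z, 0, Z, 2, Z, 4, Z, 6};
  int Size = (int)Mask.size(), ScalarBits = 16, Scale = 2, Shift = 1;
  // For a left shift, the low Shift slots of every Scale-sized group must be zeroable.
  bool ZerosOk = true;
  for (int i = 0; i < Size; i += Scale)
    for (int j = 0; j < Shift; ++j)
      ZerosOk &= (Mask[i + j] == Z);
  // The remaining slots must be sequential V1 elements starting at the group base.
  bool SeqOk = true;
  for (int i = 0; i < Size; i += Scale)
    for (int j = Shift; j < Scale; ++j)
      SeqOk &= (Mask[i + j] < 0 || Mask[i + j] == i + j - Shift);
  if (ZerosOk && SeqOk)
    std::printf("VSHLI by %d bits on v4i32\n", Shift * ScalarBits); // 16
  return 0;
}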
13492
13493static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
13494 SDValue V2, ArrayRef<int> Mask,
13495 const APInt &Zeroable,
13496 const X86Subtarget &Subtarget,
13497 SelectionDAG &DAG) {
13498 int Size = Mask.size();
13499 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13500
13501 MVT ShiftVT;
13502 SDValue V = V1;
13503 unsigned Opcode;
13504
13505 // Try to match shuffle against V1 shift.
13506 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13507 Mask, 0, Zeroable, Subtarget);
13508
13509 // If V1 failed, try to match shuffle against V2 shift.
13510 if (ShiftAmt < 0) {
13511 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13512 Mask, Size, Zeroable, Subtarget);
13513 V = V2;
13514 }
13515
13516 if (ShiftAmt < 0)
13517 return SDValue();
13518
13519 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
13520 "Illegal integer vector type");
13521 V = DAG.getBitcast(ShiftVT, V);
13522 V = DAG.getNode(Opcode, DL, ShiftVT, V,
13523 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
13524 return DAG.getBitcast(VT, V);
13525}
13526
13527// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
13528// Remainder of lower half result is zero and upper half is all undef.
13529static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
13530 ArrayRef<int> Mask, uint64_t &BitLen,
13531 uint64_t &BitIdx, const APInt &Zeroable) {
13532 int Size = Mask.size();
13533 int HalfSize = Size / 2;
13534 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13535 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
13536
13537 // Upper half must be undefined.
13538 if (!isUndefUpperHalf(Mask))
13539 return false;
13540
13541 // Determine the extraction length from the part of the
13542 // lower half that isn't zeroable.
13543 int Len = HalfSize;
13544 for (; Len > 0; --Len)
13545 if (!Zeroable[Len - 1])
13546 break;
13547 assert(Len > 0 && "Zeroable shuffle mask");
13548
13549 // Attempt to match first Len sequential elements from the lower half.
13550 SDValue Src;
13551 int Idx = -1;
13552 for (int i = 0; i != Len; ++i) {
13553 int M = Mask[i];
13554 if (M == SM_SentinelUndef)
13555 continue;
13556 SDValue &V = (M < Size ? V1 : V2);
13557 M = M % Size;
13558
13559 // The extracted elements must start at a valid index and all mask
13560 // elements must be in the lower half.
13561 if (i > M || M >= HalfSize)
13562 return false;
13563
13564 if (Idx < 0 || (Src == V && Idx == (M - i))) {
13565 Src = V;
13566 Idx = M - i;
13567 continue;
13568 }
13569 return false;
13570 }
13571
13572 if (!Src || Idx < 0)
13573 return false;
13574
13575 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13576 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13577 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13578 V1 = Src;
13579 return true;
13580}
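
A standalone sketch (hypothetical v8i16 mask [2, 3, zz, zz, undef x4], not LLVM code) of the EXTRQI field computation above: the zeroable tail of the lower half trims the length, and the immediates are expressed in bits:

#include <cstdint>
#include <cstdio>

int main() {
  bool Zeroable[4] = {false, false, true, true};   // lower half of the mask
  int HalfSize = 4, ScalarBits = 16;
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;                                       // Len == 2
  int Idx = 2;                                     // Mask[0] == 2, Mask[1] == 3
  uint64_t BitLen = (uint64_t)(Len * ScalarBits) & 0x3f;
  uint64_t BitIdx = (uint64_t)(Idx * ScalarBits) & 0x3f;
  std::printf("EXTRQI BitLen=%llu BitIdx=%llu\n",  // 32 and 32
              (unsigned long long)BitLen, (unsigned long long)BitIdx);
  return 0;
}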
13581
13582// INSERTQ: Extract lowest Len elements from lower half of second source and
13583// insert over first source, starting at Idx.
13584// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13585static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13586 ArrayRef<int> Mask, uint64_t &BitLen,
13587 uint64_t &BitIdx) {
13588 int Size = Mask.size();
13589 int HalfSize = Size / 2;
13590 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13591
13592 // Upper half must be undefined.
13593 if (!isUndefUpperHalf(Mask))
13594 return false;
13595
13596 for (int Idx = 0; Idx != HalfSize; ++Idx) {
13597 SDValue Base;
13598
13599 // Attempt to match first source from mask before insertion point.
13600 if (isUndefInRange(Mask, 0, Idx)) {
13601 /* EMPTY */
13602 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13603 Base = V1;
13604 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13605 Base = V2;
13606 } else {
13607 continue;
13608 }
13609
13610 // Extend the extraction length looking to match both the insertion of
13611 // the second source and the remaining elements of the first.
13612 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13613 SDValue Insert;
13614 int Len = Hi - Idx;
13615
13616 // Match insertion.
13617 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13618 Insert = V1;
13619 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13620 Insert = V2;
13621 } else {
13622 continue;
13623 }
13624
13625 // Match the remaining elements of the lower half.
13626 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13627 /* EMPTY */
13628 } else if ((!Base || (Base == V1)) &&
13629 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13630 Base = V1;
13631 } else if ((!Base || (Base == V2)) &&
13632 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13633 Size + Hi)) {
13634 Base = V2;
13635 } else {
13636 continue;
13637 }
13638
13639 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13640 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13641 V1 = Base;
13642 V2 = Insert;
13643 return true;
13644 }
13645 }
13646
13647 return false;
13648}
13649
13650/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13651static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13652 SDValue V2, ArrayRef<int> Mask,
13653 const APInt &Zeroable, SelectionDAG &DAG) {
13654 uint64_t BitLen, BitIdx;
13655 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13656 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13657 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13658 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13659
13660 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13661 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13662 V2 ? V2 : DAG.getUNDEF(VT),
13663 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13664 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13665
13666 return SDValue();
13667}
13668
13669/// Lower a vector shuffle as a zero or any extension.
13670///
13671/// Given a specific number of elements, element bit width, and extension
13672/// stride, produce either a zero or any extension based on the available
13673/// features of the subtarget. The extended elements are consecutive and
13674 /// can start from an offsetted element index in the input; to
13675 /// avoid excess shuffling the offset must either be in the bottom lane
13676/// or at the start of a higher lane. All extended elements must be from
13677/// the same lane.
13678static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13679 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13680 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13681 assert(Scale > 1 && "Need a scale to extend.");
13682 int EltBits = VT.getScalarSizeInBits();
13683 int NumElements = VT.getVectorNumElements();
13684 int NumEltsPerLane = 128 / EltBits;
13685 int OffsetLane = Offset / NumEltsPerLane;
13686 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13687 "Only 8, 16, and 32 bit elements can be extended.");
13688 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13689 assert(0 <= Offset && "Extension offset must be positive.");
13690 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13691 "Extension offset must be in the first lane or start an upper lane.");
13692
13693 // Check that an index is in the same lane as the base offset.
13694 auto SafeOffset = [&](int Idx) {
13695 return OffsetLane == (Idx / NumEltsPerLane);
13696 };
13697
13698 // Shift along an input so that the offset base moves to the first element.
13699 auto ShuffleOffset = [&](SDValue V) {
13700 if (!Offset)
13701 return V;
13702
13703 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13704 for (int i = 0; i * Scale < NumElements; ++i) {
13705 int SrcIdx = i + Offset;
13706 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13707 }
13708 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13709 };
13710
13711 // Found a valid a/zext mask! Try various lowering strategies based on the
13712 // input type and available ISA extensions.
13713 if (Subtarget.hasSSE41()) {
13714 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13715 // PUNPCK will catch this in a later shuffle match.
13716 if (Offset && Scale == 2 && VT.is128BitVector())
13717 return SDValue();
13718 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13719 NumElements / Scale);
13720 InputV = ShuffleOffset(InputV);
13721 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13722 DL, ExtVT, InputV, DAG);
13723 return DAG.getBitcast(VT, InputV);
13724 }
13725
13726 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13727
13728 // For any-extends we can cheat for larger element sizes and use shuffle
13729 // instructions that can fold with a load and/or copy.
13730 if (AnyExt && EltBits == 32) {
13731 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13732 -1};
13733 return DAG.getBitcast(
13734 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13735 DAG.getBitcast(MVT::v4i32, InputV),
13736 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13737 }
13738 if (AnyExt && EltBits == 16 && Scale > 2) {
13739 int PSHUFDMask[4] = {Offset / 2, -1,
13740 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13741 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13742 DAG.getBitcast(MVT::v4i32, InputV),
13743 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13744 int PSHUFWMask[4] = {1, -1, -1, -1};
13745 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13746 return DAG.getBitcast(
13747 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13748 DAG.getBitcast(MVT::v8i16, InputV),
13749 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13750 }
13751
13752 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13753 // to 64-bits.
13754 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13755 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13756 assert(VT.is128BitVector() && "Unexpected vector width!");
13757
13758 int LoIdx = Offset * EltBits;
13759 SDValue Lo = DAG.getBitcast(
13760 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13761 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13762 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13763
13764 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13765 return DAG.getBitcast(VT, Lo);
13766
13767 int HiIdx = (Offset + 1) * EltBits;
13768 SDValue Hi = DAG.getBitcast(
13769 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13770 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13771 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13772 return DAG.getBitcast(VT,
13773 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13774 }
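// Illustrative example (not from the source, assuming the usual SSE4A EXTRQ
// behaviour of extracting a zero-extended bit-field from the low 64 bits):
// for EltBits = 16, Scale = 4 and Offset = 0, Lo extracts element 0 into a
// zero-extended 64-bit lane (LoIdx = 0), Hi extracts element 1 (HiIdx = 16),
// and the final UNPCKL places them in lanes 0 and 1 of the v2i64 result.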
13775
13776 // If this would require more than 2 unpack instructions to expand, use
13777 // pshufb when available. We can only use more than 2 unpack instructions
13778 // when zero extending i8 elements which also makes it easier to use pshufb.
13779 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13780 assert(NumElements == 16 && "Unexpected byte vector width!");
13781 SDValue PSHUFBMask[16];
13782 for (int i = 0; i < 16; ++i) {
13783 int Idx = Offset + (i / Scale);
13784 if ((i % Scale == 0 && SafeOffset(Idx))) {
13785 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13786 continue;
13787 }
13788 PSHUFBMask[i] =
13789 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13790 }
13791 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13792 return DAG.getBitcast(
13793 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13794 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13795 }
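// Illustrative example (not from the source): zero-extending i8 -> i32
// (Scale = 4, Offset = 0) produces the PSHUFB mask
//   {0, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80, 3, 0x80, 0x80, 0x80}
// where 0x80 (top bit set) makes PSHUFB write a zero byte, so bytes 0..3 are
// each followed by three zero bytes in the result.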
13796
13797 // If we are extending from an offset, ensure we start on a boundary that
13798 // we can unpack from.
13799 int AlignToUnpack = Offset % (NumElements / Scale);
13800 if (AlignToUnpack) {
13801 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13802 for (int i = AlignToUnpack; i < NumElements; ++i)
13803 ShMask[i - AlignToUnpack] = i;
13804 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13805 Offset -= AlignToUnpack;
13806 }
13807
13808 // Otherwise emit a sequence of unpacks.
13809 do {
13810 unsigned UnpackLoHi = X86ISD::UNPCKL;
13811 if (Offset >= (NumElements / 2)) {
13812 UnpackLoHi = X86ISD::UNPCKH;
13813 Offset -= (NumElements / 2);
13814 }
13815
13816 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13817 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13818 : getZeroVector(InputVT, Subtarget, DAG, DL);
13819 InputV = DAG.getBitcast(InputVT, InputV);
13820 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13821 Scale /= 2;
13822 EltBits *= 2;
13823 NumElements /= 2;
13824 } while (Scale > 1);
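// Illustrative example (not from the source): zero-extending v16i8 -> v4i32
// (Scale = 4, Offset = 0) runs this loop twice: first PUNPCKLBW against a zero
// vector widens the low 8 bytes to i16, then PUNPCKLWD against zero widens the
// low 4 words to i32, at which point Scale reaches 1 and the loop exits.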
13825 return DAG.getBitcast(VT, InputV);
13826}
13827
13828/// Try to lower a vector shuffle as a zero extension on any microarch.
13829///
13830/// This routine will try to do everything in its power to cleverly lower
13831/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13832/// check for the profitability of this lowering, it tries to aggressively
13833/// match this pattern. It will use all of the micro-architectural details it
13834/// can to emit an efficient lowering. It handles both blends with all-zero
13835/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13836/// masking out later).
13837///
13838/// The reason we have dedicated lowering for zext-style shuffles is that they
13839/// are both incredibly common and often quite performance sensitive.
13840static SDValue lowerShuffleAsZeroOrAnyExtend(
13841 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13842 const APInt &Zeroable, const X86Subtarget &Subtarget,
13843 SelectionDAG &DAG) {
13844 int Bits = VT.getSizeInBits();
13845 int NumLanes = Bits / 128;
13846 int NumElements = VT.getVectorNumElements();
13847 int NumEltsPerLane = NumElements / NumLanes;
13848 assert(VT.getScalarSizeInBits() <= 32 &&
13849 "Exceeds 32-bit integer zero extension limit");
13850 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13851
13852 // Define a helper function to check a particular ext-scale and lower to it if
13853 // valid.
13854 auto Lower = [&](int Scale) -> SDValue {
13855 SDValue InputV;
13856 bool AnyExt = true;
13857 int Offset = 0;
13858 int Matches = 0;
13859 for (int i = 0; i < NumElements; ++i) {
13860 int M = Mask[i];
13861 if (M < 0)
13862 continue; // Valid anywhere but doesn't tell us anything.
13863 if (i % Scale != 0) {
13864 // Each of the extended elements need to be zeroable.
13865 if (!Zeroable[i])
13866 return SDValue();
13867
13868 // We no longer are in the anyext case.
13869 AnyExt = false;
13870 continue;
13871 }
13872
13873 // Each of the base elements needs to be consecutive indices into the
13874 // same input vector.
13875 SDValue V = M < NumElements ? V1 : V2;
13876 M = M % NumElements;
13877 if (!InputV) {
13878 InputV = V;
13879 Offset = M - (i / Scale);
13880 } else if (InputV != V)
13881 return SDValue(); // Flip-flopping inputs.
13882
13883 // Offset must start in the lowest 128-bit lane or at the start of an
13884 // upper lane.
13885 // FIXME: Is it ever worth allowing a negative base offset?
13886 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13887 (Offset % NumEltsPerLane) == 0))
13888 return SDValue();
13889
13890 // If we are offsetting, all referenced entries must come from the same
13891 // lane.
13892 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13893 return SDValue();
13894
13895 if ((M % NumElements) != (Offset + (i / Scale)))
13896 return SDValue(); // Non-consecutive strided elements.
13897 Matches++;
13898 }
13899
13900 // If we fail to find an input, we have a zero-shuffle which should always
13901 // have already been handled.
13902 // FIXME: Maybe handle this here in case during blending we end up with one?
13903 if (!InputV)
13904 return SDValue();
13905
13906 // If we are offsetting, don't extend if we only match a single input, we
13907 // can always do better by using a basic PSHUF or PUNPCK.
13908 if (Offset != 0 && Matches < 2)
13909 return SDValue();
13910
13911 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13912 InputV, Mask, Subtarget, DAG);
13913 };
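// Illustrative example (not from the source): for a v4i32 shuffle whose mask
// is {0, z, 1, z} with the z positions known zeroable, Scale = 2 matches with
// InputV = V1, Offset = 0 and AnyExt = false, i.e. a zero-extension of
// elements 0 and 1 into the two 64-bit halves of the result.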
13914
13915 // The widest scale possible for extending is to a 64-bit integer.
13916 assert(Bits % 64 == 0 &&
13917 "The number of bits in a vector must be divisible by 64 on x86!");
13918 int NumExtElements = Bits / 64;
13919
13920 // Each iteration, try extending the elements half as much, but into twice as
13921 // many elements.
13922 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13923 assert(NumElements % NumExtElements == 0 &&
13924 "The input vector size must be divisible by the extended size.");
13925 if (SDValue V = Lower(NumElements / NumExtElements))
13926 return V;
13927 }
13928
13929 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13930 if (Bits != 128)
13931 return SDValue();
13932
13933 // Returns one of the source operands if the shuffle can be reduced to a
13934 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13935 auto CanZExtLowHalf = [&]() {
13936 for (int i = NumElements / 2; i != NumElements; ++i)
13937 if (!Zeroable[i])
13938 return SDValue();
13939 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13940 return V1;
13941 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13942 return V2;
13943 return SDValue();
13944 };
13945
13946 if (SDValue V = CanZExtLowHalf()) {
13947 V = DAG.getBitcast(MVT::v2i64, V);
13948 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13949 return DAG.getBitcast(VT, V);
13950 }
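// Illustrative example (not from the source): a v2i64 shuffle <0, z> whose
// upper lane is zeroable reaches this path and becomes a single
// X86ISD::VZEXT_MOVL (MOVQ), copying lane 0 and zeroing lane 1.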
13951
13952 // No viable ext lowering found.
13953 return SDValue();
13954}
13955
13956/// Try to get a scalar value for a specific element of a vector.
13957///
13958/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13959static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13960 SelectionDAG &DAG) {
13961 MVT VT = V.getSimpleValueType();
13962 MVT EltVT = VT.getVectorElementType();
13963 V = peekThroughBitcasts(V);
13964
13965 // If the bitcasts shift the element size, we can't extract an equivalent
13966 // element from it.
13967 MVT NewVT = V.getSimpleValueType();
13968 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13969 return SDValue();
13970
13971 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13972 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13973 // Ensure the scalar operand is the same size as the destination.
13974 // FIXME: Add support for scalar truncation where possible.
13975 SDValue S = V.getOperand(Idx);
13976 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13977 return DAG.getBitcast(EltVT, S);
13978 }
13979
13980 return SDValue();
13981}
13982
13983/// Helper to test for a load that can be folded with x86 shuffles.
13984///
13985/// This is particularly important because the set of instructions varies
13986/// significantly based on whether the operand is a load or not.
13987static bool isShuffleFoldableLoad(SDValue V) {
13988 V = peekThroughBitcasts(V);
13989 return ISD::isNON_EXTLoad(V.getNode());
13990}
13991
13992/// Try to lower insertion of a single element into a zero vector.
13993///
13994/// This is a common pattern that we have especially efficient patterns to lower
13995/// across all subtarget feature sets.
13996static SDValue lowerShuffleAsElementInsertion(
13997 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13998 const APInt &Zeroable, const X86Subtarget &Subtarget,
13999 SelectionDAG &DAG) {
14000 MVT ExtVT = VT;
14001 MVT EltVT = VT.getVectorElementType();
14002
14003 int V2Index =
14004 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14005 Mask.begin();
14006 bool IsV1Zeroable = true;
14007 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14008 if (i != V2Index && !Zeroable[i]) {
14009 IsV1Zeroable = false;
14010 break;
14011 }
14012
14013 // Check for a single input from a SCALAR_TO_VECTOR node.
14014 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14015 // all the smarts here sunk into that routine. However, the current
14016 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14017 // vector shuffle lowering is dead.
14018 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14019 DAG);
14020 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14021 // We need to zext the scalar if it is smaller than an i32.
14022 V2S = DAG.getBitcast(EltVT, V2S);
14023 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14024 // Using zext to expand a narrow element won't work for non-zero
14025 // insertions.
14026 if (!IsV1Zeroable)
14027 return SDValue();
14028
14029 // Zero-extend directly to i32.
14030 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14031 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14032 }
14033 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14034 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14035 EltVT == MVT::i16) {
14036 // Either not inserting from the low element of the input or the input
14037 // element size is too small to use VZEXT_MOVL to clear the high bits.
14038 return SDValue();
14039 }
14040
14041 if (!IsV1Zeroable) {
14042 // If V1 can't be treated as a zero vector we have fewer options to lower
14043 // this. We can't support integer vectors or non-zero targets cheaply, and
14044 // the V1 elements can't be permuted in any way.
14045 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14046 if (!VT.isFloatingPoint() || V2Index != 0)
14047 return SDValue();
14048 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
14049 V1Mask[V2Index] = -1;
14050 if (!isNoopShuffleMask(V1Mask))
14051 return SDValue();
14052 if (!VT.is128BitVector())
14053 return SDValue();
14054
14055 // Otherwise, use MOVSD, MOVSS or MOVSH.
14056 unsigned MovOpc = 0;
14057 if (EltVT == MVT::f16)
14058 MovOpc = X86ISD::MOVSH;
14059 else if (EltVT == MVT::f32)
14060 MovOpc = X86ISD::MOVSS;
14061 else if (EltVT == MVT::f64)
14062 MovOpc = X86ISD::MOVSD;
14063 else
14064 llvm_unreachable("Unsupported floating point element type to handle!");
14065 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14066 }
14067
14068 // This lowering only works for the low element with floating point vectors.
14069 if (VT.isFloatingPoint() && V2Index != 0)
14070 return SDValue();
14071
14072 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14073 if (ExtVT != VT)
14074 V2 = DAG.getBitcast(VT, V2);
14075
14076 if (V2Index != 0) {
14077 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14078 // the desired position. Otherwise it is more efficient to do a vector
14079 // shift left. We know that we can do a vector shift left because all
14080 // the inputs are zero.
14081 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14082 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14083 V2Shuffle[V2Index] = 0;
14084 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14085 } else {
14086 V2 = DAG.getBitcast(MVT::v16i8, V2);
14087 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14088 DAG.getTargetConstant(
14089 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14090 V2 = DAG.getBitcast(VT, V2);
14091 }
14092 }
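// Illustrative example (not from the source): inserting into element 5 of a
// v8i16 takes the byte-shift path, and the shift amount is
// V2Index * EltBits / 8 = 5 * 16 / 8 = 10 bytes, i.e. PSLLDQ $10 moves the
// zero-extended scalar from lane 0 up to lane 5 while keeping the rest zero.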
14093 return V2;
14094}
14095
14096/// Try to lower broadcast of a single - truncated - integer element,
14097/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14098///
14099/// This assumes we have AVX2.
14100static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14101 int BroadcastIdx,
14102 const X86Subtarget &Subtarget,
14103 SelectionDAG &DAG) {
14104 assert(Subtarget.hasAVX2() &&
14105 "We can only lower integer broadcasts with AVX2!");
14106
14107 MVT EltVT = VT.getVectorElementType();
14108 MVT V0VT = V0.getSimpleValueType();
14109
14110 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14111 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14112
14113 MVT V0EltVT = V0VT.getVectorElementType();
14114 if (!V0EltVT.isInteger())
14115 return SDValue();
14116
14117 const unsigned EltSize = EltVT.getSizeInBits();
14118 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14119
14120 // This is only a truncation if the original element type is larger.
14121 if (V0EltSize <= EltSize)
14122 return SDValue();
14123
14124 assert(((V0EltSize % EltSize) == 0) &&
14125 "Scalar type sizes must all be powers of 2 on x86!");
14126
14127 const unsigned V0Opc = V0.getOpcode();
14128 const unsigned Scale = V0EltSize / EltSize;
14129 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14130
14131 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14132 V0Opc != ISD::BUILD_VECTOR)
14133 return SDValue();
14134
14135 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14136
14137 // If we're extracting non-least-significant bits, shift so we can truncate.
14138 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14139 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14140 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14141 if (const int OffsetIdx = BroadcastIdx % Scale)
14142 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14143 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14144
14145 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14146 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14147}
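// Illustrative example (not from the source): broadcasting element 1 of a
// v8i16 shuffle whose source is a v4i32 build_vector gives Scale = 2,
// V0BroadcastIdx = 0 and OffsetIdx = 1, so the i32 scalar is shifted right by
// 16 bits, truncated to i16, and broadcast (e.g. VPBROADCASTW on AVX2).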
14148
14149/// Test whether this can be lowered with a single SHUFPS instruction.
14150///
14151/// This is used to disable more specialized lowerings when the shufps lowering
14152/// will happen to be efficient.
14153static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14154 // This routine only handles 128-bit shufps.
14155 assert(Mask.size() == 4 && "Unsupported mask size!");
14156 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14157 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14158 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14159 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14160
14161 // To lower with a single SHUFPS we need to have the low half and high half
14162 // each requiring a single input.
14163 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14164 return false;
14165 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14166 return false;
14167
14168 return true;
14169}
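// Illustrative examples (not from the source): {0, 1, 4, 5} is a single
// SHUFPS (low half from V1, high half from V2), while {0, 4, 1, 5} is not,
// because each half of that mask mixes both inputs.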
14170
14171/// If we are extracting two 128-bit halves of a vector and shuffling the
14172/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14173/// multi-shuffle lowering.
14174static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14175 SDValue N1, ArrayRef<int> Mask,
14176 SelectionDAG &DAG) {
14177 MVT VT = N0.getSimpleValueType();
14178 assert((VT.is128BitVector() &&
14179 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14180 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14181
14182 // Check that both sources are extracts of the same source vector.
14183 if (!N0.hasOneUse() || !N1.hasOneUse() ||
14184 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14185 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14186 N0.getOperand(0) != N1.getOperand(0))
14187 return SDValue();
14188
14189 SDValue WideVec = N0.getOperand(0);
14190 MVT WideVT = WideVec.getSimpleValueType();
14191 if (!WideVT.is256BitVector())
14192 return SDValue();
14193
14194 // Match extracts of each half of the wide source vector. Commute the shuffle
14195 // if the extract of the low half is N1.
14196 unsigned NumElts = VT.getVectorNumElements();
14197 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14198 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14199 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14200 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14201 ShuffleVectorSDNode::commuteMask(NewMask);
14202 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14203 return SDValue();
14204
14205 // Final bailout: if the mask is simple, we are better off using an extract
14206 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14207 // because that avoids a constant load from memory.
14208 if (NumElts == 4 &&
14209 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
14210 return SDValue();
14211
14212 // Extend the shuffle mask with undef elements.
14213 NewMask.append(NumElts, -1);
14214
14215 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14216 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14217 NewMask);
14218 // This is free: ymm -> xmm.
14219 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14220 DAG.getIntPtrConstant(0, DL));
14221}
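// Illustrative example (not from the source, assuming the mask survives the
// bailout above): shuffling extract(X, 0) and extract(X, 4) of a v8f32 X with
// mask {0, 4, 1, 6} becomes a single VPERMPS of X with mask
// {0, 4, 1, 6, -1, -1, -1, -1}, followed by a free ymm -> xmm extract.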
14222
14223/// Try to lower broadcast of a single element.
14224///
14225/// For convenience, this code also bundles all of the subtarget feature set
14226/// filtering. While a little annoying to re-dispatch on type here, there isn't
14227/// a convenient way to factor it out.
14228static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14229 SDValue V2, ArrayRef<int> Mask,
14230 const X86Subtarget &Subtarget,
14231 SelectionDAG &DAG) {
14232 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14233 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14234 (Subtarget.hasAVX2() && VT.isInteger())))
14235 return SDValue();
14236
14237 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14238 // we can only broadcast from a register with AVX2.
14239 unsigned NumEltBits = VT.getScalarSizeInBits();
14240 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14241 ? X86ISD::MOVDDUP
14242 : X86ISD::VBROADCAST;
14243 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14244
14245 // Check that the mask is a broadcast.
14246 int BroadcastIdx = getSplatIndex(Mask);
14247 if (BroadcastIdx < 0)
14248 return SDValue();
14249 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14250 "a sorted mask where the broadcast "
14251 "comes from V1.");
14252
14253 // Go up the chain of (vector) values to find a scalar load that we can
14254 // combine with the broadcast.
14255 // TODO: Combine this logic with findEltLoadSrc() used by
14256 // EltsFromConsecutiveLoads().
14257 int BitOffset = BroadcastIdx * NumEltBits;
14258 SDValue V = V1;
14259 for (;;) {
14260 switch (V.getOpcode()) {
14261 case ISD::BITCAST: {
14262 V = V.getOperand(0);
14263 continue;
14264 }
14265 case ISD::CONCAT_VECTORS: {
14266 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14267 int OpIdx = BitOffset / OpBitWidth;
14268 V = V.getOperand(OpIdx);
14269 BitOffset %= OpBitWidth;
14270 continue;
14271 }
14272 case ISD::EXTRACT_SUBVECTOR: {
14273 // The extraction index adds to the existing offset.
14274 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14275 unsigned Idx = V.getConstantOperandVal(1);
14276 unsigned BeginOffset = Idx * EltBitWidth;
14277 BitOffset += BeginOffset;
14278 V = V.getOperand(0);
14279 continue;
14280 }
14281 case ISD::INSERT_SUBVECTOR: {
14282 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14283 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14284 int Idx = (int)V.getConstantOperandVal(2);
14285 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14286 int BeginOffset = Idx * EltBitWidth;
14287 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14288 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14289 BitOffset -= BeginOffset;
14290 V = VInner;
14291 } else {
14292 V = VOuter;
14293 }
14294 continue;
14295 }
14296 }
14297 break;
14298 }
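// Illustrative walk-through (not from the source): if V1 is a v8i32
// concat_vectors(A, B) and BroadcastIdx = 5 (BitOffset = 160), the
// CONCAT_VECTORS case selects operand B and reduces BitOffset to 32, so the
// loop ends with V = B and BroadcastIdx recomputed below as 1.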
14299 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14300 BroadcastIdx = BitOffset / NumEltBits;
14301
14302 // Do we need to bitcast the source to retrieve the original broadcast index?
14303 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14304
14305 // Check if this is a broadcast of a scalar. We special case lowering
14306 // for scalars so that we can more effectively fold with loads.
14307 // If the original value has a larger element type than the shuffle, the
14308 // broadcast element is in essence truncated. Make that explicit to ease
14309 // folding.
14310 if (BitCastSrc && VT.isInteger())
14311 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14312 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14313 return TruncBroadcast;
14314
14315 // Also check the simpler case, where we can directly reuse the scalar.
14316 if (!BitCastSrc &&
14317 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14318 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14319 V = V.getOperand(BroadcastIdx);
14320
14321 // If we can't broadcast from a register, check that the input is a load.
14322 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14323 return SDValue();
14324 } else if (ISD::isNormalLoad(V.getNode()) &&
14325 cast<LoadSDNode>(V)->isSimple()) {
14326 // We do not check for one-use of the vector load because a broadcast load
14327 // is expected to be a win for code size, register pressure, and possibly
14328 // uops even if the original vector load is not eliminated.
14329
14330 // Reduce the vector load and shuffle to a broadcasted scalar load.
14331 LoadSDNode *Ld = cast<LoadSDNode>(V);
14332 SDValue BaseAddr = Ld->getOperand(1);
14333 MVT SVT = VT.getScalarType();
14334 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14335 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14336 SDValue NewAddr =
14337 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14338
14339 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14340 // than MOVDDUP.
14341 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14342 if (Opcode == X86ISD::VBROADCAST) {
14343 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14344 SDValue Ops[] = {Ld->getChain(), NewAddr};
14345 V = DAG.getMemIntrinsicNode(
14346 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14347 DAG.getMachineFunction().getMachineMemOperand(
14348 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14349 DAG.makeEquivalentMemoryOrdering(Ld, V);
14350 return DAG.getBitcast(VT, V);
14351 }
14352 assert(SVT == MVT::f64 && "Unexpected VT!");
14353 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14354 DAG.getMachineFunction().getMachineMemOperand(
14355 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14356 DAG.makeEquivalentMemoryOrdering(Ld, V);
14357 } else if (!BroadcastFromReg) {
14358 // We can't broadcast from a vector register.
14359 return SDValue();
14360 } else if (BitOffset != 0) {
14361 // We can only broadcast from the zero-element of a vector register,
14362 // but it can be advantageous to broadcast from the zero-element of a
14363 // subvector.
14364 if (!VT.is256BitVector() && !VT.is512BitVector())
14365 return SDValue();
14366
14367 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14368 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14369 return SDValue();
14370
14371 // Only broadcast the zero-element of a 128-bit subvector.
14372 if ((BitOffset % 128) != 0)
14373 return SDValue();
14374
14375 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14376 "Unexpected bit-offset");
14377 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14378 "Unexpected vector size");
14379 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14380 V = extract128BitVector(V, ExtractIdx, DAG, DL);
14381 }
14382
14383 // On AVX we can use VBROADCAST directly for scalar sources.
14384 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14385 V = DAG.getBitcast(MVT::f64, V);
14386 if (Subtarget.hasAVX()) {
14387 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14388 return DAG.getBitcast(VT, V);
14389 }
14390 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14391 }
14392
14393 // If this is a scalar, do the broadcast on this type and bitcast.
14394 if (!V.getValueType().isVector()) {
14395 assert(V.getScalarValueSizeInBits() == NumEltBits &&
14396 "Unexpected scalar size");
14397 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14398 VT.getVectorNumElements());
14399 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14400 }
14401
14402 // We only support broadcasting from 128-bit vectors to minimize the
14403 // number of patterns we need to deal with in isel. So extract down to
14404 // 128-bits, removing as many bitcasts as possible.
14405 if (V.getValueSizeInBits() > 128)
14406 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14407
14408 // Otherwise cast V to a vector with the same element type as VT, but
14409 // possibly narrower than VT. Then perform the broadcast.
14410 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14411 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14412 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14413}
14414
14415// Check for whether we can use INSERTPS to perform the shuffle. We only use
14416// INSERTPS when the V1 elements are already in the correct locations
14417// because otherwise we can just always use two SHUFPS instructions which
14418// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
14419// perform INSERTPS if a single V1 element is out of place and all V2
14420// elements are zeroable.
14421static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14422 unsigned &InsertPSMask,
14423 const APInt &Zeroable,
14424 ArrayRef<int> Mask, SelectionDAG &DAG) {
14425 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14426 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14427 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14428
14429 // Attempt to match INSERTPS with one element from VA or VB being
14430 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14431 // are updated.
14432 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
14433 ArrayRef<int> CandidateMask) {
14434 unsigned ZMask = 0;
14435 int VADstIndex = -1;
14436 int VBDstIndex = -1;
14437 bool VAUsedInPlace = false;
14438
14439 for (int i = 0; i < 4; ++i) {
14440 // Synthesize a zero mask from the zeroable elements (includes undefs).
14441 if (Zeroable[i]) {
14442 ZMask |= 1 << i;
14443 continue;
14444 }
14445
14446 // Flag if we use any VA inputs in place.
14447 if (i == CandidateMask[i]) {
14448 VAUsedInPlace = true;
14449 continue;
14450 }
14451
14452 // We can only insert a single non-zeroable element.
14453 if (VADstIndex >= 0 || VBDstIndex >= 0)
14454 return false;
14455
14456 if (CandidateMask[i] < 4) {
14457 // VA input out of place for insertion.
14458 VADstIndex = i;
14459 } else {
14460 // VB input for insertion.
14461 VBDstIndex = i;
14462 }
14463 }
14464
14465 // Don't bother if we have no (non-zeroable) element for insertion.
14466 if (VADstIndex < 0 && VBDstIndex < 0)
14467 return false;
14468
14469 // Determine element insertion src/dst indices. The src index is from the
14470 // start of the inserted vector, not the start of the concatenated vector.
14471 unsigned VBSrcIndex = 0;
14472 if (VADstIndex >= 0) {
14473 // If we have a VA input out of place, we use VA as the V2 element
14474 // insertion and don't use the original V2 at all.
14475 VBSrcIndex = CandidateMask[VADstIndex];
14476 VBDstIndex = VADstIndex;
14477 VB = VA;
14478 } else {
14479 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
14480 }
14481
14482 // If no V1 inputs are used in place, then the result is created only from
14483 // the zero mask and the V2 insertion - so remove V1 dependency.
14484 if (!VAUsedInPlace)
14485 VA = DAG.getUNDEF(MVT::v4f32);
14486
14487 // Update V1, V2 and InsertPSMask accordingly.
14488 V1 = VA;
14489 V2 = VB;
14490
14491 // Insert the V2 element into the desired position.
14492 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
14493 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
14494 return true;
14495 };
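// Note (not from the source): this packs the INSERTPS immediate as
// bits [7:6] = source element of V2, bits [5:4] = destination element, and
// bits [3:0] = zero mask. For example VBSrcIndex = 2, VBDstIndex = 1 and
// ZMask = 0b1000 give 0x98: insert V2[2] into lane 1 and zero lane 3.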
14496
14497 if (matchAsInsertPS(V1, V2, Mask))
14498 return true;
14499
14500 // Commute and try again.
14501 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
14502 ShuffleVectorSDNode::commuteMask(CommutedMask);
14503 if (matchAsInsertPS(V2, V1, CommutedMask))
14504 return true;
14505
14506 return false;
14507}
14508
14509static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
14510 ArrayRef<int> Mask, const APInt &Zeroable,
14511 SelectionDAG &DAG) {
14512 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14513 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14514
14515 // Attempt to match the insertps pattern.
14516 unsigned InsertPSMask = 0;
14517 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
14518 return SDValue();
14519
14520 // Insert the V2 element into the desired position.
14521 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
14522 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
14523}
14524
14525/// Try to lower a shuffle as a permute of the inputs followed by an
14526/// UNPCK instruction.
14527///
14528/// This specifically targets cases where we end up with alternating between
14529/// the two inputs, and so can permute them into something that feeds a single
14530/// UNPCK instruction. Note that this routine only targets integer vectors
14531/// because for floating point vectors we have a generalized SHUFPS lowering
14532/// strategy that handles everything that doesn't *exactly* match an unpack,
14533/// making this clever lowering unnecessary.
14534static SDValue lowerShuffleAsPermuteAndUnpack(
14535 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14536 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14537 assert(!VT.isFloatingPoint() &&
14538 "This routine only supports integer vectors.");
14539 assert(VT.is128BitVector() &&
14540 "This routine only works on 128-bit vectors.");
14541 assert(!V2.isUndef() &&
14542 "This routine should only be used when blending two inputs.");
14543 assert(Mask.size() >= 2 && "Single element masks are invalid.");
14544
14545 int Size = Mask.size();
14546
14547 int NumLoInputs =
14548 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
14549 int NumHiInputs =
14550 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
14551
14552 bool UnpackLo = NumLoInputs >= NumHiInputs;
14553
14554 auto TryUnpack = [&](int ScalarSize, int Scale) {
14555 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
14556 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
14557
14558 for (int i = 0; i < Size; ++i) {
14559 if (Mask[i] < 0)
14560 continue;
14561
14562 // Each element of the unpack contains Scale elements from this mask.
14563 int UnpackIdx = i / Scale;
14564
14565 // We only handle the case where V1 feeds the first slots of the unpack.
14566 // We rely on canonicalization to ensure this is the case.
14567 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14568 return SDValue();
14569
14570 // Setup the mask for this input. The indexing is tricky as we have to
14571 // handle the unpack stride.
14572 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14573 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14574 Mask[i] % Size;
14575 }
14576
14577 // If we will have to shuffle both inputs to use the unpack, check whether
14578 // we can just unpack first and shuffle the result. If so, skip this unpack.
14579 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14580 !isNoopShuffleMask(V2Mask))
14581 return SDValue();
14582
14583 // Shuffle the inputs into place.
14584 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14585 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14586
14587 // Cast the inputs to the type we will use to unpack them.
14588 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14589 V1 = DAG.getBitcast(UnpackVT, V1);
14590 V2 = DAG.getBitcast(UnpackVT, V2);
14591
14592 // Unpack the inputs and cast the result back to the desired type.
14593 return DAG.getBitcast(
14594 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14595 UnpackVT, V1, V2));
14596 };
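// Illustrative example (not from the source): a v8i16 mask
// {0, 8, 2, 10, 4, 12, 6, 14} is handled with Scale = 1 by permuting each
// input so its even elements sit in its low half ({0, 2, 4, 6, ...}) and then
// emitting a single PUNPCKLWD to interleave the two permuted inputs.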
14597
14598 // We try each unpack from the largest to the smallest to try and find one
14599 // that fits this mask.
14600 int OrigScalarSize = VT.getScalarSizeInBits();
14601 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14602 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14603 return Unpack;
14604
14605 // If we're shuffling with a zero vector then we're better off not doing
14606 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14607 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14608 ISD::isBuildVectorAllZeros(V2.getNode()))
14609 return SDValue();
14610
14611 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14612 // initial unpack.
14613 if (NumLoInputs == 0 || NumHiInputs == 0) {
14614 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14615 "We have to have *some* inputs!");
14616 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14617
14618 // FIXME: We could consider the total complexity of the permute of each
14619 // possible unpacking. Or at the least we should consider how many
14620 // half-crossings are created.
14621 // FIXME: We could consider commuting the unpacks.
14622
14623 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14624 for (int i = 0; i < Size; ++i) {
14625 if (Mask[i] < 0)
14626 continue;
14627
14628 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14629
14630 PermMask[i] =
14631 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14632 }
14633 return DAG.getVectorShuffle(
14634 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14635 DL, VT, V1, V2),
14636 DAG.getUNDEF(VT), PermMask);
14637 }
14638
14639 return SDValue();
14640}
14641
14642/// Handle lowering of 2-lane 64-bit floating point shuffles.
14643///
14644/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14645/// support for floating point shuffles but not integer shuffles. These
14646/// instructions will incur a domain crossing penalty on some chips though so
14647/// it is better to avoid lowering through this for integer vectors where
14648/// possible.
14649static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14650 const APInt &Zeroable, SDValue V1, SDValue V2,
14651 const X86Subtarget &Subtarget,
14652 SelectionDAG &DAG) {
14653 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14654 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14655 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14656
14657 if (V2.isUndef()) {
14658 // Check for being able to broadcast a single element.
14659 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14660 Mask, Subtarget, DAG))
14661 return Broadcast;
14662
14663 // Straight shuffle of a single input vector. Simulate this by using the
14664 // single input as both of the "inputs" to this instruction.
14665 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14666
14667 if (Subtarget.hasAVX()) {
14668 // If we have AVX, we can use VPERMILPS which will allow folding a load
14669 // into the shuffle.
14670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14671 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14672 }
14673
14674 return DAG.getNode(
14675 X86ISD::SHUFP, DL, MVT::v2f64,
14676 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14677 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14678 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14679 }
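// Illustrative example (not from the source): for a single-input v2f64
// shuffle with mask {1, 1}, SHUFPDMask = 3, so with AVX this becomes
// VPERMILPD $3 (both result lanes take element 1 of V1).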
14680 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14681 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14682 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14683 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14684
14685 if (Subtarget.hasAVX2())
14686 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14687 return Extract;
14688
14689 // When loading a scalar and then shuffling it into a vector we can often do
14690 // the insertion cheaply.
14691 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14692 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14693 return Insertion;
14694 // Try inverting the insertion since for v2 masks it is easy to do and we
14695 // can't reliably sort the mask one way or the other.
14696 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14697 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14698 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14699 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14700 return Insertion;
14701
14702 // Try to use one of the special instruction patterns to handle two common
14703 // blend patterns if a zero-blend above didn't work.
14704 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14705 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14706 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14707 // We can either use a special instruction to load over the low double or
14708 // to move just the low double.
14709 return DAG.getNode(
14710 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14711 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14712
14713 if (Subtarget.hasSSE41())
14714 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14715 Zeroable, Subtarget, DAG))
14716 return Blend;
14717
14718 // Use dedicated unpack instructions for masks that match their pattern.
14719 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14720 return V;
14721
14722 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14723 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14724 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14725}
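// Illustrative aside, not part of the source above: a standalone sketch of the
// SHUFPD immediate formed in this function.  SHUFPD takes result lane 0 from
// its first source and result lane 1 from its second source; each immediate
// bit selects the low (0) or high (1) double of the corresponding source.  The
// single-input path reuses the same encoding by feeding V1 as both operands
// (or, on AVX, via the VPERMILPD immediate, which uses the same per-lane bits).
static unsigned computeShufpdImmSketch(int Mask0, int Mask1) {
  // Assumes a canonical two-input mask: Mask0 in [0, 1] indexes V1 and
  // Mask1 in [2, 3] indexes V2.  Example: {0, 3} -> bit0 = 0, bit1 = 1 -> 2.
  return unsigned(Mask0 == 1) | (unsigned((Mask1 - 2) == 1) << 1);
}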
14726
14727/// Handle lowering of 2-lane 64-bit integer shuffles.
14728///
14729/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14730/// the integer unit to minimize domain crossing penalties. However, for blends
14731/// it falls back to the floating point shuffle operation with appropriate bit
14732/// casting.
14733static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14734 const APInt &Zeroable, SDValue V1, SDValue V2,
14735 const X86Subtarget &Subtarget,
14736 SelectionDAG &DAG) {
14737 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14738 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14739 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14740
14741 if (V2.isUndef()) {
14742 // Check for being able to broadcast a single element.
14743 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14744 Mask, Subtarget, DAG))
14745 return Broadcast;
14746
14747 // Straight shuffle of a single input vector. For everything from SSE2
14748 // onward this has a single fast instruction with no scary immediates.
14749 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14750 V1 = DAG.getBitcast(MVT::v4i32, V1);
14751 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14752 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14753 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14754 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14755 return DAG.getBitcast(
14756 MVT::v2i64,
14757 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14758 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14759 }
14760 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14761 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14762 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14763 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14764
14765 if (Subtarget.hasAVX2())
14766 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14767 return Extract;
14768
14769 // Try to use shift instructions.
14770 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14771 Zeroable, Subtarget, DAG))
14772 return Shift;
14773
14774 // When loading a scalar and then shuffling it into a vector we can often do
14775 // the insertion cheaply.
14776 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14777 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14778 return Insertion;
14779 // Try inverting the insertion since for v2 masks it is easy to do and we
14780 // can't reliably sort the mask one way or the other.
14781 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14782 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14783 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14784 return Insertion;
14785
14786 // We have different paths for blend lowering, but they all must use the
14787 // *exact* same predicate.
14788 bool IsBlendSupported = Subtarget.hasSSE41();
14789 if (IsBlendSupported)
14790 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14791 Zeroable, Subtarget, DAG))
14792 return Blend;
14793
14794 // Use dedicated unpack instructions for masks that match their pattern.
14795 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14796 return V;
14797
14798 // Try to use byte rotation instructions.
14799 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14800 if (Subtarget.hasSSSE3()) {
14801 if (Subtarget.hasVLX())
14802 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14803 Subtarget, DAG))
14804 return Rotate;
14805
14806 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14807 Subtarget, DAG))
14808 return Rotate;
14809 }
14810
14811 // If we have direct support for blends, we should lower by decomposing into
14812 // a permute. That will be faster than the domain cross.
14813 if (IsBlendSupported)
14814 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14815 Subtarget, DAG);
14816
14817 // We implement this with SHUFPD which is pretty lame because it will likely
14818 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14819 // However, all the alternatives are still more cycles and newer chips don't
14820 // have this problem. It would be really nice if x86 had better shuffles here.
14821 V1 = DAG.getBitcast(MVT::v2f64, V1);
14822 V2 = DAG.getBitcast(MVT::v2f64, V2);
14823 return DAG.getBitcast(MVT::v2i64,
14824 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14825}
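// Illustrative aside, separate from the listing: the single-input v2i64 path
// above widens each 64-bit lane index m into the 32-bit lane pair {2m, 2m+1}
// and then encodes the resulting v4i32 mask as a PSHUFD immediate at two bits
// per destination lane.  The qword-swap mask {1, 0}, for example, widens to
// {2, 3, 0, 1} and encodes as 0x4E.
static unsigned widenV2MaskToPshufdImmSketch(int M0, int M1) {
  // Assumes M0 and M1 are defined lane indices in [0, 1]; undef lanes are
  // handled by the real lowering and omitted here for brevity.
  int W[4] = {M0 * 2, M0 * 2 + 1, M1 * 2, M1 * 2 + 1};
  return unsigned(W[0]) | (unsigned(W[1]) << 2) | (unsigned(W[2]) << 4) |
         (unsigned(W[3]) << 6);
}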
14826
14827/// Lower a vector shuffle using the SHUFPS instruction.
14828///
14829/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14830 /// It makes no assumptions about whether this is the *best* lowering; it simply
14831/// uses it.
14832static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14833 ArrayRef<int> Mask, SDValue V1,
14834 SDValue V2, SelectionDAG &DAG) {
14835 SDValue LowV = V1, HighV = V2;
14836 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14837 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14838
14839 if (NumV2Elements == 1) {
14840 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14841
14842 // Compute the index adjacent to V2Index and in the same half by toggling
14843 // the low bit.
14844 int V2AdjIndex = V2Index ^ 1;
14845
14846 if (Mask[V2AdjIndex] < 0) {
14847 // Handles all the cases where we have a single V2 element and an undef.
14848 // This will only ever happen in the high lanes because we commute the
14849 // vector otherwise.
14850 if (V2Index < 2)
14851 std::swap(LowV, HighV);
14852 NewMask[V2Index] -= 4;
14853 } else {
14854 // Handle the case where the V2 element ends up adjacent to a V1 element.
14855 // To make this work, blend them together as the first step.
14856 int V1Index = V2AdjIndex;
14857 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14858 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14859 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14860
14861 // Now proceed to reconstruct the final blend as we have the necessary
14862 // high or low half formed.
14863 if (V2Index < 2) {
14864 LowV = V2;
14865 HighV = V1;
14866 } else {
14867 HighV = V2;
14868 }
14869 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14870 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14871 }
14872 } else if (NumV2Elements == 2) {
14873 if (Mask[0] < 4 && Mask[1] < 4) {
14874 // Handle the easy case where we have V1 in the low lanes and V2 in the
14875 // high lanes.
14876 NewMask[2] -= 4;
14877 NewMask[3] -= 4;
14878 } else if (Mask[2] < 4 && Mask[3] < 4) {
14879 // We also handle the reversed case because this utility may get called
14880 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14881 // arrange things in the right direction.
14882 NewMask[0] -= 4;
14883 NewMask[1] -= 4;
14884 HighV = V1;
14885 LowV = V2;
14886 } else {
14887 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14888 // trying to place elements directly, just blend them and set up the final
14889 // shuffle to place them.
14890
14891 // The first two blend mask elements are for V1, the second two are for
14892 // V2.
14893 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14894 Mask[2] < 4 ? Mask[2] : Mask[3],
14895 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14896 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14897 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14898 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14899
14900 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14901 // a blend.
14902 LowV = HighV = V1;
14903 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14904 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14905 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14906 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14907 }
14908 } else if (NumV2Elements == 3) {
14909 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14910 // we can get here due to other paths (e.g repeated mask matching) that we
14911 // don't want to do another round of lowerVECTOR_SHUFFLE.
14912 ShuffleVectorSDNode::commuteMask(NewMask);
14913 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14914 }
14915 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14916 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14917}
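// Illustrative aside, separate from the listing: by the time the final SHUFP
// node above is built, NewMask has been rewritten so that every defined entry
// is in [0, 3], with result lanes 0-1 drawn from LowV and lanes 2-3 from
// HighV.  The immediate consumed by SHUFPS/PSHUFD packs one 2-bit selector per
// destination lane; a standalone sketch of that packing:
static unsigned packFourLaneImmSketch(const int NewMask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = NewMask[i] < 0 ? 0 : NewMask[i]; // treat undef (-1) as lane 0 here
    Imm |= unsigned(M & 0x3) << (2 * i);     // two selector bits per lane
  }
  return Imm;
}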
14918
14919/// Lower 4-lane 32-bit floating point shuffles.
14920///
14921/// Uses instructions exclusively from the floating point unit to minimize
14922/// domain crossing penalties, as these are sufficient to implement all v4f32
14923/// shuffles.
14924static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14925 const APInt &Zeroable, SDValue V1, SDValue V2,
14926 const X86Subtarget &Subtarget,
14927 SelectionDAG &DAG) {
14928 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14929 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14930 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14931
14932 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14933
14934 if (NumV2Elements == 0) {
14935 // Check for being able to broadcast a single element.
14936 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14937 Mask, Subtarget, DAG))
14938 return Broadcast;
14939
14940 // Use even/odd duplicate instructions for masks that match their pattern.
14941 if (Subtarget.hasSSE3()) {
14942 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14943 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14944 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14945 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14946 }
14947
14948 if (Subtarget.hasAVX()) {
14949 // If we have AVX, we can use VPERMILPS which will allow folding a load
14950 // into the shuffle.
14951 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14952 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14953 }
14954
14955 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14956 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14957 if (!Subtarget.hasSSE2()) {
14958 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14959 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14960 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14961 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14962 }
14963
14964 // Otherwise, use a straight shuffle of a single input vector. We pass the
14965 // input vector to both operands to simulate this with a SHUFPS.
14966 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14967 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14968 }
14969
14970 if (Subtarget.hasAVX2())
14971 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14972 return Extract;
14973
14974 // There are special ways we can lower some single-element blends. However, we
14975 // have custom ways we can lower more complex single-element blends below that
14976 // we defer to if both this and BLENDPS fail to match, so restrict this to
14977 // when the V2 input is targeting element 0 of the mask -- that is the fast
14978 // case here.
14979 if (NumV2Elements == 1 && Mask[0] >= 4)
14980 if (SDValue V = lowerShuffleAsElementInsertion(
14981 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14982 return V;
14983
14984 if (Subtarget.hasSSE41()) {
14985 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14986 Zeroable, Subtarget, DAG))
14987 return Blend;
14988
14989 // Use INSERTPS if we can complete the shuffle efficiently.
14990 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14991 return V;
14992
14993 if (!isSingleSHUFPSMask(Mask))
14994 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14995 V2, Mask, DAG))
14996 return BlendPerm;
14997 }
14998
14999 // Use low/high mov instructions. These are only valid in SSE1 because
15000 // otherwise they are widened to v2f64 and never get here.
15001 if (!Subtarget.hasSSE2()) {
15002 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15003 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15004 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15005 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15006 }
15007
15008 // Use dedicated unpack instructions for masks that match their pattern.
15009 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15010 return V;
15011
15012 // Otherwise fall back to a SHUFPS lowering strategy.
15013 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15014}
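// Illustrative aside, not taken from the source file: a simplified model of
// the undef-tolerant comparison behind the isShuffleEquivalent calls above
// (the real helper also receives V1/V2 so equivalent elements of the operands
// can match as well).  A candidate mask matches an expected pattern if every
// defined lane agrees; undef (-1) lanes match anything.
static bool masksEquivalentSketch(const int Mask[4], const int Expected[4]) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false;
  return true;
}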
15015
15016/// Lower 4-lane i32 vector shuffles.
15017///
15018/// We try to handle these with integer-domain shuffles where we can, but for
15019/// blends we use the floating point domain blend instructions.
15020static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15021 const APInt &Zeroable, SDValue V1, SDValue V2,
15022 const X86Subtarget &Subtarget,
15023 SelectionDAG &DAG) {
15024 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15025 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15026 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15027
15028 // Whenever we can lower this as a zext, that instruction is strictly faster
15029 // than any alternative. It also allows us to fold memory operands into the
15030 // shuffle in many cases.
15031 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15032 Zeroable, Subtarget, DAG))
15033 return ZExt;
15034
15035 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15036
15037 if (NumV2Elements == 0) {
15038 // Try to use broadcast unless the mask only has one non-undef element.
15039 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15040 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15041 Mask, Subtarget, DAG))
15042 return Broadcast;
15043 }
15044
15045 // Straight shuffle of a single input vector. For everything from SSE2
15046 // onward this has a single fast instruction with no scary immediates.
15047 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15048 // but we aren't actually going to use the UNPCK instruction because doing
15049 // so prevents folding a load into this instruction or making a copy.
15050 const int UnpackLoMask[] = {0, 0, 1, 1};
15051 const int UnpackHiMask[] = {2, 2, 3, 3};
15052 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15053 Mask = UnpackLoMask;
15054 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15055 Mask = UnpackHiMask;
15056
15057 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15058 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15059 }
15060
15061 if (Subtarget.hasAVX2())
15062 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15063 return Extract;
15064
15065 // Try to use shift instructions.
15066 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
15067 Zeroable, Subtarget, DAG))
15068 return Shift;
15069
15070 // There are special ways we can lower some single-element blends.
15071 if (NumV2Elements == 1)
15072 if (SDValue V = lowerShuffleAsElementInsertion(
15073 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15074 return V;
15075
15076 // We have different paths for blend lowering, but they all must use the
15077 // *exact* same predicate.
15078 bool IsBlendSupported = Subtarget.hasSSE41();
15079 if (IsBlendSupported)
15080 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15081 Zeroable, Subtarget, DAG))
15082 return Blend;
15083
15084 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15085 Zeroable, Subtarget, DAG))
15086 return Masked;
15087
15088 // Use dedicated unpack instructions for masks that match their pattern.
15089 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15090 return V;
15091
15092 // Try to use byte rotation instructions.
15093 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15094 if (Subtarget.hasSSSE3()) {
15095 if (Subtarget.hasVLX())
15096 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15097 Subtarget, DAG))
15098 return Rotate;
15099
15100 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15101 Subtarget, DAG))
15102 return Rotate;
15103 }
15104
15105 // Assume that a single SHUFPS is faster than an alternative sequence of
15106 // multiple instructions (even if the CPU has a domain penalty).
15107 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15108 if (!isSingleSHUFPSMask(Mask)) {
15109 // If we have direct support for blends, we should lower by decomposing into
15110 // a permute. That will be faster than the domain cross.
15111 if (IsBlendSupported)
15112 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15113 Subtarget, DAG);
15114
15115 // Try to lower by permuting the inputs into an unpack instruction.
15116 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15117 Mask, Subtarget, DAG))
15118 return Unpack;
15119 }
15120
15121 // We implement this with SHUFPS because it can blend from two vectors.
15122 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15123 // up the inputs, bypassing domain shift penalties that we would incur if we
15124 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15125 // relevant.
15126 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15127 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15128 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15129 return DAG.getBitcast(MVT::v4i32, ShufPS);
15130}
15131
15132/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15133/// shuffle lowering, and the most complex part.
15134///
15135/// The lowering strategy is to try to form pairs of input lanes which are
15136/// targeted at the same half of the final vector, and then use a dword shuffle
15137/// to place them onto the right half, and finally unpack the paired lanes into
15138/// their final position.
15139///
15140/// The exact breakdown of how to form these dword pairs and align them on the
15141/// correct sides is really tricky. See the comments within the function for
15142/// more of the details.
15143///
15144/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15145/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15146/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15147/// vector, form the analogous 128-bit 8-element Mask.
15148static SDValue lowerV8I16GeneralSingleInputShuffle(
15149 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15150 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15151 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15152 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15153
15154 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15155 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15156 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15157
15158 // Attempt to directly match PSHUFLW or PSHUFHW.
15159 if (isUndefOrInRange(LoMask, 0, 4) &&
15160 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15161 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15162 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15163 }
15164 if (isUndefOrInRange(HiMask, 4, 8) &&
15165 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15166 for (int i = 0; i != 4; ++i)
15167 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15168 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15169 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15170 }
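// Illustrative aside, not taken from the source file: a scalar model of the
// two instructions matched above.  PSHUFLW reorders the low four words, one
// 2-bit selector per lane, and copies the high quadword through unchanged;
// PSHUFHW is the mirror image, and its selectors index within the high half,
// which is why 4 is subtracted from HiMask above before the immediate is
// formed.
auto PshuflwModelSketch = [](unsigned short Out[8], const unsigned short In[8],
                             unsigned Imm) {
  for (int i = 0; i != 4; ++i)
    Out[i] = In[(Imm >> (2 * i)) & 0x3]; // low words are shuffled
  for (int i = 4; i != 8; ++i)
    Out[i] = In[i];                      // high words pass through untouched
};
(void)PshuflwModelSketch;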
15171
15172 SmallVector<int, 4> LoInputs;
15173 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15174 array_pod_sort(LoInputs.begin(), LoInputs.end());
15175 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15176 SmallVector<int, 4> HiInputs;
15177 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15178 array_pod_sort(HiInputs.begin(), HiInputs.end());
15179 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15180 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15181 int NumHToL = LoInputs.size() - NumLToL;
15182 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15183 int NumHToH = HiInputs.size() - NumLToH;
15184 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15185 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15186 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15187 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15188
15189 // If we are shuffling values from one half - check how many different DWORD
15190 // pairs we need to create. If only 1 or 2 then we can perform this as a
15191 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15192 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15193 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15194 V = DAG.getNode(ShufWOp, DL, VT, V,
15195 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15196 V = DAG.getBitcast(PSHUFDVT, V);
15197 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15198 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15199 return DAG.getBitcast(VT, V);
15200 };
15201
15202 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15203 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15204 SmallVector<std::pair<int, int>, 4> DWordPairs;
15205 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15206
15207 // Collect the different DWORD pairs.
15208 for (int DWord = 0; DWord != 4; ++DWord) {
15209 int M0 = Mask[2 * DWord + 0];
15210 int M1 = Mask[2 * DWord + 1];
15211 M0 = (M0 >= 0 ? M0 % 4 : M0);
15212 M1 = (M1 >= 0 ? M1 % 4 : M1);
15213 if (M0 < 0 && M1 < 0)
15214 continue;
15215
15216 bool Match = false;
15217 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15218 auto &DWordPair = DWordPairs[j];
15219 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15220 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15221 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15222 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15223 PSHUFDMask[DWord] = DOffset + j;
15224 Match = true;
15225 break;
15226 }
15227 }
15228 if (!Match) {
15229 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15230 DWordPairs.push_back(std::make_pair(M0, M1));
15231 }
15232 }
15233
15234 if (DWordPairs.size() <= 2) {
15235 DWordPairs.resize(2, std::make_pair(-1, -1));
15236 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15237 DWordPairs[1].first, DWordPairs[1].second};
15238 if ((NumHToL + NumHToH) == 0)
15239 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15240 if ((NumLToL + NumLToH) == 0)
15241 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15242 }
15243 }
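// Illustrative aside, not taken from the source file: a worked instance of the
// early-out above.  For the single-input mask {2,3, 0,1, 0,1, 2,3} every input
// comes from the low half and only two distinct word pairs occur, (2,3) and
// (0,1), so PSHUFHalfMask = {2,3,0,1} gathers them into the low dwords and
// PSHUFDMask = {0,1,1,0} then places those dwords.
auto DWordPairExampleSketch = [] {
  unsigned short Vals[8] = {10, 11, 12, 13, 14, 15, 16, 17}; // words a..h
  // Step 1: PSHUFLW with half mask {2,3,0,1}.
  const int Half[4] = {2, 3, 0, 1};
  unsigned short Tmp[8];
  for (int i = 0; i != 4; ++i)
    Tmp[i] = Vals[Half[i]];
  for (int i = 4; i != 8; ++i)
    Tmp[i] = Vals[i];
  // Step 2: PSHUFD with dword mask {0,1,1,0} over the word pairs of Tmp.
  const int DWordMask[4] = {0, 1, 1, 0};
  unsigned short Res[8];
  for (int d = 0; d != 4; ++d) {
    Res[2 * d + 0] = Tmp[2 * DWordMask[d] + 0];
    Res[2 * d + 1] = Tmp[2 * DWordMask[d] + 1];
  }
  // Res is now {12,13, 10,11, 10,11, 12,13}, i.e. Vals[M] for each mask entry.
  (void)Res;
};
(void)DWordPairExampleSketch;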
15244
15245 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15246 // such inputs we can swap two of the dwords across the half mark and end up
15247 // with <=2 inputs to each half in each half. Once there, we can fall through
15248 // to the generic code below. For example:
15249 //
15250 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15251 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15252 //
15253 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15254 // and an existing 2-into-2 on the other half. In this case we may have to
15255 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15256 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15257 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15258 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15259 // half than the one we target for fixing) will be fixed when we re-enter this
15260 // path. We will also combine away, into a single instruction, any sequence
15261 // of PSHUFD instructions that results. Here is an example of the tricky case:
15262 //
15263 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15264 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15265 //
15266 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15267 //
15268 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15269 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15270 //
15271 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15272 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15273 //
15274 // The result is fine to be handled by the generic logic.
15275 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15276 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15277 int AOffset, int BOffset) {
15278 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15279 "Must call this with A having 3 or 1 inputs from the A half.");
15280 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15281 "Must call this with B having 1 or 3 inputs from the B half.");
15282 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15283 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15284
15285 bool ThreeAInputs = AToAInputs.size() == 3;
15286
15287 // Compute the index of dword with only one word among the three inputs in
15288 // a half by taking the sum of the half with three inputs and subtracting
15289 // the sum of the actual three inputs. The difference is the remaining
15290 // slot.
15291 int ADWord = 0, BDWord = 0;
15292 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15293 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15294 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15295 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15296 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15297 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15298 int TripleNonInputIdx =
15299 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15300 TripleDWord = TripleNonInputIdx / 2;
15301
15302 // We use xor with one to compute the adjacent DWord to whichever one the
15303 // OneInput is in.
15304 OneInputDWord = (OneInput / 2) ^ 1;
15305
15306 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15307 // and BToA inputs. If there is also such a problem with the BToB and AToB
15308 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15309 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15310 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15311 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15312 // Compute how many inputs will be flipped by swapping these DWords. We
15313 // need
15314 // to balance this to ensure we don't form a 3-1 shuffle in the other
15315 // half.
15316 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15317 llvm::count(AToBInputs, 2 * ADWord + 1);
15318 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15319 llvm::count(BToBInputs, 2 * BDWord + 1);
15320 if ((NumFlippedAToBInputs == 1 &&
15321 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15322 (NumFlippedBToBInputs == 1 &&
15323 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15324 // We choose whether to fix the A half or B half based on whether that
15325 // half has zero flipped inputs. At zero, we may not be able to fix it
15326 // with that half. We also bias towards fixing the B half because that
15327 // will more commonly be the high half, and we have to bias one way.
15328 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15329 ArrayRef<int> Inputs) {
15330 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15331 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15332 // Determine whether the free index is in the flipped dword or the
15333 // unflipped dword based on where the pinned index is. We use this bit
15334 // in an xor to conditionally select the adjacent dword.
15335 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15336 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15337 if (IsFixIdxInput == IsFixFreeIdxInput)
15338 FixFreeIdx += 1;
15339 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15340 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15341 "We need to be changing the number of flipped inputs!");
15342 int PSHUFHalfMask[] = {0, 1, 2, 3};
15343 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15344 V = DAG.getNode(
15345 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15346 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15347 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15348
15349 for (int &M : Mask)
15350 if (M >= 0 && M == FixIdx)
15351 M = FixFreeIdx;
15352 else if (M >= 0 && M == FixFreeIdx)
15353 M = FixIdx;
15354 };
15355 if (NumFlippedBToBInputs != 0) {
15356 int BPinnedIdx =
15357 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15358 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15359 } else {
15360 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15361 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15362 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15363 }
15364 }
15365 }
15366
15367 int PSHUFDMask[] = {0, 1, 2, 3};
15368 PSHUFDMask[ADWord] = BDWord;
15369 PSHUFDMask[BDWord] = ADWord;
15370 V = DAG.getBitcast(
15371 VT,
15372 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15373 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15374
15375 // Adjust the mask to match the new locations of A and B.
15376 for (int &M : Mask)
15377 if (M >= 0 && M/2 == ADWord)
15378 M = 2 * BDWord + M % 2;
15379 else if (M >= 0 && M/2 == BDWord)
15380 M = 2 * ADWord + M % 2;
15381
15382 // Recurse back into this routine to re-compute state now that this isn't
15383 // a 3 and 1 problem.
15384 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15385 };
15386 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15387 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15388 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15389 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15390
15391 // At this point there are at most two inputs to the low and high halves from
15392 // each half. That means the inputs can always be grouped into dwords and
15393 // those dwords can then be moved to the correct half with a dword shuffle.
15394 // We use at most one low and one high word shuffle to collect these paired
15395 // inputs into dwords, and finally a dword shuffle to place them.
15396 int PSHUFLMask[4] = {-1, -1, -1, -1};
15397 int PSHUFHMask[4] = {-1, -1, -1, -1};
15398 int PSHUFDMask[4] = {-1, -1, -1, -1};
15399
15400 // First fix the masks for all the inputs that are staying in their
15401 // original halves. This will then dictate the targets of the cross-half
15402 // shuffles.
15403 auto fixInPlaceInputs =
15404 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15405 MutableArrayRef<int> SourceHalfMask,
15406 MutableArrayRef<int> HalfMask, int HalfOffset) {
15407 if (InPlaceInputs.empty())
15408 return;
15409 if (InPlaceInputs.size() == 1) {
15410 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15411 InPlaceInputs[0] - HalfOffset;
15412 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15413 return;
15414 }
15415 if (IncomingInputs.empty()) {
15416 // Just fix all of the in place inputs.
15417 for (int Input : InPlaceInputs) {
15418 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15419 PSHUFDMask[Input / 2] = Input / 2;
15420 }
15421 return;
15422 }
15423
15424 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15425 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15426 InPlaceInputs[0] - HalfOffset;
15427 // Put the second input next to the first so that they are packed into
15428 // a dword. We find the adjacent index by toggling the low bit.
15429 int AdjIndex = InPlaceInputs[0] ^ 1;
15430 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15431 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15432 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15433 };
15434 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15435 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15436
15437 // Now gather the cross-half inputs and place them into a free dword of
15438 // their target half.
15439 // FIXME: This operation could almost certainly be simplified dramatically to
15440 // look more like the 3-1 fixing operation.
15441 auto moveInputsToRightHalf = [&PSHUFDMask](
15442 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15443 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15444 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15445 int DestOffset) {
15446 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15447 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15448 };
15449 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15450 int Word) {
15451 int LowWord = Word & ~1;
15452 int HighWord = Word | 1;
15453 return isWordClobbered(SourceHalfMask, LowWord) ||
15454 isWordClobbered(SourceHalfMask, HighWord);
15455 };
15456
15457 if (IncomingInputs.empty())
15458 return;
15459
15460 if (ExistingInputs.empty()) {
15461 // Map any dwords with inputs from them into the right half.
15462 for (int Input : IncomingInputs) {
15463 // If the source half mask maps over the inputs, turn those into
15464 // swaps and use the swapped lane.
15465 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15466 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15467 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15468 Input - SourceOffset;
15469 // We have to swap the uses in our half mask in one sweep.
15470 for (int &M : HalfMask)
15471 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15472 M = Input;
15473 else if (M == Input)
15474 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15475 } else {
15476 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15477 Input - SourceOffset &&
15478 "Previous placement doesn't match!");
15479 }
15480 // Note that this correctly re-maps both when we do a swap and when
15481 // we observe the other side of the swap above. We rely on that to
15482 // avoid swapping the members of the input list directly.
15483 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15484 }
15485
15486 // Map the input's dword into the correct half.
15487 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15488 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15489 else
15490 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
15491 Input / 2 &&
15492 "Previous placement doesn't match!");
15493 }
15494
15495 // And just directly shift any other-half mask elements to be same-half
15496 // as we will have mirrored the dword containing the element into the
15497 // same position within that half.
15498 for (int &M : HalfMask)
15499 if (M >= SourceOffset && M < SourceOffset + 4) {
15500 M = M - SourceOffset + DestOffset;
15501 assert(M >= 0 && "This should never wrap below zero!");
15502 }
15503 return;
15504 }
15505
15506 // Ensure we have the input in a viable dword of its current half. This
15507 // is particularly tricky because the original position may be clobbered
15508 // by inputs being moved and *staying* in that half.
15509 if (IncomingInputs.size() == 1) {
15510 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15511 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15512 SourceOffset;
15513 SourceHalfMask[InputFixed - SourceOffset] =
15514 IncomingInputs[0] - SourceOffset;
15515 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15516 InputFixed);
15517 IncomingInputs[0] = InputFixed;
15518 }
15519 } else if (IncomingInputs.size() == 2) {
15520 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15521 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15522 // We have two non-adjacent or clobbered inputs we need to extract from
15523 // the source half. To do this, we need to map them into some adjacent
15524 // dword slot in the source mask.
15525 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15526 IncomingInputs[1] - SourceOffset};
15527
15528 // If there is a free slot in the source half mask adjacent to one of
15529 // the inputs, place the other input in it. We use (Index XOR 1) to
15530 // compute an adjacent index.
15531 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15532 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15533 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15534 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15535 InputsFixed[1] = InputsFixed[0] ^ 1;
15536 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15537 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15538 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15539 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15540 InputsFixed[0] = InputsFixed[1] ^ 1;
15541 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15542 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15543 // The two inputs are in the same DWord but it is clobbered and the
15544 // adjacent DWord isn't used at all. Move both inputs to the free
15545 // slot.
15546 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15547 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15548 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15549 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
15550 } else {
15551 // The only way we hit this point is if there is no clobbering
15552 // (because there are no off-half inputs to this half) and there is no
15553 // free slot adjacent to one of the inputs. In this case, we have to
15554 // swap an input with a non-input.
15555 for (int i = 0; i < 4; ++i)
15556 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
15557 "We can't handle any clobbers here!");
15558 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15559 "Cannot have adjacent inputs here!");
15560
15561 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15562 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15563
15564 // We also have to update the final source mask in this case because
15565 // it may need to undo the above swap.
15566 for (int &M : FinalSourceHalfMask)
15567 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15568 M = InputsFixed[1] + SourceOffset;
15569 else if (M == InputsFixed[1] + SourceOffset)
15570 M = (InputsFixed[0] ^ 1) + SourceOffset;
15571
15572 InputsFixed[1] = InputsFixed[0] ^ 1;
15573 }
15574
15575 // Point everything at the fixed inputs.
15576 for (int &M : HalfMask)
15577 if (M == IncomingInputs[0])
15578 M = InputsFixed[0] + SourceOffset;
15579 else if (M == IncomingInputs[1])
15580 M = InputsFixed[1] + SourceOffset;
15581
15582 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15583 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15584 }
15585 } else {
15586 llvm_unreachable("Unhandled input size!");
15587 }
15588
15589 // Now hoist the DWord down to the right half.
15590 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15591 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15592 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15593 for (int &M : HalfMask)
15594 for (int Input : IncomingInputs)
15595 if (M == Input)
15596 M = FreeDWord * 2 + Input % 2;
15597 };
15598 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15599 /*SourceOffset*/ 4, /*DestOffset*/ 0);
15600 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15601 /*SourceOffset*/ 0, /*DestOffset*/ 4);
15602
15603 // Now enact all the shuffles we've computed to move the inputs into their
15604 // target half.
15605 if (!isNoopShuffleMask(PSHUFLMask))
15606 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15607 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15608 if (!isNoopShuffleMask(PSHUFHMask))
15609 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15610 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15611 if (!isNoopShuffleMask(PSHUFDMask))
15612 V = DAG.getBitcast(
15613 VT,
15614 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15615 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15616
15617 // At this point, each half should contain all its inputs, and we can then
15618 // just shuffle them into their final position.
15619 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15620 "Failed to lift all the high half inputs to the low mask!");
15621 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15622 "Failed to lift all the low half inputs to the high mask!");
15623
15624 // Do a half shuffle for the low mask.
15625 if (!isNoopShuffleMask(LoMask))
15626 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15627 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15628
15629 // Do a half shuffle with the high mask after shifting its values down.
15630 for (int &M : HiMask)
15631 if (M >= 0)
15632 M -= 4;
15633 if (!isNoopShuffleMask(HiMask))
15634 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15635 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15636
15637 return V;
15638}
15639
15640/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15641/// blend if only one input is used.
15642static SDValue lowerShuffleAsBlendOfPSHUFBs(
15643 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15644 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15645 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15646 "Lane crossing shuffle masks not supported");
15647
15648 int NumBytes = VT.getSizeInBits() / 8;
15649 int Size = Mask.size();
15650 int Scale = NumBytes / Size;
15651
15652 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15653 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15654 V1InUse = false;
15655 V2InUse = false;
15656
15657 for (int i = 0; i < NumBytes; ++i) {
15658 int M = Mask[i / Scale];
15659 if (M < 0)
15660 continue;
15661
15662 const int ZeroMask = 0x80;
15663 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15664 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15665 if (Zeroable[i / Scale])
15666 V1Idx = V2Idx = ZeroMask;
15667
15668 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15669 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15670 V1InUse |= (ZeroMask != V1Idx);
15671 V2InUse |= (ZeroMask != V2Idx);
15672 }
15673
15674 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15675 if (V1InUse)
15676 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15677 DAG.getBuildVector(ShufVT, DL, V1Mask));
15678 if (V2InUse)
15679 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15680 DAG.getBuildVector(ShufVT, DL, V2Mask));
15681
15682 // If we need shuffled inputs from both, blend the two.
15683 SDValue V;
15684 if (V1InUse && V2InUse)
15685 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15686 else
15687 V = V1InUse ? V1 : V2;
15688
15689 // Cast the result back to the correct type.
15690 return DAG.getBitcast(VT, V);
15691}
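As a rough scalar model of the node sequence built above (a sketch only, assuming 16-byte operands; pshufb and blendOfPSHUFBs are hypothetical helpers, not part of the file): each output byte is pulled from its source by a per-byte index, any index with bit 7 set (the 0x80 ZeroMask) yields zero, and the two shuffled vectors are combined with OR when both inputs are live.

#include <array>
#include <cstdint>

using Bytes16 = std::array<uint8_t, 16>;

// PSHUFB: each output byte selects a source byte by index; bit 7 set -> zero.
static Bytes16 pshufb(const Bytes16 &V, const Bytes16 &M) {
  Bytes16 R{};
  for (int i = 0; i < 16; ++i)
    R[i] = (M[i] & 0x80) ? 0 : V[M[i] & 0x0F];
  return R;
}

// Shuffle each input with its own byte mask, then OR the results together,
// mirroring the ISD::OR node used when both inputs are in use.
static Bytes16 blendOfPSHUFBs(const Bytes16 &V1, const Bytes16 &V1Mask,
                              const Bytes16 &V2, const Bytes16 &V2Mask) {
  Bytes16 A = pshufb(V1, V1Mask), B = pshufb(V2, V2Mask), R{};
  for (int i = 0; i < 16; ++i)
    R[i] = A[i] | B[i]; // each byte is non-zero in at most one of A and B
  return R;
}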
15692
15693/// Generic lowering of 8-lane i16 shuffles.
15694///
15695/// This handles both single-input shuffles and combined shuffle/blends with
15696/// two inputs. The single input shuffles are immediately delegated to
15697/// a dedicated lowering routine.
15698///
15699/// The blends are lowered in one of three fundamental ways. If there are few
15700/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15701/// of the input is significantly cheaper when lowered as an interleaving of
15702/// the two inputs, try to interleave them. Otherwise, blend the low and high
15703/// halves of the inputs separately (making them have relatively few inputs)
15704/// and then concatenate them.
15705static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15706 const APInt &Zeroable, SDValue V1, SDValue V2,
15707 const X86Subtarget &Subtarget,
15708 SelectionDAG &DAG) {
15709 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15710 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15711 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15712
15713 // Whenever we can lower this as a zext, that instruction is strictly faster
15714 // than any alternative.
15715 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15716 Zeroable, Subtarget, DAG))
15717 return ZExt;
15718
15719 // Try to lower using a truncation.
15720 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15721 Subtarget, DAG))
15722 return V;
15723
15724 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15725
15726 if (NumV2Inputs == 0) {
15727 // Try to use shift instructions.
15728 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15729 Zeroable, Subtarget, DAG))
15730 return Shift;
15731
15732 // Check for being able to broadcast a single element.
15733 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15734 Mask, Subtarget, DAG))
15735 return Broadcast;
15736
15737 // Try to use bit rotation instructions.
15738 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15739 Subtarget, DAG))
15740 return Rotate;
15741
15742 // Use dedicated unpack instructions for masks that match their pattern.
15743 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15744 return V;
15745
15746 // Use dedicated pack instructions for masks that match their pattern.
15747 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15748 Subtarget))
15749 return V;
15750
15751 // Try to use byte rotation instructions.
15752 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15753 Subtarget, DAG))
15754 return Rotate;
15755
15756 // Make a copy of the mask so it can be modified.
15757 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15758 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15759 Subtarget, DAG);
15760 }
15761
15762 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15763 "All single-input shuffles should be canonicalized to be V1-input "
15764 "shuffles.");
15765
15766 // Try to use shift instructions.
15767 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15768 Zeroable, Subtarget, DAG))
15769 return Shift;
15770
15771 // See if we can use SSE4A Extraction / Insertion.
15772 if (Subtarget.hasSSE4A())
15773 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15774 Zeroable, DAG))
15775 return V;
15776
15777 // There are special ways we can lower some single-element blends.
15778 if (NumV2Inputs == 1)
15779 if (SDValue V = lowerShuffleAsElementInsertion(
15780 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15781 return V;
15782
15783 // We have different paths for blend lowering, but they all must use the
15784 // *exact* same predicate.
15785 bool IsBlendSupported = Subtarget.hasSSE41();
15786 if (IsBlendSupported)
15787 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15788 Zeroable, Subtarget, DAG))
15789 return Blend;
15790
15791 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15792 Zeroable, Subtarget, DAG))
15793 return Masked;
15794
15795 // Use dedicated unpack instructions for masks that match their pattern.
15796 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15797 return V;
15798
15799 // Use dedicated pack instructions for masks that match their pattern.
15800 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15801 Subtarget))
15802 return V;
15803
15804 // Try to lower using a truncation.
15805 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15806 Subtarget, DAG))
15807 return V;
15808
15809 // Try to use byte rotation instructions.
15810 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15811 Subtarget, DAG))
15812 return Rotate;
15813
15814 if (SDValue BitBlend =
15815 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15816 return BitBlend;
15817
15818 // Try to use byte shift instructions to mask.
15819 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15820 Zeroable, Subtarget, DAG))
15821 return V;
15822
15823 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15824 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15825 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15826 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
15827 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15828 !Subtarget.hasVLX()) {
15829 // Check if this is part of a 256-bit vector truncation.
15830 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
15831 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15832 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
15833 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
15834 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
15835 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
15836 DAG.getTargetConstant(0xEE, DL, MVT::i8));
15837 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
15838 V1 = extract128BitVector(V1V2, 0, DAG, DL);
15839 V2 = extract128BitVector(V1V2, 4, DAG, DL);
15840 } else {
15841 SmallVector<SDValue, 4> DWordClearOps(4,
15842 DAG.getConstant(0, DL, MVT::i32));
15843 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15844 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15845 SDValue DWordClearMask =
15846 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15847 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15848 DWordClearMask);
15849 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15850 DWordClearMask);
15851 }
15852 // Now pack things back together.
15853 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15854 if (NumEvenDrops == 2) {
15855 Result = DAG.getBitcast(MVT::v4i32, Result);
15856 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15857 }
15858 return Result;
15859 }
15860
15861 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
15862 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
15863 if (NumOddDrops == 1) {
15864 bool HasSSE41 = Subtarget.hasSSE41();
15865 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15866 DAG.getBitcast(MVT::v4i32, V1),
15867 DAG.getTargetConstant(16, DL, MVT::i8));
15868 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15869 DAG.getBitcast(MVT::v4i32, V2),
15870 DAG.getTargetConstant(16, DL, MVT::i8));
15871 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
15872 MVT::v8i16, V1, V2);
15873 }
15874
15875 // Try to lower by permuting the inputs into an unpack instruction.
15876 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15877 Mask, Subtarget, DAG))
15878 return Unpack;
15879
15880 // If we can't directly blend but can use PSHUFB, that will be better as it
15881 // can both shuffle and set up the inefficient blend.
15882 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15883 bool V1InUse, V2InUse;
15884 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15885 Zeroable, DAG, V1InUse, V2InUse);
15886 }
15887
15888 // We can always bit-blend if we have to so the fallback strategy is to
15889 // decompose into single-input permutes and blends/unpacks.
15890 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15891 Mask, Subtarget, DAG);
15892}
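The compaction path above masks each 32-bit lane down to its low 16 bits (DWordClearMask) and then packs with X86ISD::PACKUS on dword inputs, i.e. PACKUSDW. A scalar model of that 128-bit pack (a sketch only; packusdw is a hypothetical helper) showing why the preceding AND guarantees no saturation clipping:

#include <array>
#include <cstdint>

// Pack two v4i32 inputs into one v8i16 as PACKUSDW does: inputs are treated
// as signed dwords and clamped to the unsigned word range [0, 0xFFFF].
static std::array<uint16_t, 8> packusdw(const std::array<uint32_t, 4> &A,
                                        const std::array<uint32_t, 4> &B) {
  auto Sat = [](uint32_t V) -> uint16_t {
    int32_t S = static_cast<int32_t>(V);
    return S < 0 ? 0 : (S > 0xFFFF ? 0xFFFF : static_cast<uint16_t>(S));
  };
  std::array<uint16_t, 8> R{};
  for (int i = 0; i < 4; ++i) {
    R[i] = Sat(A[i]);     // low half of the result comes from the first input
    R[i + 4] = Sat(B[i]); // high half from the second input
  }
  return R;
}
// After AND'ing with DWordClearMask every dword is already <= 0xFFFF, so the
// pack simply keeps the selected 16-bit element from each dword lane.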
15893
15894/// Lower 8-lane 16-bit floating point shuffles.
15895static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15896 const APInt &Zeroable, SDValue V1, SDValue V2,
15897 const X86Subtarget &Subtarget,
15898 SelectionDAG &DAG) {
15899 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15900 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15901 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15902 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15903
15904 if (NumV2Elements == 0) {
15905 // Check for being able to broadcast a single element.
15906 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15907 Mask, Subtarget, DAG))
15908 return Broadcast;
15909 }
15910 if (NumV2Elements == 1 && Mask[0] >= 8)
15911 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask,
15912 Zeroable, Subtarget, DAG))
15913 return V;
15914
15915 V1 = DAG.getBitcast(MVT::v8i16, V1);
15916 V2 = DAG.getBitcast(MVT::v8i16, V2);
15917 return DAG.getBitcast(MVT::v8f16,
15918 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15919}
15920
15921// Lowers unary/binary shuffle as VPERMV/VPERMV3; for non-VLX targets,
15922// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15923// the active subvector is extracted.
15924static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15925 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15926 const X86Subtarget &Subtarget,
15927 SelectionDAG &DAG) {
15928 MVT MaskVT = VT.changeTypeToInteger();
15929 SDValue MaskNode;
15930 MVT ShuffleVT = VT;
15931 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15932 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15933 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15934 ShuffleVT = V1.getSimpleValueType();
15935
15936 // Adjust mask to correct indices for the second input.
15937 int NumElts = VT.getVectorNumElements();
15938 unsigned Scale = 512 / VT.getSizeInBits();
15939 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15940 for (int &M : AdjustedMask)
15941 if (NumElts <= M)
15942 M += (Scale - 1) * NumElts;
15943 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15944 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15945 } else {
15946 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15947 }
15948
15949 SDValue Result;
15950 if (V2.isUndef())
15951 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15952 else
15953 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15954
15955 if (VT != ShuffleVT)
15956 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15957
15958 return Result;
15959}
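The mask adjustment above compensates for padding both operands to 512 bits: indices that referred to V2 must be shifted past the undef padding appended to V1. A small sketch of just that adjustment (illustrative; widenPermv3Mask is a hypothetical stand-in for the AdjustedMask loop):

#include <vector>

// Shift second-operand indices once V1 and V2 are each widened by Scale.
static std::vector<int> widenPermv3Mask(const std::vector<int> &Mask,
                                        int NumElts, int Scale) {
  std::vector<int> Adjusted(Mask);
  for (int &M : Adjusted)
    if (M >= NumElts)             // entry selects from V2
      M += (Scale - 1) * NumElts; // skip the padding appended to V1
  return Adjusted;
}
// e.g. for a v8i16 shuffle padded to v32i16 (Scale = 512/128 = 4), mask entry
// 9 (element 1 of V2) becomes 9 + 3*8 = 33 in the widened VPERMV3 mask.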
15960
15961/// Generic lowering of v16i8 shuffles.
15962///
15963/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15964/// detect any complexity reducing interleaving. If that doesn't help, it uses
15965/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15966/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15967/// back together.
15968static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15969 const APInt &Zeroable, SDValue V1, SDValue V2,
15970 const X86Subtarget &Subtarget,
15971 SelectionDAG &DAG) {
15972 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15973 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15974 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15975
15976 // Try to use shift instructions.
15977 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15978 Zeroable, Subtarget, DAG))
15979 return Shift;
15980
15981 // Try to use byte rotation instructions.
15982 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15983 Subtarget, DAG))
15984 return Rotate;
15985
15986 // Use dedicated pack instructions for masks that match their pattern.
15987 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15988 Subtarget))
15989 return V;
15990
15991 // Try to use a zext lowering.
15992 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15993 Zeroable, Subtarget, DAG))
15994 return ZExt;
15995
15996 // Try to lower using a truncation.
15997 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15998 Subtarget, DAG))
15999 return V;
16000
16001 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16002 Subtarget, DAG))
16003 return V;
16004
16005 // See if we can use SSE4A Extraction / Insertion.
16006 if (Subtarget.hasSSE4A())
16007 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16008 Zeroable, DAG))
16009 return V;
16010
16011 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16012
16013 // For single-input shuffles, there are some nicer lowering tricks we can use.
16014 if (NumV2Elements == 0) {
16015 // Check for being able to broadcast a single element.
16016 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16017 Mask, Subtarget, DAG))
16018 return Broadcast;
16019
16020 // Try to use bit rotation instructions.
16021 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16022 Subtarget, DAG))
16023 return Rotate;
16024
16025 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16026 return V;
16027
16028 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16029 // Notably, this handles splat and partial-splat shuffles more efficiently.
16030 // However, it only makes sense if the pre-duplication shuffle simplifies
16031 // things significantly. Currently, this means we need to be able to
16032 // express the pre-duplication shuffle as an i16 shuffle.
16033 //
16034 // FIXME: We should check for other patterns which can be widened into an
16035 // i16 shuffle as well.
16036 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16037 for (int i = 0; i < 16; i += 2)
16038 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16039 return false;
16040
16041 return true;
16042 };
16043 auto tryToWidenViaDuplication = [&]() -> SDValue {
16044 if (!canWidenViaDuplication(Mask))
16045 return SDValue();
16046 SmallVector<int, 4> LoInputs;
16047 copy_if(Mask, std::back_inserter(LoInputs),
16048 [](int M) { return M >= 0 && M < 8; });
16049 array_pod_sort(LoInputs.begin(), LoInputs.end());
16050 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16051 LoInputs.end());
16052 SmallVector<int, 4> HiInputs;
16053 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16054 array_pod_sort(HiInputs.begin(), HiInputs.end());
16055 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16056 HiInputs.end());
16057
16058 bool TargetLo = LoInputs.size() >= HiInputs.size();
16059 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16060 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16061
16062 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16063 SmallDenseMap<int, int, 8> LaneMap;
16064 for (int I : InPlaceInputs) {
16065 PreDupI16Shuffle[I/2] = I/2;
16066 LaneMap[I] = I;
16067 }
16068 int j = TargetLo ? 0 : 4, je = j + 4;
16069 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16070 // Check if j is already a shuffle of this input. This happens when
16071 // there are two adjacent bytes after we move the low one.
16072 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16073 // If we haven't yet mapped the input, search for a slot into which
16074 // we can map it.
16075 while (j < je && PreDupI16Shuffle[j] >= 0)
16076 ++j;
16077
16078 if (j == je)
16079 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16080 return SDValue();
16081
16082 // Map this input with the i16 shuffle.
16083 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16084 }
16085
16086 // Update the lane map based on the mapping we ended up with.
16087 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16088 }
16089 V1 = DAG.getBitcast(
16090 MVT::v16i8,
16091 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16092 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16093
16094 // Unpack the bytes to form the i16s that will be shuffled into place.
16095 bool EvenInUse = false, OddInUse = false;
16096 for (int i = 0; i < 16; i += 2) {
16097 EvenInUse |= (Mask[i + 0] >= 0);
16098 OddInUse |= (Mask[i + 1] >= 0);
16099 if (EvenInUse && OddInUse)
16100 break;
16101 }
16102 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16103 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16104 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16105
16106 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16107 for (int i = 0; i < 16; ++i)
16108 if (Mask[i] >= 0) {
16109 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16110 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16111 if (PostDupI16Shuffle[i / 2] < 0)
16112 PostDupI16Shuffle[i / 2] = MappedMask;
16113 else
16114 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16115 "Conflicting entries in the original shuffle!");
16116 }
16117 return DAG.getBitcast(
16118 MVT::v16i8,
16119 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16120 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16121 };
16122 if (SDValue V = tryToWidenViaDuplication())
16123 return V;
16124 }
16125
16126 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16127 Zeroable, Subtarget, DAG))
16128 return Masked;
16129
16130 // Use dedicated unpack instructions for masks that match their pattern.
16131 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16132 return V;
16133
16134 // Try to use byte shift instructions to mask.
16135 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16136 Zeroable, Subtarget, DAG))
16137 return V;
16138
16139 // Check for compaction patterns.
16140 bool IsSingleInput = V2.isUndef();
16141 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16142
16143 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16144 // with PSHUFB. It is important to do this before we attempt to generate any
16145 // blends but after all of the single-input lowerings. If the single input
16146 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16147 // want to preserve that and we can DAG combine any longer sequences into
16148 // a PSHUFB in the end. But once we start blending from multiple inputs,
16149 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16150 // and there are *very* few patterns that would actually be faster than the
16151 // PSHUFB approach because of its ability to zero lanes.
16152 //
16153 // If the mask is a binary compaction, we can more efficiently perform this
16154 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16155 //
16156 // FIXME: The only exceptions to the above are blends which are exact
16157 // interleavings with direct instructions supporting them. We currently don't
16158 // handle those well here.
16159 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16160 bool V1InUse = false;
16161 bool V2InUse = false;
16162
16163 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16164 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16165
16166 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16167 // do so. This avoids using them to handle blends-with-zero which is
16168 // important as a single pshufb is significantly faster for that.
16169 if (V1InUse && V2InUse) {
16170 if (Subtarget.hasSSE41())
16171 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16172 Zeroable, Subtarget, DAG))
16173 return Blend;
16174
16175 // We can use an unpack to do the blending rather than an or in some
16176 // cases. Even though the or may be (very minorly) more efficient, we
16177 // prefer this lowering because there are common cases where part of
16178 // the complexity of the shuffles goes away when we do the final blend as
16179 // an unpack.
16180 // FIXME: It might be worth trying to detect if the unpack-feeding
16181 // shuffles will both be pshufb, in which case we shouldn't bother with
16182 // this.
16183 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16184 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16185 return Unpack;
16186
16187 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16188 if (Subtarget.hasVBMI())
16189 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16190 DAG);
16191
16192 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16193 if (Subtarget.hasXOP()) {
16194 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16195 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16196 }
16197
16198 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16199 // PALIGNR will be cheaper than the second PSHUFB+OR.
16200 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16201 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16202 return V;
16203 }
16204
16205 return PSHUFB;
16206 }
16207
16208 // There are special ways we can lower some single-element blends.
16209 if (NumV2Elements == 1)
16210 if (SDValue V = lowerShuffleAsElementInsertion(
16211 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16212 return V;
16213
16214 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16215 return Blend;
16216
16217 // Check whether a compaction lowering can be done. This handles shuffles
16218 // which take every Nth element for some even N. See the helper function for
16219 // details.
16220 //
16221 // We special case these as they can be particularly efficiently handled with
16222 // the PACKUSWB instruction on x86, and they show up in common patterns of
16223 // rearranging bytes to truncate wide elements.
16224 if (NumEvenDrops) {
16225 // NumEvenDrops is the power of two stride of the elements. Another way of
16226 // thinking about it is that we need to drop the even elements this many
16227 // times to get the original input.
16228
16229 // First we need to zero all the dropped bytes.
16230 assert(NumEvenDrops <= 3 &&
16231 "No support for dropping even elements more than 3 times.");
16232 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16233 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16234 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16235 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16236 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16237 WordClearMask);
16238 if (!IsSingleInput)
16239 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16240 WordClearMask);
16241
16242 // Now pack things back together.
16243 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16244 IsSingleInput ? V1 : V2);
16245 for (int i = 1; i < NumEvenDrops; ++i) {
16246 Result = DAG.getBitcast(MVT::v8i16, Result);
16247 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16248 }
16249 return Result;
16250 }
16251
16252 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16253 if (NumOddDrops == 1) {
16254 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16255 DAG.getBitcast(MVT::v8i16, V1),
16256 DAG.getTargetConstant(8, DL, MVT::i8));
16257 if (!IsSingleInput)
16258 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16259 DAG.getBitcast(MVT::v8i16, V2),
16260 DAG.getTargetConstant(8, DL, MVT::i8));
16261 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16262 IsSingleInput ? V1 : V2);
16263 }
16264
16265 // Handle multi-input cases by blending/unpacking single-input shuffles.
16266 if (NumV2Elements > 0)
16267 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16268 Subtarget, DAG);
16269
16270 // The fallback path for single-input shuffles widens this into two v8i16
16271 // vectors with unpacks, shuffles those, and then pulls them back together
16272 // with a pack.
16273 SDValue V = V1;
16274
16275 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16276 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16277 for (int i = 0; i < 16; ++i)
16278 if (Mask[i] >= 0)
16279 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16280
16281 SDValue VLoHalf, VHiHalf;
16282 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16283 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16284 // i16s.
16285 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16286 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16287 // Use a mask to drop the high bytes.
16288 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16289 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16290 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16291
16292 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16293 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16294
16295 // Squash the masks to point directly into VLoHalf.
16296 for (int &M : LoBlendMask)
16297 if (M >= 0)
16298 M /= 2;
16299 for (int &M : HiBlendMask)
16300 if (M >= 0)
16301 M /= 2;
16302 } else {
16303 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16304 // VHiHalf so that we can blend them as i16s.
16305 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16306
16307 VLoHalf = DAG.getBitcast(
16308 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16309 VHiHalf = DAG.getBitcast(
16310 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16311 }
16312
16313 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16314 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16315
16316 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16317}
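In the fallback path above, interleaving V with a zero vector via UNPCKL/UNPCKH is what turns the v16i8 into two v8i16 halves whose elements are the zero-extended bytes. A scalar model of that step (a sketch only, assuming little-endian x86 byte order; unpackBytesToWords is a hypothetical helper):

#include <array>
#include <cstdint>

// UNPCKL(V, 0) interleaves bytes 0..7 of V with zero bytes, which, read as
// v8i16, is just the zero-extension of those bytes; UNPCKH does the same for
// bytes 8..15.
static void unpackBytesToWords(const std::array<uint8_t, 16> &V,
                               std::array<uint16_t, 8> &Lo,
                               std::array<uint16_t, 8> &Hi) {
  for (int i = 0; i < 8; ++i) {
    Lo[i] = V[i];     // VLoHalf: low eight bytes as i16 elements
    Hi[i] = V[i + 8]; // VHiHalf: high eight bytes as i16 elements
  }
}
// The two halves are then shuffled as v8i16 and glued back with PACKUS.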
16318
16319/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16320///
16321/// This routine breaks down the specific type of 128-bit shuffle and
16322/// dispatches to the lowering routines accordingly.
16323static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16324 MVT VT, SDValue V1, SDValue V2,
16325 const APInt &Zeroable,
16326 const X86Subtarget &Subtarget,
16327 SelectionDAG &DAG) {
16328 switch (VT.SimpleTy) {
16329 case MVT::v2i64:
16330 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16331 case MVT::v2f64:
16332 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16333 case MVT::v4i32:
16334 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16335 case MVT::v4f32:
16336 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16337 case MVT::v8i16:
16338 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16339 case MVT::v8f16:
16340 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16341 case MVT::v16i8:
16342 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16343
16344 default:
16345 llvm_unreachable("Unimplemented!");
16346 }
16347}
16348
16349/// Generic routine to split vector shuffle into half-sized shuffles.
16350///
16351/// This routine just extracts two subvectors, shuffles them independently, and
16352/// then concatenates them back together. This should work effectively with all
16353/// AVX vector shuffle types.
16354static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16355 SDValue V2, ArrayRef<int> Mask,
16356 SelectionDAG &DAG) {
16357 assert(VT.getSizeInBits() >= 256 &&
16358 "Only for 256-bit or wider vector shuffles!");
16359 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16360 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16361
16362 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16363 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16364
16365 int NumElements = VT.getVectorNumElements();
16366 int SplitNumElements = NumElements / 2;
16367 MVT ScalarVT = VT.getVectorElementType();
16368 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16369
16370 // Use splitVector/extractSubVector so that split build-vectors just build two
16371 // narrower build vectors. This helps shuffling with splats and zeros.
16372 auto SplitVector = [&](SDValue V) {
16373 SDValue LoV, HiV;
16374 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16375 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16376 DAG.getBitcast(SplitVT, HiV));
16377 };
16378
16379 SDValue LoV1, HiV1, LoV2, HiV2;
16380 std::tie(LoV1, HiV1) = SplitVector(V1);
16381 std::tie(LoV2, HiV2) = SplitVector(V2);
16382
16383 // Now create two 4-way blends of these half-width vectors.
16384 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16385 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16386 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16387 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16388 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16389 for (int i = 0; i < SplitNumElements; ++i) {
16390 int M = HalfMask[i];
16391 if (M >= NumElements) {
16392 if (M >= NumElements + SplitNumElements)
16393 UseHiV2 = true;
16394 else
16395 UseLoV2 = true;
16396 V2BlendMask[i] = M - NumElements;
16397 BlendMask[i] = SplitNumElements + i;
16398 } else if (M >= 0) {
16399 if (M >= SplitNumElements)
16400 UseHiV1 = true;
16401 else
16402 UseLoV1 = true;
16403 V1BlendMask[i] = M;
16404 BlendMask[i] = i;
16405 }
16406 }
16407
16408 // Because the lowering happens after all combining takes place, we need to
16409 // manually combine these blend masks as much as possible so that we create
16410 // a minimal number of high-level vector shuffle nodes.
16411
16412 // First try just blending the halves of V1 or V2.
16413 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16414 return DAG.getUNDEF(SplitVT);
16415 if (!UseLoV2 && !UseHiV2)
16416 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16417 if (!UseLoV1 && !UseHiV1)
16418 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16419
16420 SDValue V1Blend, V2Blend;
16421 if (UseLoV1 && UseHiV1) {
16422 V1Blend =
16423 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16424 } else {
16425 // We only use half of V1 so map the usage down into the final blend mask.
16426 V1Blend = UseLoV1 ? LoV1 : HiV1;
16427 for (int i = 0; i < SplitNumElements; ++i)
16428 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16429 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16430 }
16431 if (UseLoV2 && UseHiV2) {
16432 V2Blend =
16433 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16434 } else {
16435 // We only use half of V2 so map the usage down into the final blend mask.
16436 V2Blend = UseLoV2 ? LoV2 : HiV2;
16437 for (int i = 0; i < SplitNumElements; ++i)
16438 if (BlendMask[i] >= SplitNumElements)
16439 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16440 }
16441 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16442 };
16443 SDValue Lo = HalfBlend(LoMask);
16444 SDValue Hi = HalfBlend(HiMask);
16445 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16446}
16447
16448/// Either split a vector in halves or decompose the shuffles and the
16449/// blend/unpack.
16450///
16451/// This is provided as a good fallback for many lowerings of non-single-input
16452/// shuffles with more than one 128-bit lane. In those cases, we want to select
16453/// between splitting the shuffle into 128-bit components and stitching those
16454/// back together vs. extracting the single-input shuffles and blending those
16455/// results.
16456static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16457 SDValue V2, ArrayRef<int> Mask,
16458 const X86Subtarget &Subtarget,
16459 SelectionDAG &DAG) {
16460 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16461 "shuffles as it could then recurse on itself.");
16462 int Size = Mask.size();
16463
16464 // If this can be modeled as a broadcast of two elements followed by a blend,
16465 // prefer that lowering. This is especially important because broadcasts can
16466 // often fold with memory operands.
16467 auto DoBothBroadcast = [&] {
16468 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16469 for (int M : Mask)
16470 if (M >= Size) {
16471 if (V2BroadcastIdx < 0)
16472 V2BroadcastIdx = M - Size;
16473 else if (M - Size != V2BroadcastIdx)
16474 return false;
16475 } else if (M >= 0) {
16476 if (V1BroadcastIdx < 0)
16477 V1BroadcastIdx = M;
16478 else if (M != V1BroadcastIdx)
16479 return false;
16480 }
16481 return true;
16482 };
16483 if (DoBothBroadcast())
16484 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16485 DAG);
16486
16487 // If the inputs all stem from a single 128-bit lane of each input, then we
16488 // split them rather than blending because the split will decompose to
16489 // unusually few instructions.
16490 int LaneCount = VT.getSizeInBits() / 128;
16491 int LaneSize = Size / LaneCount;
16492 SmallBitVector LaneInputs[2];
16493 LaneInputs[0].resize(LaneCount, false);
16494 LaneInputs[1].resize(LaneCount, false);
16495 for (int i = 0; i < Size; ++i)
16496 if (Mask[i] >= 0)
16497 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
16498 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16499 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16500
16501 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16502 // requires that the decomposed single-input shuffles don't end up here.
16503 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16504 DAG);
16505}
16506
16507// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16508// TODO: Extend to support v8f32 (+ 512-bit shuffles).
16509static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16510 SDValue V1, SDValue V2,
16511 ArrayRef<int> Mask,
16512 SelectionDAG &DAG) {
16513 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
16514
16515 int LHSMask[4] = {-1, -1, -1, -1};
16516 int RHSMask[4] = {-1, -1, -1, -1};
16517 unsigned SHUFPMask = 0;
16518
16519 // As SHUFPD uses a single LHS/RHS element per lane, we can always
16520 // perform the shuffle once the lanes have been shuffled in place.
16521 for (int i = 0; i != 4; ++i) {
16522 int M = Mask[i];
16523 if (M < 0)
16524 continue;
16525 int LaneBase = i & ~1;
16526 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16527 LaneMask[LaneBase + (M & 1)] = M;
16528 SHUFPMask |= (M & 1) << i;
16529 }
16530
16531 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16532 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16533 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16534 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16535}
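For reference, the SHUFPD immediate assembled above can be modelled in isolation: SHUFPD takes one element per 128-bit lane from LHS for even result lanes and from RHS for odd result lanes, and bit i of the immediate picks the low or high element within that lane. A hedged sketch (shufpdImmForMask is a hypothetical helper mirroring the SHUFPMask loop):

#include <array>

// Only the within-lane bit (M & 1) of each mask entry survives into the
// immediate; the lane selection itself is handled by the two pre-shuffles.
static unsigned shufpdImmForMask(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0)
      Imm |= (Mask[i] & 1u) << i;
  return Imm;
}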
16536
16537/// Lower a vector shuffle crossing multiple 128-bit lanes as
16538/// a lane permutation followed by a per-lane permutation.
16539///
16540/// This is mainly for cases where we can have non-repeating permutes
16541/// in each lane.
16542///
16543/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
16544/// we should investigate merging them.
16545static SDValue lowerShuffleAsLanePermuteAndPermute(
16546 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16547 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16548 int NumElts = VT.getVectorNumElements();
16549 int NumLanes = VT.getSizeInBits() / 128;
16550 int NumEltsPerLane = NumElts / NumLanes;
16551 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
16552
16553 /// Attempts to find a sublane permute with the given size
16554 /// that gets all elements into their target lanes.
16555 ///
16556 /// If successful, returns the lowered shuffle (a cross-lane permute
16557 /// followed by an in-lane permute); otherwise returns an empty SDValue.
16558 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
16559 int NumSublanesPerLane = NumSublanes / NumLanes;
16560 int NumEltsPerSublane = NumElts / NumSublanes;
16561
16562 SmallVector<int, 16> CrossLaneMask;
16563 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
16564 // CrossLaneMask but one entry == one sublane.
16565 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
16566
16567 for (int i = 0; i != NumElts; ++i) {
16568 int M = Mask[i];
16569 if (M < 0)
16570 continue;
16571
16572 int SrcSublane = M / NumEltsPerSublane;
16573 int DstLane = i / NumEltsPerLane;
16574
16575 // We only need to get the elements into the right lane, not sublane.
16576 // So search all sublanes that make up the destination lane.
16577 bool Found = false;
16578 int DstSubStart = DstLane * NumSublanesPerLane;
16579 int DstSubEnd = DstSubStart + NumSublanesPerLane;
16580 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
16581 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16582 continue;
16583
16584 Found = true;
16585 CrossLaneMaskLarge[DstSublane] = SrcSublane;
16586 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16587 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16588 break;
16589 }
16590 if (!Found)
16591 return SDValue();
16592 }
16593
16594 // Fill CrossLaneMask using CrossLaneMaskLarge.
16595 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
16596
16597 if (!CanUseSublanes) {
16598 // If we're only shuffling a single lowest lane and the rest are identity
16599 // then don't bother.
16600 // TODO - isShuffleMaskInputInPlace could be extended to something like
16601 // this.
16602 int NumIdentityLanes = 0;
16603 bool OnlyShuffleLowestLane = true;
16604 for (int i = 0; i != NumLanes; ++i) {
16605 int LaneOffset = i * NumEltsPerLane;
16606 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
16607 i * NumEltsPerLane))
16608 NumIdentityLanes++;
16609 else if (CrossLaneMask[LaneOffset] != 0)
16610 OnlyShuffleLowestLane = false;
16611 }
16612 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16613 return SDValue();
16614 }
16615
16616 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16617 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16618 InLaneMask);
16619 };
16620
16621 // First attempt a solution with full lanes.
16622 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16623 return V;
16624
16625 // The rest of the solutions use sublanes.
16626 if (!CanUseSublanes)
16627 return SDValue();
16628
16629 // Then attempt a solution with 64-bit sublanes (vpermq).
16630 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16631 return V;
16632
16633 // If that doesn't work and we have fast variable cross-lane shuffle,
16634 // attempt 32-bit sublanes (vpermd).
16635 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16636 return SDValue();
16637
16638 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16639}
16640
16641/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16642/// source with a lane permutation.
16643///
16644/// This lowering strategy results in four instructions in the worst case for a
16645/// single-input cross lane shuffle which is lower than any other fully general
16646/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16647/// shuffle pattern should be handled prior to trying this lowering.
16648static SDValue lowerShuffleAsLanePermuteAndShuffle(
16649 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16650 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16651 // FIXME: This should probably be generalized for 512-bit vectors as well.
16652 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16653 int Size = Mask.size();
16654 int LaneSize = Size / 2;
16655
16656 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16657 // Only do this if the elements aren't all from the lower lane,
16658 // otherwise we're (probably) better off doing a split.
16659 if (VT == MVT::v4f64 &&
16660 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16661 if (SDValue V =
16662 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16663 return V;
16664
16665 // If there are only inputs from one 128-bit lane, splitting will in fact be
16666 // less expensive. The flags track whether the given lane contains an element
16667 // that crosses to another lane.
16668 bool AllLanes;
16669 if (!Subtarget.hasAVX2()) {
16670 bool LaneCrossing[2] = {false, false};
16671 for (int i = 0; i < Size; ++i)
16672 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16673 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16674 AllLanes = LaneCrossing[0] && LaneCrossing[1];
16675 } else {
16676 bool LaneUsed[2] = {false, false};
16677 for (int i = 0; i < Size; ++i)
16678 if (Mask[i] >= 0)
16679 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16680 AllLanes = LaneUsed[0] && LaneUsed[1];
16681 }
16682
16683 // TODO - we could support shuffling V2 in the Flipped input.
16684 assert(V2.isUndef() &&
16685 "This last part of this routine only works on single input shuffles");
16686
16687 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16688 for (int i = 0; i < Size; ++i) {
16689 int &M = InLaneMask[i];
16690 if (M < 0)
16691 continue;
16692 if (((M % Size) / LaneSize) != (i / LaneSize))
16693 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16694 }
16695 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16696 "In-lane shuffle mask expected");
16697
16698 // If we're not using both lanes in each lane and the inlane mask is not
16699 // repeating, then we're better off splitting.
16700 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
16701 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16702
16703 // Flip the lanes, and shuffle the results which should now be in-lane.
16704 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16705 SDValue Flipped = DAG.getBitcast(PVT, V1);
16706 Flipped =
16707 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16708 Flipped = DAG.getBitcast(VT, Flipped);
16709 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16710}
16711
16712/// Handle lowering 2-lane 128-bit shuffles.
16713static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16714 SDValue V2, ArrayRef<int> Mask,
16715 const APInt &Zeroable,
16716 const X86Subtarget &Subtarget,
16717 SelectionDAG &DAG) {
16718 if (V2.isUndef()) {
16719 // Attempt to match VBROADCAST*128 subvector broadcast load.
16720 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16721 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16722 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16723 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
16724 MVT MemVT = VT.getHalfNumVectorElementsVT();
16725 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16726 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16727 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
16728 VT, MemVT, Ld, Ofs, DAG))
16729 return BcstLd;
16730 }
16731
16732 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16733 if (Subtarget.hasAVX2())
16734 return SDValue();
16735 }
16736
16737 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16738
16739 SmallVector<int, 4> WidenedMask;
16740 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16741 return SDValue();
16742
16743 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16744 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16745
16746 // Try to use an insert into a zero vector.
16747 if (WidenedMask[0] == 0 && IsHighZero) {
16748 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16749 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16750 DAG.getIntPtrConstant(0, DL));
16751 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16752 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16753 DAG.getIntPtrConstant(0, DL));
16754 }
16755
16756 // TODO: If minimizing size and one of the inputs is a zero vector and the
16757 // zero vector has only one use, we could use a VPERM2X128 to save the
16758 // instruction bytes needed to explicitly generate the zero vector.
16759
16760 // Blends are faster and handle all the non-lane-crossing cases.
16761 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16762 Subtarget, DAG))
16763 return Blend;
16764
16765 // If either input operand is a zero vector, use VPERM2X128 because its mask
16766 // allows us to replace the zero input with an implicit zero.
16767 if (!IsLowZero && !IsHighZero) {
16768 // Check for patterns which can be matched with a single insert of a 128-bit
16769 // subvector.
16770 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16771 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16772
16773 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16774 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16775 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16776 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16777 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16778 OnlyUsesV1 ? V1 : V2,
16779 DAG.getIntPtrConstant(0, DL));
16780 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16781 DAG.getIntPtrConstant(2, DL));
16782 }
16783 }
16784
16785 // Try to use SHUF128 if possible.
16786 if (Subtarget.hasVLX()) {
16787 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16788 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16789 ((WidenedMask[1] % 2) << 1);
16790 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16791 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16792 }
16793 }
16794 }
16795
16796 // Otherwise form a 128-bit permutation. After accounting for undefs,
16797 // convert the 64-bit shuffle mask selection values into 128-bit
16798 // selection bits by dividing the indexes by 2 and shifting into positions
16799 // defined by a vperm2*128 instruction's immediate control byte.
16800
16801 // The immediate permute control byte looks like this:
16802 // [1:0] - select 128 bits from sources for low half of destination
16803 // [2] - ignore
16804 // [3] - zero low half of destination
16805 // [5:4] - select 128 bits from sources for high half of destination
16806 // [6] - ignore
16807 // [7] - zero high half of destination
16808
16809 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16810        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16811
16812 unsigned PermMask = 0;
16813 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16814 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16815
16816 // Check the immediate mask and replace unused sources with undef.
16817 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16818 V1 = DAG.getUNDEF(VT);
16819 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16820 V2 = DAG.getUNDEF(VT);
16821
16822 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16823 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16824}
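A standalone sketch (not LLVM code; the helper name is hypothetical) of how the VPERM2X128 immediate described above is assembled from the widened 128-bit selectors, mirroring the logic at lines 16812-16814:

#include <cassert>
#include <cstdio>

// Assemble a VPERM2X128-style immediate from two widened 128-bit selectors
// (0/1 = halves of V1, 2/3 = halves of V2). Illustrative sketch only.
unsigned makeVPerm2X128Imm(int WidenedLo, int WidenedHi, bool IsLowZero,
                           bool IsHighZero) {
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (unsigned)WidenedLo;         // bits [1:0] or zero bit [3]
  PermMask |= IsHighZero ? 0x80 : (unsigned)(WidenedHi << 4); // bits [5:4] or zero bit [7]
  return PermMask;
}

int main() {
  // <high half of V1, low half of V2> -> imm 0x21.
  assert(makeVPerm2X128Imm(1, 2, false, false) == 0x21);
  // Zero the low half, take the high half of V2 -> imm 0x38.
  assert(makeVPerm2X128Imm(0, 3, true, false) == 0x38);
  std::printf("ok\n");
  return 0;
}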
16825
16826/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16827/// shuffling each lane.
16828///
16829/// This attempts to create a repeated lane shuffle where each lane uses one
16830/// or two of the lanes of the inputs. The lanes of the input vectors are
16831/// shuffled in one or two independent shuffles to get the lanes into the
16832/// position needed by the final shuffle.
16833static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16834 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16835 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16836 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
20
'?' condition is true
16837
16838 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
21
Assuming the condition is false
22
Taking false branch
16839 return SDValue();
16840
16841 int NumElts = Mask.size();
16842 int NumLanes = VT.getSizeInBits() / 128;
16843 int NumLaneElts = 128 / VT.getScalarSizeInBits();
23
'NumLaneElts' initialized here
16844 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16845 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16846
16847 // First pass will try to fill in the RepeatMask from lanes that need two
16848 // sources.
16849 for (int Lane = 0; Lane != NumLanes; ++Lane) {
24
Assuming 'Lane' is not equal to 'NumLanes'
25
Loop condition is true. Entering loop body
30
Assuming 'Lane' is equal to 'NumLanes'
31
Loop condition is false. Execution continues on line 16921
16850 int Srcs[2] = {-1, -1};
16851 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16852 for (int i = 0; i != NumLaneElts; ++i) {
26
Assuming 'i' is equal to 'NumLaneElts'
27
Loop condition is false. Execution continues on line 16874
16853 int M = Mask[(Lane * NumLaneElts) + i];
16854 if (M < 0)
16855 continue;
16856 // Determine which of the possible input lanes (NumLanes from each source)
16857 // this element comes from. Assign that as one of the sources for this
16858 // lane. We can assign up to 2 sources for this lane. If we run out of
16859 // sources we can't do anything.
16860 int LaneSrc = M / NumLaneElts;
16861 int Src;
16862 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16863 Src = 0;
16864 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16865 Src = 1;
16866 else
16867 return SDValue();
16868
16869 Srcs[Src] = LaneSrc;
16870 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16871 }
16872
16873 // If this lane has two sources, see if it fits with the repeat mask so far.
16874 if (Srcs[1] < 0)
28
Taking true branch
16875 continue;
29
Execution continues on line 16849
16876
16877 LaneSrcs[Lane][0] = Srcs[0];
16878 LaneSrcs[Lane][1] = Srcs[1];
16879
16880 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16881 assert(M1.size() == M2.size() && "Unexpected mask size");
16882 for (int i = 0, e = M1.size(); i != e; ++i)
16883 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16884 return false;
16885 return true;
16886 };
16887
16888 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16889 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16890 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16891 int M = Mask[i];
16892 if (M < 0)
16893 continue;
16894 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16895        "Unexpected mask element");
16896 MergedMask[i] = M;
16897 }
16898 };
16899
16900 if (MatchMasks(InLaneMask, RepeatMask)) {
16901 // Merge this lane mask into the final repeat mask.
16902 MergeMasks(InLaneMask, RepeatMask);
16903 continue;
16904 }
16905
16906 // Didn't find a match. Swap the operands and try again.
16907 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16908 ShuffleVectorSDNode::commuteMask(InLaneMask);
16909
16910 if (MatchMasks(InLaneMask, RepeatMask)) {
16911 // Merge this lane mask into the final repeat mask.
16912 MergeMasks(InLaneMask, RepeatMask);
16913 continue;
16914 }
16915
16916 // Couldn't find a match with the operands in either order.
16917 return SDValue();
16918 }
16919
16920 // Now handle any lanes with only one source.
16921 for (int Lane = 0; Lane != NumLanes; ++Lane) {
32
Loop condition is true. Entering loop body
36
Loop condition is false. Execution continues on line 16950
16922 // If this lane has already been processed, skip it.
16923 if (LaneSrcs[Lane][0] >= 0)
33
Assuming the condition is true
34
Taking true branch
16924 continue;
35
Execution continues on line 16921
16925
16926 for (int i = 0; i != NumLaneElts; ++i) {
16927 int M = Mask[(Lane * NumLaneElts) + i];
16928 if (M < 0)
16929 continue;
16930
16931 // If RepeatMask isn't defined yet, we can define it ourselves.
16932 if (RepeatMask[i] < 0)
16933 RepeatMask[i] = M % NumLaneElts;
16934
16935 if (RepeatMask[i] < NumElts) {
16936 if (RepeatMask[i] != M % NumLaneElts)
16937 return SDValue();
16938 LaneSrcs[Lane][0] = M / NumLaneElts;
16939 } else {
16940 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16941 return SDValue();
16942 LaneSrcs[Lane][1] = M / NumLaneElts;
16943 }
16944 }
16945
16946 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16947 return SDValue();
16948 }
16949
16950 SmallVector<int, 16> NewMask(NumElts, -1);
16951 for (int Lane = 0; Lane != NumLanes; ++Lane) {
37
Loop condition is true. Entering loop body
39
Loop condition is false. Execution continues on line 16960
16952 int Src = LaneSrcs[Lane][0];
16953 for (int i = 0; i != NumLaneElts; ++i) {
38
Loop condition is false. Execution continues on line 16951
16954 int M = -1;
16955 if (Src >= 0)
16956 M = Src * NumLaneElts + i;
16957 NewMask[Lane * NumLaneElts + i] = M;
16958 }
16959 }
16960 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16961 // Ensure we didn't get back the shuffle we started with.
16962 // FIXME: This is a hack to make up for some splat handling code in
16963 // getVectorShuffle.
16964 if (isa<ShuffleVectorSDNode>(NewV1) &&
40
Assuming 'NewV1' is not a 'ShuffleVectorSDNode'
41
Taking false branch
16965 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16966 return SDValue();
16967
16968 for (int Lane = 0; Lane != NumLanes; ++Lane) {
42
Loop condition is true. Entering loop body
44
Loop condition is false. Execution continues on line 16977
16969 int Src = LaneSrcs[Lane][1];
16970 for (int i = 0; i != NumLaneElts; ++i) {
43
Loop condition is false. Execution continues on line 16968
16971 int M = -1;
16972 if (Src >= 0)
16973 M = Src * NumLaneElts + i;
16974 NewMask[Lane * NumLaneElts + i] = M;
16975 }
16976 }
16977 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16978 // Ensure we didn't get back the shuffle we started with.
16979 // FIXME: This is a hack to make up for some splat handling code in
16980 // getVectorShuffle.
16981 if (isa<ShuffleVectorSDNode>(NewV2) &&
45
Assuming 'NewV2' is not a 'ShuffleVectorSDNode'
46
Taking false branch
16982 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16983 return SDValue();
16984
16985 for (int i = 0; i != NumElts; ++i) {
47
Assuming 'i' is not equal to 'NumElts'
48
Loop condition is true. Entering loop body
16986 NewMask[i] = RepeatMask[i % NumLaneElts];
49
Division by zero
16987 if (NewMask[i] < 0)
16988 continue;
16989
16990 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16991 }
16992 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16993}
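The reported division by zero is the modulus at line 16986, NewMask[i] = RepeatMask[i % NumLaneElts], where NumLaneElts is computed at line 16843 as 128 / VT.getScalarSizeInBits() and the analyzer's path treats it as possibly zero (steps 23 and 26). A standalone sketch (not LLVM code) of that final loop, with the non-zero precondition stated explicitly as an assumption:

#include <cassert>
#include <cstdio>
#include <vector>

// Broadcast a per-lane repeat mask across all lanes, mirroring the loop at
// source lines 16985-16991. The assert makes the precondition the analyzer is
// questioning explicit; in the real code it would follow from the scalar types
// that reach this lowering path.
std::vector<int> broadcastRepeatMask(const std::vector<int> &RepeatMask,
                                     int NumElts, int NumLaneElts) {
  assert(NumLaneElts > 0 && "128 / scalar-size must be non-zero here");
  std::vector<int> NewMask(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    NewMask[i] = RepeatMask[i % NumLaneElts];
    if (NewMask[i] < 0)
      continue;
    NewMask[i] += (i / NumLaneElts) * NumLaneElts; // rebase into lane i's range
  }
  return NewMask;
}

int main() {
  // v8f32: 8 elements, 4 per 128-bit lane; per-lane repeat mask {1, 0, 3, 2}.
  for (int M : broadcastRepeatMask({1, 0, 3, 2}, 8, 4))
    std::printf("%d ", M); // 1 0 3 2 5 4 7 6
  std::printf("\n");
  return 0;
}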
16994
16995/// If the input shuffle mask results in a vector that is undefined in all upper
16996/// or lower half elements and that mask accesses only 2 halves of the
16997/// shuffle's operands, return true. A mask of half the width with mask indexes
16998/// adjusted to access the extracted halves of the original shuffle operands is
16999/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17000/// lower half of each input operand is accessed.
17001static bool
17002getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17003 int &HalfIdx1, int &HalfIdx2) {
17004 assert((Mask.size() == HalfMask.size() * 2) &&
17005        "Expected input mask to be twice as long as output");
17006
17007 // Exactly one half of the result must be undef to allow narrowing.
17008 bool UndefLower = isUndefLowerHalf(Mask);
17009 bool UndefUpper = isUndefUpperHalf(Mask);
17010 if (UndefLower == UndefUpper)
17011 return false;
17012
17013 unsigned HalfNumElts = HalfMask.size();
17014 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17015 HalfIdx1 = -1;
17016 HalfIdx2 = -1;
17017 for (unsigned i = 0; i != HalfNumElts; ++i) {
17018 int M = Mask[i + MaskIndexOffset];
17019 if (M < 0) {
17020 HalfMask[i] = M;
17021 continue;
17022 }
17023
17024 // Determine which of the 4 half vectors this element is from.
17025 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17026 int HalfIdx = M / HalfNumElts;
17027
17028 // Determine the element index into its half vector source.
17029 int HalfElt = M % HalfNumElts;
17030
17031 // We can shuffle with up to 2 half vectors, set the new 'half'
17032 // shuffle mask accordingly.
17033 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17034 HalfMask[i] = HalfElt;
17035 HalfIdx1 = HalfIdx;
17036 continue;
17037 }
17038 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17039 HalfMask[i] = HalfElt + HalfNumElts;
17040 HalfIdx2 = HalfIdx;
17041 continue;
17042 }
17043
17044 // Too many half vectors referenced.
17045 return false;
17046 }
17047
17048 return true;
17049}
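A standalone sketch (not LLVM code; simplified to the UndefLower case) of the half-index bookkeeping documented above, where halves are numbered 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2:

#include <cstdio>
#include <vector>

// Classify each mask index into one of the four input halves and rewrite it
// relative to the (at most) two halves actually used. Illustrative only; the
// lower half of the mask is assumed undef, so only the upper half is scanned.
bool halfShuffleMask(const std::vector<int> &Mask, std::vector<int> &HalfMask,
                     int &HalfIdx1, int &HalfIdx2) {
  int HalfNumElts = Mask.size() / 2;
  HalfIdx1 = HalfIdx2 = -1;
  HalfMask.assign(HalfNumElts, -1);
  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + HalfNumElts];
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts;
    int HalfElt = M % HalfNumElts;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    } else {
      return false; // more than two halves referenced
    }
  }
  return true;
}

int main() {
  std::vector<int> HalfMask;
  int HalfIdx1, HalfIdx2;
  // v8 mask <u, u, u, u, 0, 1, 12, 13>: the result mixes lower V1 (0, 1) with
  // upper V2 (12, 13).
  halfShuffleMask({-1, -1, -1, -1, 0, 1, 12, 13}, HalfMask, HalfIdx1, HalfIdx2);
  std::printf("HalfIdx1=%d HalfIdx2=%d HalfMask=", HalfIdx1, HalfIdx2);
  for (int M : HalfMask)
    std::printf("%d ", M); // 0 1 4 5
  std::printf("\n");
  return 0;
}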
17050
17051/// Given the output values from getHalfShuffleMask(), create a half width
17052/// shuffle of extracted vectors followed by an insert back to full width.
17053static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17054 ArrayRef<int> HalfMask, int HalfIdx1,
17055 int HalfIdx2, bool UndefLower,
17056 SelectionDAG &DAG, bool UseConcat = false) {
17057 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17058 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17059
17060 MVT VT = V1.getSimpleValueType();
17061 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17062 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17063
17064 auto getHalfVector = [&](int HalfIdx) {
17065 if (HalfIdx < 0)
17066 return DAG.getUNDEF(HalfVT);
17067 SDValue V = (HalfIdx < 2 ? V1 : V2);
17068 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17069 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17070 DAG.getIntPtrConstant(HalfIdx, DL));
17071 };
17072
17073 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17074 SDValue Half1 = getHalfVector(HalfIdx1);
17075 SDValue Half2 = getHalfVector(HalfIdx2);
17076 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17077 if (UseConcat) {
17078 SDValue Op0 = V;
17079 SDValue Op1 = DAG.getUNDEF(HalfVT);
17080 if (UndefLower)
17081 std::swap(Op0, Op1);
17082 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17083 }
17084
17085 unsigned Offset = UndefLower ? HalfNumElts : 0;
17086 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17087 DAG.getIntPtrConstant(Offset, DL));
17088}
17089
17090/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17091/// This allows for fast cases such as subvector extraction/insertion
17092/// or shuffling smaller vector types which can lower more efficiently.
17093static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17094 SDValue V2, ArrayRef<int> Mask,
17095 const X86Subtarget &Subtarget,
17096 SelectionDAG &DAG) {
17097 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17098        "Expected 256-bit or 512-bit vector");
17099
17100 bool UndefLower = isUndefLowerHalf(Mask);
17101 if (!UndefLower && !isUndefUpperHalf(Mask))
17102 return SDValue();
17103
17104 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17105        "Completely undef shuffle mask should have been simplified already");
17106
17107 // Upper half is undef and lower half is whole upper subvector.
17108 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17109 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17110 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17111 if (!UndefLower &&
17112 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17113 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17114 DAG.getIntPtrConstant(HalfNumElts, DL));
17115 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17116 DAG.getIntPtrConstant(0, DL));
17117 }
17118
17119 // Lower half is undef and upper half is whole lower subvector.
17120 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17121 if (UndefLower &&
17122 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17123 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17124 DAG.getIntPtrConstant(0, DL));
17125 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17126 DAG.getIntPtrConstant(HalfNumElts, DL));
17127 }
17128
17129 int HalfIdx1, HalfIdx2;
17130 SmallVector<int, 8> HalfMask(HalfNumElts);
17131 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17132 return SDValue();
17133
17134 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17135
17136 // Only shuffle the halves of the inputs when useful.
17137 unsigned NumLowerHalves =
17138 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17139 unsigned NumUpperHalves =
17140 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17141 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17142
17143 // Determine the larger pattern of undef/halves, then decide if it's worth
17144 // splitting the shuffle based on subtarget capabilities and types.
17145 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17146 if (!UndefLower) {
17147 // XXXXuuuu: no insert is needed.
17148 // Always extract lowers when setting lower - these are all free subreg ops.
17149 if (NumUpperHalves == 0)
17150 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17151 UndefLower, DAG);
17152
17153 if (NumUpperHalves == 1) {
17154 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17155 if (Subtarget.hasAVX2()) {
17156 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17157 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17158 !is128BitUnpackShuffleMask(HalfMask) &&
17159 (!isSingleSHUFPSMask(HalfMask) ||
17160 Subtarget.hasFastVariableCrossLaneShuffle()))
17161 return SDValue();
17162 // If this is a unary shuffle (assume that the 2nd operand is
17163 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17164 // are better off extracting the upper half of 1 operand and using a
17165 // narrow shuffle.
17166 if (EltWidth == 64 && V2.isUndef())
17167 return SDValue();
17168 }
17169 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17170 if (Subtarget.hasAVX512() && VT.is512BitVector())
17171 return SDValue();
17172 // Extract + narrow shuffle is better than the wide alternative.
17173 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17174 UndefLower, DAG);
17175 }
17176
17177 // Don't extract both uppers, instead shuffle and then extract.
17178 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17179 return SDValue();
17180 }
17181
17182 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17183 if (NumUpperHalves == 0) {
17184 // AVX2 has efficient 64-bit element cross-lane shuffles.
17185 // TODO: Refine to account for unary shuffle, splat, and other masks?
17186 if (Subtarget.hasAVX2() && EltWidth == 64)
17187 return SDValue();
17188 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17189 if (Subtarget.hasAVX512() && VT.is512BitVector())
17190 return SDValue();
17191 // Narrow shuffle + insert is better than the wide alternative.
17192 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17193 UndefLower, DAG);
17194 }
17195
17196 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17197 return SDValue();
17198}
17199
17200/// Test whether the specified input (0 or 1) is in-place blended by the
17201/// given mask.
17202///
17203/// This returns true if the elements from a particular input are already in the
17204/// slot required by the given mask and require no permutation.
17205static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
17206 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
17207 int Size = Mask.size();
17208 for (int i = 0; i < Size; ++i)
17209 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
17210 return false;
17211
17212 return true;
17213}
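A minimal standalone restatement of the same check (not LLVM code), with a worked v4 example where V1 is in place and V2 is not:

#include <cstdio>
#include <vector>

// Input (0 or 1) is "in place" if every mask element that refers to it already
// sits at its own index. Illustrative sketch only.
bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  // v4 mask <0, 4, 2, 6>: V1's elements (0, 2) stay put; V2's (4, 6) do not.
  std::vector<int> Mask = {0, 4, 2, 6};
  std::printf("input0=%d input1=%d\n", (int)inputInPlace(0, Mask),
              (int)inputInPlace(1, Mask)); // input0=1 input1=0
  return 0;
}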
17214
17215/// Handle case where shuffle sources are coming from the same 128-bit lane and
17216/// every lane can be represented as the same repeating mask - allowing us to
17217/// shuffle the sources with the repeating shuffle and then permute the result
17218/// to the destination lanes.
17219static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17220 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17221 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17222 int NumElts = VT.getVectorNumElements();
17223 int NumLanes = VT.getSizeInBits() / 128;
17224 int NumLaneElts = NumElts / NumLanes;
17225
17226 // On AVX2 we may be able to just shuffle the lowest elements and then
17227 // broadcast the result.
17228 if (Subtarget.hasAVX2()) {
17229 for (unsigned BroadcastSize : {16, 32, 64}) {
17230 if (BroadcastSize <= VT.getScalarSizeInBits())
17231 continue;
17232 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17233
17234 // Attempt to match a repeating pattern every NumBroadcastElts,
17235 // accounting for UNDEFs but only references the lowest 128-bit
17236 // lane of the inputs.
17237 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17238 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17239 for (int j = 0; j != NumBroadcastElts; ++j) {
17240 int M = Mask[i + j];
17241 if (M < 0)
17242 continue;
17243 int &R = RepeatMask[j];
17244 if (0 != ((M % NumElts) / NumLaneElts))
17245 return false;
17246 if (0 <= R && R != M)
17247 return false;
17248 R = M;
17249 }
17250 return true;
17251 };
17252
17253 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17254 if (!FindRepeatingBroadcastMask(RepeatMask))
17255 continue;
17256
17257 // Shuffle the (lowest) repeated elements in place for broadcast.
17258 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17259
17260 // Shuffle the actual broadcast.
17261 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17262 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17263 for (int j = 0; j != NumBroadcastElts; ++j)
17264 BroadcastMask[i + j] = j;
17265 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17266 BroadcastMask);
17267 }
17268 }
17269
17270 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17271 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17272 return SDValue();
17273
17274 // Bail if we already have a repeated lane shuffle mask.
17275 SmallVector<int, 8> RepeatedShuffleMask;
17276 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
17277 return SDValue();
17278
17279 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17280 // (with PERMQ/PERMPD). On AVX512BW targets, permuting 32-bit sub-lanes, even
17281 // with a variable shuffle, is worth it for 64xi8 vectors. Otherwise we can
17282 // only permute whole 128-bit lanes.
17283 int SubLaneScale = 1;
17284 if (Subtarget.hasAVX2() && VT.is256BitVector())
17285 SubLaneScale = 2;
17286 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17287 SubLaneScale = 4;
17288 int NumSubLanes = NumLanes * SubLaneScale;
17289 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17290
17291 // Check that all the sources are coming from the same lane and see if we can
17292 // form a repeating shuffle mask (local to each sub-lane). At the same time,
17293 // determine the source sub-lane for each destination sub-lane.
17294 int TopSrcSubLane = -1;
17295 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17296 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17297 SubLaneScale,
17298 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17299
17300 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17301 // Extract the sub-lane mask, check that it all comes from the same lane
17302 // and normalize the mask entries to come from the first lane.
17303 int SrcLane = -1;
17304 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17305 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17306 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17307 if (M < 0)
17308 continue;
17309 int Lane = (M % NumElts) / NumLaneElts;
17310 if ((0 <= SrcLane) && (SrcLane != Lane))
17311 return SDValue();
17312 SrcLane = Lane;
17313 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17314 SubLaneMask[Elt] = LocalM;
17315 }
17316
17317 // Whole sub-lane is UNDEF.
17318 if (SrcLane < 0)
17319 continue;
17320
17321 // Attempt to match against the candidate repeated sub-lane masks.
17322 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17323 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17324 for (int i = 0; i != NumSubLaneElts; ++i) {
17325 if (M1[i] < 0 || M2[i] < 0)
17326 continue;
17327 if (M1[i] != M2[i])
17328 return false;
17329 }
17330 return true;
17331 };
17332
17333 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17334 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17335 continue;
17336
17337 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17338 for (int i = 0; i != NumSubLaneElts; ++i) {
17339 int M = SubLaneMask[i];
17340 if (M < 0)
17341 continue;
17342        assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17343               "Unexpected mask element");
17344 RepeatedSubLaneMask[i] = M;
17345 }
17346
17347 // Track the top most source sub-lane - by setting the remaining to UNDEF
17348 // we can greatly simplify shuffle matching.
17349 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17350 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17351 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17352 break;
17353 }
17354
17355 // Bail if we failed to find a matching repeated sub-lane mask.
17356 if (Dst2SrcSubLanes[DstSubLane] < 0)
17357 return SDValue();
17358 }
17359 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17360        "Unexpected source lane");
17361
17362 // Create a repeating shuffle mask for the entire vector.
17363 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17364 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17365 int Lane = SubLane / SubLaneScale;
17366 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17367 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17368 int M = RepeatedSubLaneMask[Elt];
17369 if (M < 0)
17370 continue;
17371 int Idx = (SubLane * NumSubLaneElts) + Elt;
17372 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17373 }
17374 }
17375 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17376
17377 // Shuffle each source sub-lane to its destination.
17378 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17379 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17380 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17381 if (SrcSubLane < 0)
17382 continue;
17383 for (int j = 0; j != NumSubLaneElts; ++j)
17384 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17385 }
17386
17387 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17388 SubLaneMask);
17389}
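A standalone sketch (not LLVM code; the booleans stand in for the subtarget queries) of the sub-lane sizing chosen at lines 17283-17289 for a few representative types:

#include <cstdio>

// VT is described by its total width and scalar width in bits; HasAVX2/HasBWI
// stand in for Subtarget.hasAVX2()/hasBWI(). Illustrative sketch only.
void subLaneSizes(const char *Name, int VecBits, int ScalarBits, bool HasAVX2,
                  bool HasBWI) {
  int NumElts = VecBits / ScalarBits;
  int NumLanes = VecBits / 128;
  int NumLaneElts = NumElts / NumLanes;
  int SubLaneScale = 1;
  if (HasAVX2 && VecBits == 256)
    SubLaneScale = 2; // permute 64-bit sub-lanes with VPERMQ/VPERMPD
  if (HasBWI && VecBits == 512 && ScalarBits == 8)
    SubLaneScale = 4; // v64i8: 32-bit sub-lanes via a variable shuffle
  std::printf("%s: NumSubLanes=%d NumSubLaneElts=%d\n", Name,
              NumLanes * SubLaneScale, NumLaneElts / SubLaneScale);
}

int main() {
  subLaneSizes("v8f32 +AVX2", 256, 32, true, false);  // 4 sub-lanes of 2
  subLaneSizes("v4f64 -AVX2", 256, 64, false, false); // 2 sub-lanes of 4
  subLaneSizes("v64i8 +BWI", 512, 8, false, true);    // 16 sub-lanes of 4
  return 0;
}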
17390
17391static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17392 bool &ForceV1Zero, bool &ForceV2Zero,
17393 unsigned &ShuffleImm, ArrayRef<int> Mask,
17394 const APInt &Zeroable) {
17395 int NumElts = VT.getVectorNumElements();
17396 assert(VT.getScalarSizeInBits() == 64 &&
17397        (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17398        "Unexpected data type for VSHUFPD");
17399 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17400        "Illegal shuffle mask");
17401
17402 bool ZeroLane[2] = { true, true };
17403 for (int i = 0; i < NumElts; ++i)
17404 ZeroLane[i & 1] &= Zeroable[i];
17405
17406 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
17407 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
17408 ShuffleImm = 0;
17409 bool ShufpdMask = true;
17410 bool CommutableMask = true;
17411 for (int i = 0; i < NumElts; ++i) {
17412 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17413 continue;
17414 if (Mask[i] < 0)
17415 return false;
17416 int Val = (i & 6) + NumElts * (i & 1);
17417 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17418 if (Mask[i] < Val || Mask[i] > Val + 1)
17419 ShufpdMask = false;
17420 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17421 CommutableMask = false;
17422 ShuffleImm |= (Mask[i] % 2) << i;
17423 }
17424
17425 if (!ShufpdMask && !CommutableMask)
17426 return false;
17427
17428 if (!ShufpdMask && CommutableMask)
17429 std::swap(V1, V2);
17430
17431 ForceV1Zero = ZeroLane[0];
17432 ForceV2Zero = ZeroLane[1];
17433 return true;
17434}
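A standalone sketch (not LLVM code) of the SHUFPD immediate derivation above, restricted to the simple case with no zeroable or undef elements:

#include <cstdio>
#include <vector>

// Destination element i may only take Val or Val+1, where
// Val = (i & 6) + NumElts * (i & 1), and bit i of the immediate is Mask[i] % 2.
// Illustrative sketch only.
bool shufpdImm(const std::vector<int> &Mask, unsigned &Imm) {
  int NumElts = Mask.size();
  Imm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      return false;
    Imm |= (unsigned)(Mask[i] % 2) << i;
  }
  return true;
}

int main() {
  unsigned Imm;
  // v4f64 mask <0, 5, 2, 7> = {V1[0], V2[1], V1[2], V2[3]} -> imm 0b1010.
  if (shufpdImm({0, 5, 2, 7}, Imm))
    std::printf("imm=0x%x\n", Imm); // imm=0xa
  return 0;
}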
17435
17436static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17437 SDValue V2, ArrayRef<int> Mask,
17438 const APInt &Zeroable,
17439 const X86Subtarget &Subtarget,
17440 SelectionDAG &DAG) {
17441 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17442        "Unexpected data type for VSHUFPD");
17443
17444 unsigned Immediate = 0;
17445 bool ForceV1Zero = false, ForceV2Zero = false;
17446 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17447 Mask, Zeroable))
17448 return SDValue();
17449
17450 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17451 if (ForceV1Zero)
17452 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17453 if (ForceV2Zero)
17454 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17455
17456 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17457 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17458}
17459
17460 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17461 // by zeroable elements in the remaining 24 elements. Turn this into two
17462// vmovqb instructions shuffled together.
17463static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17464 SDValue V1, SDValue V2,
17465 ArrayRef<int> Mask,
17466 const APInt &Zeroable,
17467 SelectionDAG &DAG) {
17468 assert(VT == MVT::v32i8 && "Unexpected type!");
17469
17470 // The first 8 indices should be every 8th element.
17471 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17472 return SDValue();
17473
17474 // Remaining elements need to be zeroable.
17475 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17476 return SDValue();
17477
17478 V1 = DAG.getBitcast(MVT::v4i64, V1);
17479 V2 = DAG.getBitcast(MVT::v4i64, V2);
17480
17481 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17482 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17483
17484 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17485 // the upper bits of the result using an unpckldq.
17486 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17487 { 0, 1, 2, 3, 16, 17, 18, 19,
17488 4, 5, 6, 7, 20, 21, 22, 23 });
17489 // Insert the unpckldq into a zero vector to widen to v32i8.
17490 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17491 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17492 DAG.getIntPtrConstant(0, DL));
17493}
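A standalone sketch (not LLVM code) confirming that the index pattern named in the comment selects the low byte of each 64-bit element of V1 and then V2, which is what the per-operand VTRUNC produces:

#include <cstdio>

// Decompose each byte index of the pattern {0, 8, 16, 24, 32, 40, 48, 56} into
// (operand, 64-bit element, byte within element). Illustrative sketch only.
int main() {
  for (int i = 0; i != 8; ++i) {
    int ByteIdx = i * 8;
    int Operand = ByteIdx / 32;     // 0 = V1, 1 = V2
    int QWord = (ByteIdx % 32) / 8; // which 64-bit element of that operand
    int ByteInQWord = ByteIdx % 8;  // 0 = the low byte (little-endian)
    std::printf("mask[%d]=%d -> V%d qword %d byte %d\n", i, ByteIdx,
                Operand + 1, QWord, ByteInQWord);
  }
  return 0;
}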
17494
17495
17496/// Handle lowering of 4-lane 64-bit floating point shuffles.
17497///
17498/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17499/// isn't available.
17500static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17501 const APInt &Zeroable, SDValue V1, SDValue V2,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17505 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17506 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17507
17508 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17509 Subtarget, DAG))
17510 return V;
17511
17512 if (V2.isUndef()) {
17513 // Check for being able to broadcast a single element.
17514 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17515 Mask, Subtarget, DAG))
17516 return Broadcast;
17517
17518 // Use low duplicate instructions for masks that match their pattern.
17519 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17520 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17521
17522 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17523 // Non-half-crossing single input shuffles can be lowered with an
17524 // interleaved permutation.
17525 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17526 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17527 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17528 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17529 }
17530
17531 // With AVX2 we have direct support for this permutation.
17532 if (Subtarget.hasAVX2())
17533 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17534 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17535
17536 // Try to create an in-lane repeating shuffle mask and then shuffle the
17537 // results into the target lanes.
17538 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17539 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17540 return V;
17541
17542 // Try to permute the lanes and then use a per-lane permute.
17543 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17544 Mask, DAG, Subtarget))
17545 return V;
17546
17547 // Otherwise, fall back.
17548 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17549 DAG, Subtarget);
17550 }
17551
17552 // Use dedicated unpack instructions for masks that match their pattern.
17553 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
17554 return V;
17555
17556 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17557 Zeroable, Subtarget, DAG))
17558 return Blend;
17559
17560 // Check if the blend happens to exactly fit that of SHUFPD.
17561 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17562 Zeroable, Subtarget, DAG))
17563 return Op;
17564
17565 // If we have lane crossing shuffles AND they don't all come from the lower
17566 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17567 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
17568 // canonicalizes to a blend of splat, which isn't necessary for this combine.
17569 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17570 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17571 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17572 (V2.getOpcode() != ISD::BUILD_VECTOR))
17573 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
17574 Mask, DAG))
17575 return Op;
17576
17577 // If we have one input in place, then we can permute the other input and
17578 // blend the result.
17579 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17580 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17581 Subtarget, DAG);
17582
17583 // Try to create an in-lane repeating shuffle mask and then shuffle the
17584 // results into the target lanes.
17585 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17586 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17587 return V;
17588
17589 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17590 // shuffle. However, if we have AVX2 and either input is already in place,
17591 // we will be able to shuffle the other input even across lanes in a single
17592 // instruction, so skip this pattern.
17593 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
17594 isShuffleMaskInputInPlace(1, Mask))))
17595 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
17596 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17597 return V;
17598
17599 // If we have VLX support, we can use VEXPAND.
17600 if (Subtarget.hasVLX())
17601 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
17602 DAG, Subtarget))
17603 return V;
17604
17605 // If we have AVX2 then we always want to lower with a blend because at v4 we
17606 // can fully permute the elements.
17607 if (Subtarget.hasAVX2())
17608 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17609 Subtarget, DAG);
17610
17611 // Otherwise fall back on generic lowering.
17612 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
17613 Subtarget, DAG);
17614}
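A standalone sketch (not LLVM code; the helper name is hypothetical) of the VPERMILPI immediate built at lines 17525-17526 for the non-lane-crossing v4f64 case:

#include <cassert>
#include <cstdio>
#include <vector>

// Bit i of the immediate says whether result element i takes the high (odd)
// double of its 128-bit lane. Illustrative sketch only.
unsigned vpermilpdImm(const std::vector<int> &Mask) {
  assert(Mask.size() == 4);
  return (unsigned)((Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                    ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3));
}

int main() {
  // Swap the two doubles inside each 128-bit lane: <1, 0, 3, 2> -> imm 0b0101.
  std::printf("imm=0x%x\n", vpermilpdImm({1, 0, 3, 2})); // imm=0x5
  return 0;
}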
17615
17616/// Handle lowering of 4-lane 64-bit integer shuffles.
17617///
17618/// This routine is only called when we have AVX2 and thus a reasonable
17619 /// instruction set for v4i64 shuffling.
17620static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17621 const APInt &Zeroable, SDValue V1, SDValue V2,
17622 const X86Subtarget &Subtarget,
17623 SelectionDAG &DAG) {
17624 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17625 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17626 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17627 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17628
17629 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17630 Subtarget, DAG))
17631 return V;
17632
17633 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17634 Zeroable, Subtarget, DAG))
17635 return Blend;
17636
17637 // Check for being able to broadcast a single element.
17638 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17639 Subtarget, DAG))
17640 return Broadcast;
17641
17642 if (V2.isUndef()) {
17643 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17644 // can use lower latency instructions that will operate on both lanes.
17645 SmallVector<int, 2> RepeatedMask;
17646 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17647 SmallVector<int, 4> PSHUFDMask;
17648 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17649 return DAG.getBitcast(
17650 MVT::v4i64,
17651 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17652 DAG.getBitcast(MVT::v8i32, V1),
17653 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17654 }
17655
17656 // AVX2 provides a direct instruction for permuting a single input across
17657 // lanes.
17658 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17659 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17660 }
17661
17662 // Try to use shift instructions.
17663 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17664 Zeroable, Subtarget, DAG))
17665 return Shift;
17666
17667 // If we have VLX support, we can use VALIGN or VEXPAND.
17668 if (Subtarget.hasVLX()) {
17669 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17670 Subtarget, DAG))
17671 return Rotate;
17672
17673 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17674 DAG, Subtarget))
17675 return V;
17676 }
17677
17678 // Try to use PALIGNR.
17679 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17680 Subtarget, DAG))
17681 return Rotate;
17682
17683 // Use dedicated unpack instructions for masks that match their pattern.
17684 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17685 return V;
17686
17687 // If we have one input in place, then we can permute the other input and
17688 // blend the result.
17689 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17690 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17691 Subtarget, DAG);
17692
17693 // Try to create an in-lane repeating shuffle mask and then shuffle the
17694 // results into the target lanes.
17695 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17696 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17697 return V;
17698
17699 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17700 // shuffle. However, if we have AVX2 and either input is already in place,
17701 // we will be able to shuffle the other input even across lanes in a single
17702 // instruction, so skip this pattern.
17703 if (!isShuffleMaskInputInPlace(0, Mask) &&
17704 !isShuffleMaskInputInPlace(1, Mask))
17705 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17706 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17707 return Result;
17708
17709 // Otherwise fall back on generic blend lowering.
17710 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17711 Subtarget, DAG);
17712}
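A standalone sketch (not LLVM code; simplified sentinel handling is assumed) of what narrowShuffleMaskElts(2, ...) does on the PSHUFD path above, where each 64-bit index expands to two 32-bit indices:

#include <cstdio>
#include <vector>

// Each 64-bit index M becomes the pair {2*M, 2*M+1}; undef (-1) stays undef.
// Illustrative sketch only.
std::vector<int> narrowMaskBy2(const std::vector<int> &Mask) {
  std::vector<int> Narrowed;
  for (int M : Mask) {
    if (M < 0) {
      Narrowed.push_back(-1);
      Narrowed.push_back(-1);
    } else {
      Narrowed.push_back(2 * M);
      Narrowed.push_back(2 * M + 1);
    }
  }
  return Narrowed;
}

int main() {
  // Repeated per-lane v2i64 mask <1, 0> becomes the PSHUFD mask <2, 3, 0, 1>.
  for (int M : narrowMaskBy2({1, 0}))
    std::printf("%d ", M); // 2 3 0 1
  std::printf("\n");
  return 0;
}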
17713
17714/// Handle lowering of 8-lane 32-bit floating point shuffles.
17715///
17716/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17717/// isn't available.
17718static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17719 const APInt &Zeroable, SDValue V1, SDValue V2,
17720 const X86Subtarget &Subtarget,
17721 SelectionDAG &DAG) {
17722 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17723 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17724 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17725
17726 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17727 Zeroable, Subtarget, DAG))
17728 return Blend;
17729
17730 // Check for being able to broadcast a single element.
17731 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17732 Subtarget, DAG))
17733 return Broadcast;
17734
17735 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17736 // options to efficiently lower the shuffle.
17737 SmallVector<int, 4> RepeatedMask;
17738 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17739 assert(RepeatedMask.size() == 4 &&
17740        "Repeated masks must be half the mask width!");
17741
17742 // Use even/odd duplicate instructions for masks that match their pattern.
17743 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17744 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17745 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17746 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17747
17748 if (V2.isUndef())
17749 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17750 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17751
17752 // Use dedicated unpack instructions for masks that match their pattern.
17753 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17754 return V;
17755
17756 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17757 // have already handled any direct blends.
17758 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17759 }
17760
17761 // Try to create an in-lane repeating shuffle mask and then shuffle the
17762 // results into the target lanes.
17763 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17764 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17765 return V;
17766
17767 // If we have a single input shuffle with different shuffle patterns in the
17768 // two 128-bit lanes use the variable mask to VPERMILPS.
17769 if (V2.isUndef()) {
17770 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17771 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17772 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17773 }
17774 if (Subtarget.hasAVX2()) {
17775 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17776 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17777 }
17778 // Otherwise, fall back.
17779 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17780 DAG, Subtarget);
17781 }
17782
17783 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17784 // shuffle.
17785 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17786 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17787 return Result;
17788
17789 // If we have VLX support, we can use VEXPAND.
17790 if (Subtarget.hasVLX())
17791 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17792 DAG, Subtarget))
17793 return V;
17794
17795 // For non-AVX512, if the mask is of 16-bit elements within each lane then try
17796 // to split, since after splitting we get more efficient code using the
17797 // vpunpcklwd and vpunpckhwd instructions than with vblend.
17798 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17799 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17800 DAG);
17801
17802 // If we have AVX2 then we always want to lower with a blend because at v8 we
17803 // can fully permute the elements.
17804 if (Subtarget.hasAVX2())
17805 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17806 Subtarget, DAG);
17807
17808 // Otherwise fall back on generic lowering.
17809 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17810 Subtarget, DAG);
17811}
17812
17813/// Handle lowering of 8-lane 32-bit integer shuffles.
17814///
17815/// This routine is only called when we have AVX2 and thus a reasonable
17816 /// instruction set for v8i32 shuffling.
17817static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17818 const APInt &Zeroable, SDValue V1, SDValue V2,
17819 const X86Subtarget &Subtarget,
17820 SelectionDAG &DAG) {
17821 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17822 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17823 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17824 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17825
17826 // Whenever we can lower this as a zext, that instruction is strictly faster
17827 // than any alternative. It also allows us to fold memory operands into the
17828 // shuffle in many cases.
17829 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17830 Zeroable, Subtarget, DAG))
17831 return ZExt;
17832
17833 // For non-AVX512, if the mask is of 16-bit elements in-lane then try to split,
17834 // since after the split we get more efficient code than vblend by using
17835 // vpunpcklwd and vpunpckhwd instructions.
17836 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17837 !Subtarget.hasAVX512())
17838 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17839 DAG);
17840
17841 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17842 Zeroable, Subtarget, DAG))
17843 return Blend;
17844
17845 // Check for being able to broadcast a single element.
17846 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17847 Subtarget, DAG))
17848 return Broadcast;
17849
17850 // If the shuffle mask is repeated in each 128-bit lane we can use more
17851 // efficient instructions that mirror the shuffles across the two 128-bit
17852 // lanes.
17853 SmallVector<int, 4> RepeatedMask;
17854 bool Is128BitLaneRepeatedShuffle =
17855 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17856 if (Is128BitLaneRepeatedShuffle) {
17857 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17858 if (V2.isUndef())
17859 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17860 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17861
17862 // Use dedicated unpack instructions for masks that match their pattern.
17863 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17864 return V;
17865 }
17866
17867 // Try to use shift instructions.
17868 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17869 Zeroable, Subtarget, DAG))
17870 return Shift;
17871
17872 // If we have VLX support, we can use VALIGN or EXPAND.
17873 if (Subtarget.hasVLX()) {
17874 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17875 Subtarget, DAG))
17876 return Rotate;
17877
17878 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17879 DAG, Subtarget))
17880 return V;
17881 }
17882
17883 // Try to use byte rotation instructions.
17884 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17885 Subtarget, DAG))
17886 return Rotate;
17887
17888 // Try to create an in-lane repeating shuffle mask and then shuffle the
17889 // results into the target lanes.
17890 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17891 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17892 return V;
17893
17894 if (V2.isUndef()) {
17895 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17896 // because that should be faster than the variable permute alternatives.
17897 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17898 return V;
17899
17900 // If the shuffle patterns aren't repeated but it's a single input, directly
17901 // generate a cross-lane VPERMD instruction.
17902 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17903 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17904 }
17905
17906 // Assume that a single SHUFPS is faster than an alternative sequence of
17907 // multiple instructions (even if the CPU has a domain penalty).
17908 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17909 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17910 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17911 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17912 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17913 CastV1, CastV2, DAG);
17914 return DAG.getBitcast(MVT::v8i32, ShufPS);
17915 }
17916
17917 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17918 // shuffle.
17919 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17920 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17921 return Result;
17922
17923 // Otherwise fall back on generic blend lowering.
17924 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17925 Subtarget, DAG);
17926}
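Both the PSHUFD path and the SHUFPS fallback above ultimately encode the 4-element repeated mask as a 2-bits-per-element immediate. Assuming a repeated mask of {3, 1, 2, 0}, the packed immediate works out to 0x27. The standalone sketch below reproduces that packing; it is an illustration of the expected encoding, not the LLVM helper getV4X86ShuffleImm8ForMask itself:

  #include <cstdio>

  int main() {
    // Repeated per-128-bit-lane mask, 4 elements, values 0..3.
    int RepeatedMask[4] = {3, 1, 2, 0};
    unsigned Imm = 0;
    for (int i = 0; i < 4; ++i)
      Imm |= (unsigned)RepeatedMask[i] << (i * 2); // 2 bits per element
    std::printf("pshufd/vpermilps immediate = 0x%02X\n", Imm); // prints 0x27
  }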
17927
17928/// Handle lowering of 16-lane 16-bit integer shuffles.
17929///
17930/// This routine is only called when we have AVX2 and thus a reasonable
17931 /// instruction set for v16i16 shuffling.
17932static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17933 const APInt &Zeroable, SDValue V1, SDValue V2,
17934 const X86Subtarget &Subtarget,
17935 SelectionDAG &DAG) {
17936 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
    1: '?' condition is true
17937 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
    2: '?' condition is true
17938 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
    3: Assuming the condition is true
    4: '?' condition is true
17939 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
    5: '?' condition is true
17940
17941 // Whenever we can lower this as a zext, that instruction is strictly faster
17942 // than any alternative. It also allows us to fold memory operands into the
17943 // shuffle in many cases.
17944 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
    6: Taking false branch
17945 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17946 return ZExt;
17947
17948 // Check for being able to broadcast a single element.
17949 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
    7: Taking false branch
17950 Subtarget, DAG))
17951 return Broadcast;
17952
17953 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
    8: Taking false branch
17954 Zeroable, Subtarget, DAG))
17955 return Blend;
17956
17957 // Use dedicated unpack instructions for masks that match their pattern.
17958 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    9: Taking false branch
17959 return V;
17960
17961 // Use dedicated pack instructions for masks that match their pattern.
17962 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
    10: Taking false branch
17963 Subtarget))
17964 return V;
17965
17966 // Try to lower using a truncation.
17967 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
    11: Taking false branch
17968 Subtarget, DAG))
17969 return V;
17970
17971 // Try to use shift instructions.
17972 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
    12: Taking false branch
17973 Zeroable, Subtarget, DAG))
17974 return Shift;
17975
17976 // Try to use byte rotation instructions.
17977 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
    13: Taking false branch
17978 Subtarget, DAG))
17979 return Rotate;
17980
17981 // Try to create an in-lane repeating shuffle mask and then shuffle the
17982 // results into the target lanes.
17983 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
    14: Taking false branch
17984 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17985 return V;
17986
17987 if (V2.isUndef()) {
    15: Taking false branch
17988 // Try to use bit rotation instructions.
17989 if (SDValue Rotate =
17990 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17991 return Rotate;
17992
17993 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17994 // because that should be faster than the variable permute alternatives.
17995 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17996 return V;
17997
17998 // There are no generalized cross-lane shuffle operations available on i16
17999 // element types.
18000 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18001 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18002 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18003 return V;
18004
18005 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18006 DAG, Subtarget);
18007 }
18008
18009 SmallVector<int, 8> RepeatedMask;
18010 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18011 // As this is a single-input shuffle, the repeated mask should be
18012 // a strictly valid v8i16 mask that we can pass through to the v8i16
18013 // lowering to handle even the v16 case.
18014 return lowerV8I16GeneralSingleInputShuffle(
18015 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18016 }
18017 }
18018
18019 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
    16: Taking false branch
18020 Zeroable, Subtarget, DAG))
18021 return PSHUFB;
18022
18023 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18024 if (Subtarget.hasBWI())
    17: Assuming the condition is false
    18: Taking false branch
18025 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18026
18027 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18028 // shuffle.
18029 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
    19: Calling 'lowerShuffleAsLanePermuteAndRepeatedMask'
18030 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18031 return Result;
18032
18033 // Try to permute the lanes and then use a per-lane permute.
18034 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18035 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18036 return V;
18037
18038 // Otherwise fall back on generic lowering.
18039 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18040 Subtarget, DAG);
18041}
18042
18043/// Handle lowering of 32-lane 8-bit integer shuffles.
18044///
18045/// This routine is only called when we have AVX2 and thus a reasonable
18046 /// instruction set for v32i8 shuffling.
18047static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18048 const APInt &Zeroable, SDValue V1, SDValue V2,
18049 const X86Subtarget &Subtarget,
18050 SelectionDAG &DAG) {
18051 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18052 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18053 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18054 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18055
18056 // Whenever we can lower this as a zext, that instruction is strictly faster
18057 // than any alternative. It also allows us to fold memory operands into the
18058 // shuffle in many cases.
18059 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18060 Zeroable, Subtarget, DAG))
18061 return ZExt;
18062
18063 // Check for being able to broadcast a single element.
18064 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18065 Subtarget, DAG))
18066 return Broadcast;
18067
18068 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18069 Zeroable, Subtarget, DAG))
18070 return Blend;
18071
18072 // Use dedicated unpack instructions for masks that match their pattern.
18073 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18074 return V;
18075
18076 // Use dedicated pack instructions for masks that match their pattern.
18077 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18078 Subtarget))
18079 return V;
18080
18081 // Try to lower using a truncation.
18082 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18083 Subtarget, DAG))
18084 return V;
18085
18086 // Try to use shift instructions.
18087 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
18088 Zeroable, Subtarget, DAG))
18089 return Shift;
18090
18091 // Try to use byte rotation instructions.
18092 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18093 Subtarget, DAG))
18094 return Rotate;
18095
18096 // Try to use bit rotation instructions.
18097 if (V2.isUndef())
18098 if (SDValue Rotate =
18099 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18100 return Rotate;
18101
18102 // Try to create an in-lane repeating shuffle mask and then shuffle the
18103 // results into the target lanes.
18104 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18105 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18106 return V;
18107
18108 // There are no generalized cross-lane shuffle operations available on i8
18109 // element types.
18110 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18111 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18112 // because that should be faster than the variable permute alternatives.
18113 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18114 return V;
18115
18116 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18117 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18118 return V;
18119
18120 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18121 DAG, Subtarget);
18122 }
18123
18124 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18125 Zeroable, Subtarget, DAG))
18126 return PSHUFB;
18127
18128 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18129 if (Subtarget.hasVBMI())
18130 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18131
18132 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18133 // shuffle.
18134 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18135 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18136 return Result;
18137
18138 // Try to permute the lanes and then use a per-lane permute.
18139 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18140 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18141 return V;
18142
18143 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18144 // by zeroable elements in the remaining 24 elements. Turn this into two
18145 // vmovqb instructions shuffled together.
18146 if (Subtarget.hasVLX())
18147 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18148 Mask, Zeroable, DAG))
18149 return V;
18150
18151 // Otherwise fall back on generic lowering.
18152 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18153 Subtarget, DAG);
18154}
18155
18156/// High-level routine to lower various 256-bit x86 vector shuffles.
18157///
18158/// This routine either breaks down the specific type of a 256-bit x86 vector
18159/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18160/// together based on the available instructions.
18161static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18162 SDValue V1, SDValue V2, const APInt &Zeroable,
18163 const X86Subtarget &Subtarget,
18164 SelectionDAG &DAG) {
18165 // If we have a single input to the zero element, insert that into V1 if we
18166 // can do so cheaply.
18167 int NumElts = VT.getVectorNumElements();
18168 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18169
18170 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18171 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18172 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18173 return Insertion;
18174
18175 // Handle special cases where the lower or upper half is UNDEF.
18176 if (SDValue V =
18177 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18178 return V;
18179
18180 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18181 // can check for those subtargets here and avoid much of the subtarget
18182 // querying in the per-vector-type lowering routines. With AVX1 we have
18183 // essentially *zero* ability to manipulate a 256-bit vector with integer
18184 // types. Since we'll use floating point types there eventually, just
18185 // immediately cast everything to a float and operate entirely in that domain.
18186 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18187 int ElementBits = VT.getScalarSizeInBits();
18188 if (ElementBits < 32) {
18189 // No floating point type available, if we can't use the bit operations
18190 // for masking/blending then decompose into 128-bit vectors.
18191 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18192 Subtarget, DAG))
18193 return V;
18194 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18195 return V;
18196 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18197 }
18198
18199 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18200 VT.getVectorNumElements());
18201 V1 = DAG.getBitcast(FpVT, V1);
18202 V2 = DAG.getBitcast(FpVT, V2);
18203 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18204 }
18205
18206 if (VT == MVT::v16f16) {
18207 V1 = DAG.getBitcast(MVT::v16i16, V1);
18208 V2 = DAG.getBitcast(MVT::v16i16, V2);
18209 return DAG.getBitcast(MVT::v16f16,
18210 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18211 }
18212
18213 switch (VT.SimpleTy) {
18214 case MVT::v4f64:
18215 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18216 case MVT::v4i64:
18217 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18218 case MVT::v8f32:
18219 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18220 case MVT::v8i32:
18221 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18222 case MVT::v16i16:
18223 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18224 case MVT::v32i8:
18225 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18226
18227 default:
18228 llvm_unreachable("Not a valid 256-bit x86 vector type!");
18229 }
18230}
18231
18232/// Try to lower a vector shuffle as a 128-bit shuffles.
18233static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18234 const APInt &Zeroable, SDValue V1, SDValue V2,
18235 const X86Subtarget &Subtarget,
18236 SelectionDAG &DAG) {
18237 assert(VT.getScalarSizeInBits() == 64 &&
18238        "Unexpected element type size for 128bit shuffle.");
18239
18240 // Handling a 256-bit vector requires VLX, and most probably the
18241 // function lowerV2X128VectorShuffle() is a better solution.
18242 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
18243
18244 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
18245 SmallVector<int, 4> Widened128Mask;
18246 if (!canWidenShuffleElements(Mask, Widened128Mask))
18247 return SDValue();
18248 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
18249
18250 // Try to use an insert into a zero vector.
18251 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
18252 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
18253 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
18254 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
18255 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
18256 DAG.getIntPtrConstant(0, DL));
18257 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18258 getZeroVector(VT, Subtarget, DAG, DL), LoV,
18259 DAG.getIntPtrConstant(0, DL));
18260 }
18261
18262 // Check for patterns which can be matched with a single insert of a 256-bit
18263 // subvector.
18264 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
18265 if (OnlyUsesV1 ||
18266 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
18267 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
18268 SDValue SubVec =
18269 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
18270 DAG.getIntPtrConstant(0, DL));
18271 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
18272 DAG.getIntPtrConstant(4, DL));
18273 }
18274
18275 // See if this is an insertion of the lower 128-bits of V2 into V1.
18276 bool IsInsert = true;
18277 int V2Index = -1;
18278 for (int i = 0; i < 4; ++i) {
18279 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18280 if (Widened128Mask[i] < 0)
18281 continue;
18282
18283 // Make sure all V1 subvectors are in place.
18284 if (Widened128Mask[i] < 4) {
18285 if (Widened128Mask[i] != i) {
18286 IsInsert = false;
18287 break;
18288 }
18289 } else {
18290 // Make sure we only have a single V2 index and its the lowest 128-bits.
18291 if (V2Index >= 0 || Widened128Mask[i] != 4) {
18292 IsInsert = false;
18293 break;
18294 }
18295 V2Index = i;
18296 }
18297 }
18298 if (IsInsert && V2Index >= 0) {
18299 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
18300 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
18301 DAG.getIntPtrConstant(0, DL));
18302 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
18303 }
18304
18305 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
18306 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
18307 // possible we at least ensure the lanes stay sequential to help later
18308 // combines.
18309 SmallVector<int, 2> Widened256Mask;
18310 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
18311 Widened128Mask.clear();
18312 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
18313 }
18314
18315 // Try to lower to vshuf64x2/vshuf32x4.
18316 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
18317 unsigned PermMask = 0;
18318 // Ensure elements came from the same Op.
18319 for (int i = 0; i < 4; ++i) {
18320 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18321 if (Widened128Mask[i] < 0)
18322 continue;
18323
18324 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
18325 unsigned OpIndex = i / 2;
18326 if (Ops[OpIndex].isUndef())
18327 Ops[OpIndex] = Op;
18328 else if (Ops[OpIndex] != Op)
18329 return SDValue();
18330
18331 // Convert the 128-bit shuffle mask selection values into 128-bit selection
18332 // bits defined by a vshuf64x2 instruction's immediate control byte.
18333 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
18334 }
18335
18336 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
18337 DAG.getTargetConstant(PermMask, DL, MVT::i8));
18338}
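As a worked example of the immediate assembled in the loop above: assuming a widened 128-bit mask of {0, 1, 4, 5} (both lanes of V1 followed by both lanes of V2), each 2-bit field receives Widened128Mask[i] % 4, giving PermMask = 0x44 with Ops resolving to {V1, V2}. The standalone sketch below reproduces just the immediate computation and is not part of the source file:

  #include <cstdio>

  int main() {
    // Widened 128-bit-lane selections: {0, 1} come from V1, {4, 5} from V2.
    int Widened128Mask[4] = {0, 1, 4, 5};
    unsigned PermMask = 0;
    for (int i = 0; i < 4; ++i)
      PermMask |= (Widened128Mask[i] % 4) << (i * 2); // 2 bits per 128-bit lane
    std::printf("vshuf64x2 immediate = 0x%02X\n", PermMask); // prints 0x44
  }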
18339
18340/// Handle lowering of 8-lane 64-bit floating point shuffles.
18341static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18342 const APInt &Zeroable, SDValue V1, SDValue V2,
18343 const X86Subtarget &Subtarget,
18344 SelectionDAG &DAG) {
18345 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18346 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18347 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18348
18349 if (V2.isUndef()) {
18350 // Use low duplicate instructions for masks that match their pattern.
18351 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18352 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
18353
18354 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
18355 // Non-half-crossing single input shuffles can be lowered with an
18356 // interleaved permutation.
18357 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18358 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18359 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18360 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18361 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18362 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18363 }
18364
18365 SmallVector<int, 4> RepeatedMask;
18366 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18367 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18368 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18369 }
18370
18371 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18372 V2, Subtarget, DAG))
18373 return Shuf128;
18374
18375 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18376 return Unpck;
18377
18378 // Check if the blend happens to exactly fit that of SHUFPD.
18379 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18380 Zeroable, Subtarget, DAG))
18381 return Op;
18382
18383 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18384 DAG, Subtarget))
18385 return V;
18386
18387 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18388 Zeroable, Subtarget, DAG))
18389 return Blend;
18390
18391 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18392}
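The VPERMILPMask built above sets one bit per v8f64 element whenever that element selects the odd member of its 64-bit pair; for the in-lane swap mask <1,0,3,2,5,4,7,6> every even-position test fires and the immediate comes out as 0x55. The standalone sketch below simply re-states that expression outside the compiler, as an illustration rather than the source file itself:

  #include <cstdio>

  int main() {
    // Swap each adjacent pair of doubles within every 128-bit lane.
    int Mask[8] = {1, 0, 3, 2, 5, 4, 7, 6};
    unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                            ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                            ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                            ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
    std::printf("vpermilpd immediate = 0x%02X\n", VPERMILPMask); // prints 0x55
  }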
18393
18394/// Handle lowering of 16-lane 32-bit floating point shuffles.
18395static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18396 const APInt &Zeroable, SDValue V1, SDValue V2,
18397 const X86Subtarget &Subtarget,
18398 SelectionDAG &DAG) {
18399 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18400 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18401 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18402
18403 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18404 // options to efficiently lower the shuffle.
18405 SmallVector<int, 4> RepeatedMask;
18406 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18407 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18408
18409 // Use even/odd duplicate instructions for masks that match their pattern.
18410 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18411 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18412 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18413 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18414
18415 if (V2.isUndef())
18416 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18417 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18418
18419 // Use dedicated unpack instructions for masks that match their pattern.
18420 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
18421 return V;
18422
18423 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18424 Zeroable, Subtarget, DAG))
18425 return Blend;
18426
18427 // Otherwise, fall back to a SHUFPS sequence.
18428 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18429 }
18430
18431 // Try to create an in-lane repeating shuffle mask and then shuffle the
18432 // results into the target lanes.
18433 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18434 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18435 return V;
18436
18437 // If we have a single input shuffle with different shuffle patterns in the
18438 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
18439 if (V2.isUndef() &&
18440 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18441 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18442 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18443 }
18444
18445 // If we have AVX512F support, we can use VEXPAND.
18446 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
18447 V1, V2, DAG, Subtarget))
18448 return V;
18449
18450 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18451}
18452
18453/// Handle lowering of 8-lane 64-bit integer shuffles.
18454static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18455 const APInt &Zeroable, SDValue V1, SDValue V2,
18456 const X86Subtarget &Subtarget,
18457 SelectionDAG &DAG) {
18458 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18459 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18460 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18461
18462 if (V2.isUndef()) {
18463 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18464 // can use lower latency instructions that will operate on all four
18465 // 128-bit lanes.
18466 SmallVector<int, 2> Repeated128Mask;
18467 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18468 SmallVector<int, 4> PSHUFDMask;
18469 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18470 return DAG.getBitcast(
18471 MVT::v8i64,
18472 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18473 DAG.getBitcast(MVT::v16i32, V1),
18474 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18475 }
18476
18477 SmallVector<int, 4> Repeated256Mask;
18478 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18479 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18480 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18481 }
18482
18483 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18484 V2, Subtarget, DAG))
18485 return Shuf128;
18486
18487 // Try to use shift instructions.
18488 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
18489 Zeroable, Subtarget, DAG))
18490 return Shift;
18491
18492 // Try to use VALIGN.
18493 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18494 Subtarget, DAG))
18495 return Rotate;
18496
18497 // Try to use PALIGNR.
18498 if (Subtarget.hasBWI())
18499 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18500 Subtarget, DAG))
18501 return Rotate;
18502
18503 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
18504 return Unpck;
18505
18506 // If we have AVX512F support, we can use VEXPAND.
18507 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
18508 DAG, Subtarget))
18509 return V;
18510
18511 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18512 Zeroable, Subtarget, DAG))
18513 return Blend;
18514
18515 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18516}
18517
18518/// Handle lowering of 16-lane 32-bit integer shuffles.
18519static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18520 const APInt &Zeroable, SDValue V1, SDValue V2,
18521 const X86Subtarget &Subtarget,
18522 SelectionDAG &DAG) {
18523 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18524 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18525 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18526
18527 // Whenever we can lower this as a zext, that instruction is strictly faster
18528 // than any alternative. It also allows us to fold memory operands into the
18529 // shuffle in many cases.
18530 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18531 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18532 return ZExt;
18533
18534 // If the shuffle mask is repeated in each 128-bit lane we can use more
18535 // efficient instructions that mirror the shuffles across the four 128-bit
18536 // lanes.
18537 SmallVector<int, 4> RepeatedMask;
18538 bool Is128BitLaneRepeatedShuffle =
18539 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18540 if (Is128BitLaneRepeatedShuffle) {
18541 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18542 if (V2.isUndef())
18543 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18544 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18545
18546 // Use dedicated unpack instructions for masks that match their pattern.
18547 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
18548 return V;
18549 }
18550
18551 // Try to use shift instructions.
18552 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
18553 Zeroable, Subtarget, DAG))
18554 return Shift;
18555
18556 // Try to use VALIGN.
18557 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18558 Subtarget, DAG))
18559 return Rotate;
18560
18561 // Try to use byte rotation instructions.
18562 if (Subtarget.hasBWI())
18563 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18564 Subtarget, DAG))
18565 return Rotate;
18566
18567 // Assume that a single SHUFPS is faster than using a permv shuffle.
18568 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18569 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18570 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18571 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18572 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18573 CastV1, CastV2, DAG);
18574 return DAG.getBitcast(MVT::v16i32, ShufPS);
18575 }
18576
18577 // Try to create an in-lane repeating shuffle mask and then shuffle the
18578 // results into the target lanes.
18579 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18580 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18581 return V;
18582
18583 // If we have AVX512F support, we can use VEXPAND.
18584 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
18585 DAG, Subtarget))
18586 return V;
18587
18588 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18589 Zeroable, Subtarget, DAG))
18590 return Blend;
18591
18592 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18593}
18594
18595/// Handle lowering of 32-lane 16-bit integer shuffles.
18596static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18597 const APInt &Zeroable, SDValue V1, SDValue V2,
18598 const X86Subtarget &Subtarget,
18599 SelectionDAG &DAG) {
18600 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18601 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18602 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18603 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18604
18605 // Whenever we can lower this as a zext, that instruction is strictly faster
18606 // than any alternative. It also allows us to fold memory operands into the
18607 // shuffle in many cases.
18608 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18609 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18610 return ZExt;
18611
18612 // Use dedicated unpack instructions for masks that match their pattern.
18613 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
18614 return V;
18615
18616 // Use dedicated pack instructions for masks that match their pattern.
18617 if (SDValue V =
18618 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
18619 return V;
18620
18621 // Try to use shift instructions.
18622 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
18623 Zeroable, Subtarget, DAG))
18624 return Shift;
18625
18626 // Try to use byte rotation instructions.
18627 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18628 Subtarget, DAG))
18629 return Rotate;
18630
18631 if (V2.isUndef()) {
18632 // Try to use bit rotation instructions.
18633 if (SDValue Rotate =
18634 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18635 return Rotate;
18636
18637 SmallVector<int, 8> RepeatedMask;
18638 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18639 // As this is a single-input shuffle, the repeated mask should be
18640 // a strictly valid v8i16 mask that we can pass through to the v8i16
18641 // lowering to handle even the v32 case.
18642 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18643 RepeatedMask, Subtarget, DAG);
18644 }
18645 }
18646
18647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18648 Zeroable, Subtarget, DAG))
18649 return Blend;
18650
18651 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18652 Zeroable, Subtarget, DAG))
18653 return PSHUFB;
18654
18655 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18656}
18657
18658/// Handle lowering of 64-lane 8-bit integer shuffles.
18659static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18660 const APInt &Zeroable, SDValue V1, SDValue V2,
18661 const X86Subtarget &Subtarget,
18662 SelectionDAG &DAG) {
18663 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18664 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18665 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18666 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18667
18668 // Whenever we can lower this as a zext, that instruction is strictly faster
18669 // than any alternative. It also allows us to fold memory operands into the
18670 // shuffle in many cases.
18671 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18672 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18673 return ZExt;
18674
18675 // Use dedicated unpack instructions for masks that match their pattern.
18676 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18677 return V;
18678
18679 // Use dedicated pack instructions for masks that match their pattern.
18680 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18681 Subtarget))
18682 return V;
18683
18684 // Try to use shift instructions.
18685 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18686 Zeroable, Subtarget, DAG))
18687 return Shift;
18688
18689 // Try to use byte rotation instructions.
18690 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18691 Subtarget, DAG))
18692 return Rotate;
18693
18694 // Try to use bit rotation instructions.
18695 if (V2.isUndef())
18696 if (SDValue Rotate =
18697 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18698 return Rotate;
18699
18700 // Lower as AND if possible.
18701 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18702 Zeroable, Subtarget, DAG))
18703 return Masked;
18704
18705 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18706 Zeroable, Subtarget, DAG))
18707 return PSHUFB;
18708
18709 // Try to create an in-lane repeating shuffle mask and then shuffle the
18710 // results into the target lanes.
18711 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18712 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18713 return V;
18714
18715 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
18716 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18717 return Result;
18718
18719 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18720 Zeroable, Subtarget, DAG))
18721 return Blend;
18722
18723 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18724 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18725 // PALIGNR will be cheaper than the second PSHUFB+OR.
18726 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18727 Mask, Subtarget, DAG))
18728 return V;
18729
18730 // If we can't directly blend but can use PSHUFB, that will be better as it
18731 // can both shuffle and set up the inefficient blend.
18732 bool V1InUse, V2InUse;
18733 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18734 DAG, V1InUse, V2InUse);
18735 }
18736
18737 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18738 // shuffle.
18739 if (!V2.isUndef())
18740 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18741 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18742 return Result;
18743
18744 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18745 if (Subtarget.hasVBMI())
18746 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18747
18748 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18749}
18750
18751/// High-level routine to lower various 512-bit x86 vector shuffles.
18752///
18753/// This routine either breaks down the specific type of a 512-bit x86 vector
18754/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18755/// together based on the available instructions.
18756static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18757 MVT VT, SDValue V1, SDValue V2,
18758 const APInt &Zeroable,
18759 const X86Subtarget &Subtarget,
18760 SelectionDAG &DAG) {
18761 assert(Subtarget.hasAVX512() &&
18762        "Cannot lower 512-bit vectors w/ basic ISA!");
18763
18764 // If we have a single input to the zero element, insert that into V1 if we
18765 // can do so cheaply.
18766 int NumElts = Mask.size();
18767 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18768
18769 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18770 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18771 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18772 return Insertion;
18773
18774 // Handle special cases where the lower or upper half is UNDEF.
18775 if (SDValue V =
18776 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18777 return V;
18778
18779 // Check for being able to broadcast a single element.
18780 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18781 Subtarget, DAG))
18782 return Broadcast;
18783
18784 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18785 // Try using bit ops for masking and blending before falling back to
18786 // splitting.
18787 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18788 Subtarget, DAG))
18789 return V;
18790 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18791 return V;
18792
18793 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18794 }
18795
18796 if (VT == MVT::v32f16) {
18797 V1 = DAG.getBitcast(MVT::v32i16, V1);
18798 V2 = DAG.getBitcast(MVT::v32i16, V2);
18799 return DAG.getBitcast(MVT::v32f16,
18800 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18801 }
18802
18803 // Dispatch to each element type for lowering. If we don't have support for
18804 // specific element type shuffles at 512 bits, immediately split them and
18805 // lower them. Each lowering routine of a given type is allowed to assume that
18806 // the requisite ISA extensions for that element type are available.
18807 switch (VT.SimpleTy) {
18808 case MVT::v8f64:
18809 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18810 case MVT::v16f32:
18811 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18812 case MVT::v8i64:
18813 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18814 case MVT::v16i32:
18815 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18816 case MVT::v32i16:
18817 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18818 case MVT::v64i8:
18819 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18820
18821 default:
18822 llvm_unreachable("Not a valid 512-bit x86 vector type!");
18823 }
18824}
18825
18826static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18827 MVT VT, SDValue V1, SDValue V2,
18828 const X86Subtarget &Subtarget,
18829 SelectionDAG &DAG) {
18830 // Shuffle should be unary.
18831 if (!V2.isUndef())
18832 return SDValue();
18833
18834 int ShiftAmt = -1;
18835 int NumElts = Mask.size();
18836 for (int i = 0; i != NumElts; ++i) {
18837 int M = Mask[i];
18838 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18839        "Unexpected mask index.");
18840 if (M < 0)
18841 continue;
18842
18843 // The first non-undef element determines our shift amount.
18844 if (ShiftAmt < 0) {
18845 ShiftAmt = M - i;
18846 // Need to be shifting right.
18847 if (ShiftAmt <= 0)
18848 return SDValue();
18849 }
18850 // All non-undef elements must shift by the same amount.
18851 if (ShiftAmt != M - i)
18852 return SDValue();
18853 }
18854 assert(ShiftAmt >= 0 && "All undef?")(static_cast <bool> (ShiftAmt >= 0 && "All undef?"
) ? void (0) : __assert_fail ("ShiftAmt >= 0 && \"All undef?\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 18854, __extension__
__PRETTY_FUNCTION__))
;
18855
18856 // Great, we found a shift right.
18857 MVT WideVT = VT;
18858 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18859 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18860 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18861 DAG.getUNDEF(WideVT), V1,
18862 DAG.getIntPtrConstant(0, DL));
18863 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18864 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18865 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18866 DAG.getIntPtrConstant(0, DL));
18867}
18868
18869// Determine if this shuffle can be implemented with a KSHIFT instruction.
18870// Returns the shift amount if possible or -1 if not. This is a simplified
18871// version of matchShuffleAsShift.
18872static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18873 int MaskOffset, const APInt &Zeroable) {
18874 int Size = Mask.size();
18875
18876 auto CheckZeros = [&](int Shift, bool Left) {
18877 for (int j = 0; j < Shift; ++j)
18878 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18879 return false;
18880
18881 return true;
18882 };
18883
18884 auto MatchShift = [&](int Shift, bool Left) {
18885 unsigned Pos = Left ? Shift : 0;
18886 unsigned Low = Left ? 0 : Shift;
18887 unsigned Len = Size - Shift;
18888 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18889 };
18890
18891 for (int Shift = 1; Shift != Size; ++Shift)
18892 for (bool Left : {true, false})
18893 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18894 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18895 return Shift;
18896 }
18897
18898 return -1;
18899}
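
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): the mask
// matching that match1BitShuffleAsKSHIFT performs above, re-expressed over
// plain std::vector so it can be compiled and tested outside of LLVM. The
// function name matchMaskAsShift is an invented one, undef mask elements are
// modelled as -1, "zeroable" lanes as a separate bit vector, and the
// MaskOffset handling for the second source is omitted.
#include <cstdio>
#include <vector>

// Returns the shift amount if Mask is the identity mask shifted left or right
// with zeroable lanes shifted in, or -1 otherwise.
static int matchMaskAsShift(const std::vector<int> &Mask,
                            const std::vector<bool> &Zeroable, bool &Left) {
  int Size = (int)Mask.size();
  for (int Shift = 1; Shift != Size; ++Shift) {
    for (bool L : {true, false}) {
      // Lanes that the shift fills in must be known zero.
      bool ZerosOK = true;
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[j + (L ? 0 : (Size - Shift))])
          ZerosOK = false;
      if (!ZerosOK)
        continue;
      // The surviving lanes must be a contiguous run of source elements
      // (undef lanes are allowed anywhere in that run).
      int Pos = L ? Shift : 0, Low = L ? 0 : Shift, Len = Size - Shift;
      bool SeqOK = true;
      for (int j = 0; j < Len; ++j)
        if (Mask[Pos + j] != -1 && Mask[Pos + j] != Low + j)
          SeqOK = false;
      if (SeqOK) {
        Left = L;
        return Shift;
      }
    }
  }
  return -1;
}

int main() {
  // v8i1 mask <2,3,4,5,6,7,Z,Z>: lanes 6 and 7 are zeroable, so this matches a
  // right shift by 2 (KSHIFTR with amount 2).
  std::vector<int> Mask = {2, 3, 4, 5, 6, 7, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, false,
                                false, false, true,  true};
  bool Left = false;
  int Shift = matchMaskAsShift(Mask, Zeroable, Left);
  std::printf("shift=%d left=%d\n", Shift, (int)Left); // shift=2 left=0
  return 0;
}
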
18900
18901
18902// Lower vXi1 vector shuffles.
18903 // There is no dedicated instruction on AVX-512 that shuffles the masks.
18904 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18905 // vector, shuffle it, and then truncate it back.
18906static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18907 MVT VT, SDValue V1, SDValue V2,
18908 const APInt &Zeroable,
18909 const X86Subtarget &Subtarget,
18910 SelectionDAG &DAG) {
18911 assert(Subtarget.hasAVX512() &&
18912        "Cannot lower 512-bit vectors w/o basic ISA!");
18913
18914 int NumElts = Mask.size();
18915
18916 // Try to recognize shuffles that are just padding a subvector with zeros.
18917 int SubvecElts = 0;
18918 int Src = -1;
18919 for (int i = 0; i != NumElts; ++i) {
18920 if (Mask[i] >= 0) {
18921 // Grab the source from the first valid mask element. All subsequent elements need
18922 // to use this same source.
18923 if (Src < 0)
18924 Src = Mask[i] / NumElts;
18925 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18926 break;
18927 }
18928
18929 ++SubvecElts;
18930 }
18931 assert(SubvecElts != NumElts && "Identity shuffle?");
18932
18933 // Clip to a power of 2.
18934 SubvecElts = PowerOf2Floor(SubvecElts);
18935
18936 // Make sure the number of zeroable bits in the top at least covers the bits
18937 // not covered by the subvector.
18938 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18939 assert(Src >= 0 && "Expected a source!");
18940 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18941 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18942 Src == 0 ? V1 : V2,
18943 DAG.getIntPtrConstant(0, DL));
18944 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18945 DAG.getConstant(0, DL, VT),
18946 Extract, DAG.getIntPtrConstant(0, DL));
18947 }
18948
18949 // Try a simple shift right with undef elements. Later we'll try with zeros.
18950 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18951 DAG))
18952 return Shift;
18953
18954 // Try to match KSHIFTs.
18955 unsigned Offset = 0;
18956 for (SDValue V : { V1, V2 }) {
18957 unsigned Opcode;
18958 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18959 if (ShiftAmt >= 0) {
18960 MVT WideVT = VT;
18961 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18962 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18963 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18964 DAG.getUNDEF(WideVT), V,
18965 DAG.getIntPtrConstant(0, DL));
18966 // Widened right shifts need two shifts to ensure we shift in zeroes.
18967 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18968 int WideElts = WideVT.getVectorNumElements();
18969 // Shift left to put the original vector in the MSBs of the new size.
18970 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18971 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18972 // Increase the shift amount to account for the left shift.
18973 ShiftAmt += WideElts - NumElts;
18974 }
18975
18976 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18977 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18978 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18979 DAG.getIntPtrConstant(0, DL));
18980 }
18981 Offset += NumElts; // Increment for next iteration.
18982 }
18983
18984 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
18985 // TODO: What other unary shuffles would benefit from this?
18986 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
18987 V1->hasOneUse()) {
18988 SDValue Op0 = V1.getOperand(0);
18989 SDValue Op1 = V1.getOperand(1);
18990 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18991 EVT OpVT = Op0.getValueType();
18992 return DAG.getSetCC(
18993 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18994 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18995 }
18996
18997 MVT ExtVT;
18998 switch (VT.SimpleTy) {
18999 default:
19000 llvm_unreachable("Expected a vector of i1 elements");
19001 case MVT::v2i1:
19002 ExtVT = MVT::v2i64;
19003 break;
19004 case MVT::v4i1:
19005 ExtVT = MVT::v4i32;
19006 break;
19007 case MVT::v8i1:
19008 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
19009 // shuffle.
19010 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19011 break;
19012 case MVT::v16i1:
19013 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19014 // 256-bit operation available.
19015 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19016 break;
19017 case MVT::v32i1:
19018 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19019 // 256-bit operation available.
19020 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19021 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19022 break;
19023 case MVT::v64i1:
19024 // Fall back to scalarization. FIXME: We can do better if the shuffle
19025 // can be partitioned cleanly.
19026 if (!Subtarget.useBWIRegs())
19027 return SDValue();
19028 ExtVT = MVT::v64i8;
19029 break;
19030 }
19031
19032 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19033 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19034
19035 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19036 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
19037 int NumElems = VT.getVectorNumElements();
19038 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19039 (Subtarget.hasDQI() && (NumElems < 32)))
19040 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19041 Shuffle, ISD::SETGT);
19042
19043 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19044}
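
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): a scalar
// model of the "widen, shuffle, narrow" fallback at the end of lower1BitShuffle.
// Each i1 lane is sign-extended to a full integer (0 or -1), the shuffle is done
// on the wide elements, and the result is converted back to a mask with the
// "0 > x" comparison (the SETGT against zero above). The real code also picks
// the widened element type from the subtarget and may truncate instead of
// comparing; this only models the data flow.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<bool> shuffle1Bit(const std::vector<bool> &V1,
                                     const std::vector<bool> &V2,
                                     const std::vector<int> &Mask) {
  int NumElts = (int)V1.size();
  // Sign-extend both inputs: true -> -1, false -> 0.
  std::vector<int32_t> Wide;
  for (bool B : V1) Wide.push_back(B ? -1 : 0);
  for (bool B : V2) Wide.push_back(B ? -1 : 0);
  // Shuffle on the wide elements; -1 in the mask means undef (treated as 0 here).
  std::vector<int32_t> Shuffled(NumElts, 0);
  for (int i = 0; i != NumElts; ++i)
    if (Mask[i] >= 0)
      Shuffled[i] = Wide[Mask[i]];
  // Narrow back to i1: 0 > x holds exactly when the lane was -1.
  std::vector<bool> Res;
  for (int32_t X : Shuffled) Res.push_back(0 > X);
  return Res;
}

int main() {
  std::vector<bool> V1 = {true, false, true, false};
  std::vector<bool> V2 = {false, false, true, true};
  std::vector<int> Mask = {3, 2, 5, 7}; // mixes lanes of V1 and V2
  for (bool B : shuffle1Bit(V1, V2, Mask))
    std::printf("%d ", (int)B); // prints 0 1 0 1
  std::printf("\n");
  return 0;
}
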
19045
19046/// Helper function that returns true if the shuffle mask should be
19047/// commuted to improve canonicalization.
19048static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19049 int NumElements = Mask.size();
19050
19051 int NumV1Elements = 0, NumV2Elements = 0;
19052 for (int M : Mask)
19053 if (M < 0)
19054 continue;
19055 else if (M < NumElements)
19056 ++NumV1Elements;
19057 else
19058 ++NumV2Elements;
19059
19060 // Commute the shuffle as needed such that more elements come from V1 than
19061 // V2. This allows us to match the shuffle pattern strictly on how many
19062 // elements come from V1 without handling the symmetric cases.
19063 if (NumV2Elements > NumV1Elements)
19064 return true;
19065
19066 assert(NumV1Elements > 0 && "No V1 indices");
19067
19068 if (NumV2Elements == 0)
19069 return false;
19070
19071 // When the number of V1 and V2 elements is the same, try to minimize the
19072 // number of uses of V2 in the low half of the vector. When that is tied,
19073 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19074 // indices for V2. When those are equal, try to ensure that the number of odd
19075 // indices for V1 is lower than the number of odd indices for V2.
19076 if (NumV1Elements == NumV2Elements) {
19077 int LowV1Elements = 0, LowV2Elements = 0;
19078 for (int M : Mask.slice(0, NumElements / 2))
19079 if (M >= NumElements)
19080 ++LowV2Elements;
19081 else if (M >= 0)
19082 ++LowV1Elements;
19083 if (LowV2Elements > LowV1Elements)
19084 return true;
19085 if (LowV2Elements == LowV1Elements) {
19086 int SumV1Indices = 0, SumV2Indices = 0;
19087 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19088 if (Mask[i] >= NumElements)
19089 SumV2Indices += i;
19090 else if (Mask[i] >= 0)
19091 SumV1Indices += i;
19092 if (SumV2Indices < SumV1Indices)
19093 return true;
19094 if (SumV2Indices == SumV1Indices) {
19095 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19096 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19097 if (Mask[i] >= NumElements)
19098 NumV2OddIndices += i % 2;
19099 else if (Mask[i] >= 0)
19100 NumV1OddIndices += i % 2;
19101 if (NumV2OddIndices < NumV1OddIndices)
19102 return true;
19103 }
19104 }
19105 }
19106
19107 return false;
19108}
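
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): the first
// two tie-break levels of the commute heuristic above, re-expressed over
// std::vector so the worked examples below can be checked by hand or compiled.
// shouldCommute is an invented name; the full routine additionally breaks ties
// on index sums and on the number of odd positions.
#include <cstdio>
#include <vector>

static bool shouldCommute(const std::vector<int> &Mask) {
  int NumElements = (int)Mask.size();
  int NumV1 = 0, NumV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue;
    if (M < NumElements)
      ++NumV1;
    else
      ++NumV2;
  }
  if (NumV2 > NumV1)               // more elements come from V2: commute
    return true;
  if (NumV2 == 0 || NumV2 < NumV1)
    return false;
  int LowV1 = 0, LowV2 = 0;        // counts tied: look at the low half only
  for (int i = 0; i != NumElements / 2; ++i) {
    if (Mask[i] >= NumElements)
      ++LowV2;
    else if (Mask[i] >= 0)
      ++LowV1;
  }
  return LowV2 > LowV1;
}

int main() {
  // {4,5,6,0}: three V2 lanes vs. one V1 lane            -> commute.
  // {0,4,2,5}: 2 vs. 2 overall and 1 vs. 1 in the low half -> keep as-is
  //            (the real routine would go on to compare index sums: 2 vs. 4).
  std::printf("%d %d\n", (int)shouldCommute({4, 5, 6, 0}),
              (int)shouldCommute({0, 4, 2, 5})); // prints 1 0
  return 0;
}
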
19109
19110// Forward declaration.
19111static SDValue canonicalizeShuffleMaskWithHorizOp(
19112 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19113 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19114 const X86Subtarget &Subtarget);
19115
19116 /// Top-level lowering for x86 vector shuffles.
19117///
19118/// This handles decomposition, canonicalization, and lowering of all x86
19119/// vector shuffles. Most of the specific lowering strategies are encapsulated
19120/// above in helper routines. The canonicalization attempts to widen shuffles
19121/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19122/// s.t. only one of the two inputs needs to be tested, etc.
19123static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19124 SelectionDAG &DAG) {
19125 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19126 ArrayRef<int> OrigMask = SVOp->getMask();
19127 SDValue V1 = Op.getOperand(0);
19128 SDValue V2 = Op.getOperand(1);
19129 MVT VT = Op.getSimpleValueType();
19130 int NumElements = VT.getVectorNumElements();
19131 SDLoc DL(Op);
19132 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19133
19134 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19135        "Can't lower MMX shuffles");
19136
19137 bool V1IsUndef = V1.isUndef();
19138 bool V2IsUndef = V2.isUndef();
19139 if (V1IsUndef && V2IsUndef)
19140 return DAG.getUNDEF(VT);
19141
19142 // When we create a shuffle node we put the UNDEF node as the second operand,
19143 // but in some cases the first operand may be transformed to UNDEF.
19144 // In this case we should just commute the node.
19145 if (V1IsUndef)
19146 return DAG.getCommutedVectorShuffle(*SVOp);
19147
19148 // Check for non-undef masks pointing at an undef vector and make the masks
19149 // undef as well. This makes it easier to match the shuffle based solely on
19150 // the mask.
19151 if (V2IsUndef &&
19152 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
19153 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
19154 for (int &M : NewMask)
19155 if (M >= NumElements)
19156 M = -1;
19157 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
19158 }
19159
19160 // Check for illegal shuffle mask element index values.
19161 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
19162 (void)MaskUpperLimit;
19163 assert(llvm::all_of(OrigMask,
19164                     [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
19165        "Out of bounds shuffle index");
19166
19167 // We actually see shuffles that are entirely re-arrangements of a set of
19168 // zero inputs. This mostly happens while decomposing complex shuffles into
19169 // simple ones. Directly lower these as a buildvector of zeros.
19170 APInt KnownUndef, KnownZero;
19171 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
19172
19173 APInt Zeroable = KnownUndef | KnownZero;
19174 if (Zeroable.isAllOnes())
19175 return getZeroVector(VT, Subtarget, DAG, DL);
19176
19177 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
19178
19179 // Try to collapse shuffles into using a vector type with fewer elements but
19180 // wider element types. We cap this to not form integers or floating point
19181 // elements wider than 64 bits. It does not seem beneficial to form i128
19182 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
19183 SmallVector<int, 16> WidenedMask;
19184 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
19185 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
19186 // Shuffle mask widening should not interfere with a broadcast opportunity
19187 // by obfuscating the operands with bitcasts.
19188 // TODO: Avoid lowering directly from this top-level function: make this
19189 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
19190 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
19191 Subtarget, DAG))
19192 return Broadcast;
19193
19194 MVT NewEltVT = VT.isFloatingPoint()
19195 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
19196 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
19197 int NewNumElts = NumElements / 2;
19198 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
19199 // Make sure that the new vector type is legal. For example, v2f64 isn't
19200 // legal on SSE1.
19201 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
19202 if (V2IsZero) {
19203 // Modify the new Mask to take all zeros from the all-zero vector.
19204 // Choose indices that are blend-friendly.
19205 bool UsedZeroVector = false;
19206        assert(is_contained(WidenedMask, SM_SentinelZero) &&
19207               "V2's non-undef elements are used?!");
19208 for (int i = 0; i != NewNumElts; ++i)
19209 if (WidenedMask[i] == SM_SentinelZero) {
19210 WidenedMask[i] = i + NewNumElts;
19211 UsedZeroVector = true;
19212 }
19213 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
19214 // some elements to be undef.
19215 if (UsedZeroVector)
19216 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
19217 }
19218 V1 = DAG.getBitcast(NewVT, V1);
19219 V2 = DAG.getBitcast(NewVT, V2);
19220 return DAG.getBitcast(
19221 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
19222 }
19223 }
19224
19225 SmallVector<SDValue> Ops = {V1, V2};
19226 SmallVector<int> Mask(OrigMask.begin(), OrigMask.end());
19227
19228 // Canonicalize the shuffle with any horizontal ops inputs.
19229 // NOTE: This may update Ops and Mask.
19230 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
19231 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19232 return DAG.getBitcast(VT, HOp);
19233
19234 V1 = DAG.getBitcast(VT, Ops[0]);
19235 V2 = DAG.getBitcast(VT, Ops[1]);
19236 assert(NumElements == (int)Mask.size() &&
19237        "canonicalizeShuffleMaskWithHorizOp "
19238        "shouldn't alter the shuffle mask size");
19239
19240 // Commute the shuffle if it will improve canonicalization.
19241 if (canonicalizeShuffleMaskWithCommute(Mask)) {
19242 ShuffleVectorSDNode::commuteMask(Mask);
19243 std::swap(V1, V2);
19244 }
19245
19246 // For each vector width, delegate to a specialized lowering routine.
19247 if (VT.is128BitVector())
19248 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19249
19250 if (VT.is256BitVector())
19251 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19252
19253 if (VT.is512BitVector())
19254 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19255
19256 if (Is1BitVector)
19257 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19258
19259 llvm_unreachable("Unimplemented!");
19260}
19261
19262/// Try to lower a VSELECT instruction to a vector shuffle.
19263static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19264 const X86Subtarget &Subtarget,
19265 SelectionDAG &DAG) {
19266 SDValue Cond = Op.getOperand(0);
19267 SDValue LHS = Op.getOperand(1);
19268 SDValue RHS = Op.getOperand(2);
19269 MVT VT = Op.getSimpleValueType();
19270
19271 // Only non-legal VSELECTs reach this lowering; convert those into generic
19272 // shuffles and re-use the shuffle lowering path for blends.
19273 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19274 SmallVector<int, 32> Mask;
19275 if (createShuffleMaskFromVSELECT(Mask, Cond))
19276 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19277 }
19278
19279 return SDValue();
19280}
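
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): roughly
// what createShuffleMaskFromVSELECT produces for a build-vector of constant
// conditions. Lane i selects LHS (shuffle index i) when the condition element
// is true, otherwise RHS (shuffle index i + NumElts). maskFromConstantCond is
// an invented name; the real helper also deals with condition widths and
// undef/non-boolean constants.
#include <cstdio>
#include <vector>

static std::vector<int> maskFromConstantCond(const std::vector<bool> &Cond) {
  int NumElts = (int)Cond.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts;
  return Mask;
}

int main() {
  // vselect <1,0,0,1>, LHS, RHS -> shuffle mask <0, 5, 6, 3>.
  for (int M : maskFromConstantCond({true, false, false, true}))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
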
19281
19282SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19283 SDValue Cond = Op.getOperand(0);
19284 SDValue LHS = Op.getOperand(1);
19285 SDValue RHS = Op.getOperand(2);
19286
19287 // A vselect where all conditions and data are constants can be optimized into
19288 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19289 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19290 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19291 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19292 return SDValue();
19293
19294 // Try to lower this to a blend-style vector shuffle. This can handle all
19295 // constant condition cases.
19296 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19297 return BlendOp;
19298
19299 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19300 // with patterns on the mask registers on AVX-512.
19301 MVT CondVT = Cond.getSimpleValueType();
19302 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19303 if (CondEltSize == 1)
19304 return Op;
19305
19306 // Variable blends are only legal from SSE4.1 onward.
19307 if (!Subtarget.hasSSE41())
19308 return SDValue();
19309
19310 SDLoc dl(Op);
19311 MVT VT = Op.getSimpleValueType();
19312 unsigned EltSize = VT.getScalarSizeInBits();
19313 unsigned NumElts = VT.getVectorNumElements();
19314
19315 // Expand v32i16/v64i8 without BWI.
19316 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19317 return SDValue();
19318
19319 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19320 // into an i1 condition so that we can use the mask-based 512-bit blend
19321 // instructions.
19322 if (VT.getSizeInBits() == 512) {
19323 // Build a mask by testing the condition against zero.
19324 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19325 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19326 DAG.getConstant(0, dl, CondVT),
19327 ISD::SETNE);
19328 // Now return a new VSELECT using the mask.
19329 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19330 }
19331
19332 // SEXT/TRUNC cases where the mask doesn't match the destination size.
19333 if (CondEltSize != EltSize) {
19334 // If we don't have a sign splat, rely on the expansion.
19335 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19336 return SDValue();
19337
19338 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19339 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19340 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19341 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19342 }
19343
19344 // Only some types will be legal on some subtargets. If we can emit a legal
19345 // VSELECT-matching blend, return Op, but if we need to expand, return
19346 // a null value.
19347 switch (VT.SimpleTy) {
19348 default:
19349 // Most of the vector types have blends past SSE4.1.
19350 return Op;
19351
19352 case MVT::v32i8:
19353 // The byte blends for AVX vectors were introduced only in AVX2.
19354 if (Subtarget.hasAVX2())
19355 return Op;
19356
19357 return SDValue();
19358
19359 case MVT::v8i16:
19360 case MVT::v16i16: {
19361 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
19362 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19363 Cond = DAG.getBitcast(CastVT, Cond);
19364 LHS = DAG.getBitcast(CastVT, LHS);
19365 RHS = DAG.getBitcast(CastVT, RHS);
19366 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19367 return DAG.getBitcast(VT, Select);
19368 }
19369 }
19370}
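
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): why the
// v8i16/v16i16 VSELECT case above can be handled as a byte blend. When every
// condition element is a sign splat (all-ones or all-zero i16), bitcasting to
// bytes turns each lane into two bytes carrying the same sign bit, so a
// per-byte blend (what PBLENDVB does, keying off each byte's MSB) reproduces
// the per-word select. selectWordViaBytes is an invented scalar model of one
// lane.
#include <cstdint>
#include <cstdio>

static uint16_t selectWordViaBytes(uint16_t CondLane, uint16_t L, uint16_t R) {
  uint8_t CondLo = (uint8_t)CondLane, CondHi = (uint8_t)(CondLane >> 8);
  uint8_t Lo = (CondLo & 0x80) ? (uint8_t)L : (uint8_t)R;             // low byte
  uint8_t Hi = (CondHi & 0x80) ? (uint8_t)(L >> 8) : (uint8_t)(R >> 8); // high byte
  return (uint16_t)(Lo | (Hi << 8));
}

int main() {
  std::printf("0x%04x 0x%04x\n",
              selectWordViaBytes(0xFFFF, 0x1234, 0xABCD),  // picks 0x1234
              selectWordViaBytes(0x0000, 0x1234, 0xABCD)); // picks 0xABCD
  return 0;
}
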
19371
19372static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
19373 MVT VT = Op.getSimpleValueType();
19374 SDValue Vec = Op.getOperand(0);
19375 SDValue Idx = Op.getOperand(1);
19376 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
19377 SDLoc dl(Op);
19378
19379 if (!Vec.getSimpleValueType().is128BitVector())
19380 return SDValue();
19381
19382 if (VT.getSizeInBits() == 8) {
19383 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
19384 // we're going to zero extend the register or fold the store.
19385 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
19386 !X86::mayFoldIntoStore(Op))
19387 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19388 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19389 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19390
19391 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
19392 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19393 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19394 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19395 }
19396
19397 if (VT == MVT::f32) {
19398 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19399 // the result back to FR32 register. It's only worth matching if the
19400 // result has a single use which is a store or a bitcast to i32. And in
19401 // the case of a store, it's not worth it if the index is a constant 0,
19402 // because a MOVSSmr can be used instead, which is smaller and faster.
19403 if (!Op.hasOneUse())
19404 return SDValue();
19405 SDNode *User = *Op.getNode()->use_begin();
19406 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19407 (User->getOpcode() != ISD::BITCAST ||
19408 User->getValueType(0) != MVT::i32))
19409 return SDValue();
19410 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19411 DAG.getBitcast(MVT::v4i32, Vec), Idx);
19412 return DAG.getBitcast(MVT::f32, Extract);
19413 }
19414
19415 if (VT == MVT::i32 || VT == MVT::i64)
19416 return Op;
19417
19418 return SDValue();
19419}
19420
19421/// Extract one bit from mask vector, like v16i1 or v8i1.
19422/// AVX-512 feature.
19423static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
19424 const X86Subtarget &Subtarget) {
19425 SDValue Vec = Op.getOperand(0);
19426 SDLoc dl(Vec);
19427 MVT VecVT = Vec.getSimpleValueType();
19428 SDValue Idx = Op.getOperand(1);
19429 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19430 MVT EltVT = Op.getSimpleValueType();
19431
19432 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19433        "Unexpected vector type in ExtractBitFromMaskVector");
19434
19435 // A variable index can't be handled in mask registers;
19436 // extend the vector to VR512/VR128.
19437 if (!IdxC) {
19438 unsigned NumElts = VecVT.getVectorNumElements();
19439 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
19440 // than extending to 128/256-bit.
19441 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19442 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19443 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19444 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19445 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19446 }
19447
19448 unsigned IdxVal = IdxC->getZExtValue();
19449 if (IdxVal == 0) // the operation is legal
19450 return Op;
19451
19452 // Extend to natively supported kshift.
19453 unsigned NumElems = VecVT.getVectorNumElements();
19454 MVT WideVecVT = VecVT;
19455 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19456 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19457 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19458 DAG.getUNDEF(WideVecVT), Vec,
19459 DAG.getIntPtrConstant(0, dl));
19460 }
19461
19462 // Use kshiftr instruction to move to the lower element.
19463 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19464 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19465
19466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19467 DAG.getIntPtrConstant(0, dl));
19468}
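
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): the
// k-register view of the constant-index path above. A vXi1 vector lives in a
// mask register as a bitmask, so extracting element IdxVal is a logical shift
// right by IdxVal followed by reading bit 0 -- the KSHIFTR + extract-element-0
// sequence emitted above. extractMaskBit is an invented name.
#include <cstdint>
#include <cstdio>

static bool extractMaskBit(uint16_t KReg, unsigned IdxVal) {
  return ((KReg >> IdxVal) & 1u) != 0; // KSHIFTR, then take the LSB
}

int main() {
  uint16_t KReg = 0x0241; // v16i1 with bits 0, 6 and 9 set
  std::printf("%d %d %d\n", (int)extractMaskBit(KReg, 0),
              (int)extractMaskBit(KReg, 6),
              (int)extractMaskBit(KReg, 5)); // prints 1 1 0
  return 0;
}
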
19469
19470SDValue
19471X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19472 SelectionDAG &DAG) const {
19473 SDLoc dl(Op);
19474 SDValue Vec = Op.getOperand(0);
19475 MVT VecVT = Vec.getSimpleValueType();
19476 SDValue Idx = Op.getOperand(1);
19477 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19478
19479 if (VecVT.getVectorElementType() == MVT::i1)
19480 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19481
19482 if (!IdxC) {
19483 // It's more profitable to go through memory (1 cycle throughput)
19484 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
19485 // The IACA tool was used to get the performance estimate
19486 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19487 //
19488 // example : extractelement <16 x i8> %a, i32 %i
19489 //
19490 // Block Throughput: 3.00 Cycles
19491 // Throughput Bottleneck: Port5
19492 //
19493 // | Num Of | Ports pressure in cycles | |
19494 // | Uops | 0 - DV | 5 | 6 | 7 | |
19495 // ---------------------------------------------
19496 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19497 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19498 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19499 // Total Num Of Uops: 4
19500 //
19501 //
19502 // Block Throughput: 1.00 Cycles
19503 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19504 //
19505 // | | Ports pressure in cycles | |
19506 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19507 // ---------------------------------------------------------
19508 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19509 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19510 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19511 // Total Num Of Uops: 4
19512
19513 return SDValue();
19514 }
19515
19516 unsigned IdxVal = IdxC->getZExtValue();
19517
19518 // If this is a 256-bit vector result, first extract the 128-bit vector and
19519 // then extract the element from the 128-bit vector.
19520 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19521 // Get the 128-bit vector.
19522 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19523 MVT EltVT = VecVT.getVectorElementType();
19524
19525 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19526 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19527
19528 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19529 // this can be done with a mask.
19530 IdxVal &= ElemsPerChunk - 1;
19531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19532 DAG.getIntPtrConstant(IdxVal, dl));
19533 }
19534
19535 assert(VecVT.is128BitVector() && "Unexpected vector length");
19536
19537 MVT VT = Op.getSimpleValueType();
19538
19539 if (VT == MVT::i16) {
19540 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19541 // we're going to zero extend the register or fold the store (SSE41 only).
19542 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19543 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19544 if (Subtarget.hasFP16())
19545 return Op;
19546
19547 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19548 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19549 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19550 }
19551
19552 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19553 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19554 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19555 }
19556
19557 if (Subtarget.hasSSE41())
19558 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19559 return Res;
19560
19561 // TODO: We only extract a single element from v16i8, we can probably afford
19562 // to be more aggressive here before using the default approach of spilling to
19563 // stack.
19564 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
19565 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19566 int DWordIdx = IdxVal / 4;
19567 if (DWordIdx == 0) {
19568 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19569 DAG.getBitcast(MVT::v4i32, Vec),
19570 DAG.getIntPtrConstant(DWordIdx, dl));
19571 int ShiftVal = (IdxVal % 4) * 8;
19572 if (ShiftVal != 0)
19573 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19574 DAG.getConstant(ShiftVal, dl, MVT::i8));
19575 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19576 }
19577
19578 int WordIdx = IdxVal / 2;
19579 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19580 DAG.getBitcast(MVT::v8i16, Vec),
19581 DAG.getIntPtrConstant(WordIdx, dl));
19582 int ShiftVal = (IdxVal % 2) * 8;
19583 if (ShiftVal != 0)
19584 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19585 DAG.getConstant(ShiftVal, dl, MVT::i8));
19586 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19587 }
19588
19589 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19590 if (IdxVal == 0)
19591 return Op;
19592
19593 // Shuffle the element to the lowest element, then movss or movsh.
19594 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19595 Mask[0] = static_cast<int>(IdxVal);
19596 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19597 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19598 DAG.getIntPtrConstant(0, dl));
19599 }
19600
19601 if (VT.getSizeInBits() == 64) {
19602 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19603 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19604 // to match extract_elt for f64.
19605 if (IdxVal == 0)
19606 return Op;
19607
19608 // UNPCKHPD the element to the lowest double word, then movsd.
19609 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19610 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19611 int Mask[2] = { 1, -1 };
19612 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19613 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19614 DAG.getIntPtrConstant(0, dl));
19615 }
19616
19617 return SDValue();
19618}
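
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): the scalar
// arithmetic behind the v16i8 extract path above. A byte is pulled out of a
// 128-bit vector by extracting the containing 32-bit lane (IdxVal / 4) and
// shifting right by 8 * (IdxVal % 4) bits; the modulo is done with a mask
// because the lane count is a power of two. The real lowering only uses the
// dword form for the low lane and switches to a word extract (IdxVal / 2)
// above that, but the arithmetic is the same idea. extractByte is an invented
// name.
#include <cstdint>
#include <cstdio>

static uint8_t extractByte(const uint32_t Vec[4], unsigned IdxVal) {
  unsigned DWordIdx = IdxVal / 4;       // which 32-bit lane holds the byte
  unsigned ShiftVal = (IdxVal & 3) * 8; // IdxVal % 4, done with a mask
  return (uint8_t)(Vec[DWordIdx] >> ShiftVal);
}

int main() {
  // Little-endian lanes of <16 x i8> {0x00, 0x01, ..., 0x0f}.
  uint32_t Vec[4] = {0x03020100u, 0x07060504u, 0x0b0a0908u, 0x0f0e0d0cu};
  std::printf("elt[6] = 0x%02x\n", extractByte(Vec, 6)); // prints 0x06
  return 0;
}
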
19619
19620/// Insert one bit to mask vector, like v16i1 or v8i1.
19621/// AVX-512 feature.
19622static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
19623 const X86Subtarget &Subtarget) {
19624 SDLoc dl(Op);
19625 SDValue Vec = Op.getOperand(0);
19626 SDValue Elt = Op.getOperand(1);
19627 SDValue Idx = Op.getOperand(2);
19628 MVT VecVT = Vec.getSimpleValueType();
19629
19630 if (!isa<ConstantSDNode>(Idx)) {
19631 // Non-constant index. Extend the source and destination,
19632 // insert the element, and then truncate the result.
19633 unsigned NumElts = VecVT.getVectorNumElements();
19634 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19635 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19636 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19637 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19638 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19639 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19640 }
19641
19642 // Copy into a k-register, extract to v1i1 and insert_subvector.
19643 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19644 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19645}
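
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): the
// bitmask view of the constant-index path above. Inserting one i1 element into
// a mask register amounts to overwriting a single bit of the bitmask at
// position IdxVal, which is what the v1i1 insert_subvector achieves.
// insertMaskBit is an invented name.
#include <cstdint>
#include <cstdio>

static uint16_t insertMaskBit(uint16_t KReg, unsigned IdxVal, bool Elt) {
  uint16_t Bit = (uint16_t)(1u << IdxVal);
  return Elt ? (uint16_t)(KReg | Bit) : (uint16_t)(KReg & ~Bit);
}

int main() {
  std::printf("0x%04x\n", insertMaskBit(0x00F0, 2, true));  // 0x00F4
  std::printf("0x%04x\n", insertMaskBit(0x00F0, 5, false)); // 0x00D0
  return 0;
}
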
19646
19647SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19648 SelectionDAG &DAG) const {
19649 MVT VT = Op.getSimpleValueType();
19650 MVT EltVT = VT.getVectorElementType();
19651 unsigned NumElts = VT.getVectorNumElements();
19652 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19653
19654 if (EltVT == MVT::i1)
19655 return InsertBitToMaskVector(Op, DAG, Subtarget);
19656
19657 SDLoc dl(Op);
19658 SDValue N0 = Op.getOperand(0);
19659 SDValue N1 = Op.getOperand(1);
19660 SDValue N2 = Op.getOperand(2);
19661 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19662
19663 if (!N2C) {
19664 // Variable insertion indices: usually we're better off spilling to the stack,
19665 // but AVX512 can use a variable compare+select by comparing against all
19666 // possible vector indices, and FP insertion has less gpr->simd traffic.
19667 if (!(Subtarget.hasBWI() ||
19668 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19669 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
19670 return SDValue();
19671
19672 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19673 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19674 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19675 return SDValue();
19676
19677 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19678 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19679 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19680
19681 SmallVector<SDValue, 16> RawIndices;
19682 for (unsigned I = 0; I != NumElts; ++I)
19683 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19684 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19685
19686 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19687 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19688 ISD::CondCode::SETEQ);
19689 }
19690
19691 if (N2C->getAPIntValue().uge(NumElts))
19692 return SDValue();
19693 uint64_t IdxVal = N2C->getZExtValue();
19694
19695 bool IsZeroElt = X86::isZeroNode(N1);
19696 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19697
19698 if (IsZeroElt || IsAllOnesElt) {
19699 // Lower insertion of i8 -1 as an 'OR' blend.
19700 // We don't deal with i8 0 since it appears to be handled elsewhere.
19701 if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
19702 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
19703 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19704 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19705 CstVectorElts[IdxVal] = OnesCst;
19706 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19707 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19708 }
19709 // See if we can do this more efficiently with a blend shuffle with a
19710 // rematerializable vector.
19711 if (Subtarget.hasSSE41() &&
19712 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19713 SmallVector<int, 8> BlendMask;
19714 for (unsigned i = 0; i != NumElts; ++i)
19715 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19716 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19717 : getOnesVector(VT, DAG, dl);
19718 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19719 }
19720 }
19721
19722 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19723 // into that, and then insert the subvector back into the result.
19724 if (VT.is256BitVector() || VT.is512BitVector()) {
19725 // With a 256-bit vector, we can insert into the zero element efficiently
19726 // using a blend if we have AVX or AVX2 and the right data type.
19727 if (VT.is256BitVector() && IdxVal == 0) {
19728 // TODO: It is worthwhile to cast integer to floating point and back
19729 // and incur a domain crossing penalty if that's what we'll end up
19730 // doing anyway after extracting to a 128-bit vector.
19731 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19732 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19733 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19734 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19735 DAG.getTargetConstant(1, dl, MVT::i8));
19736 }
19737 }
19738
19739 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19740 assert(isPowerOf2_32(NumEltsIn128) &&
19741        "Vectors will always have power-of-two number of elements.");
19742
19743 // If we are not inserting into the low 128-bit vector chunk,
19744 // then prefer the broadcast+blend sequence.
19745 // FIXME: relax the profitability check iff all N1 uses are insertions.
19746 if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
19747 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19748 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19749 X86::mayFoldLoad(N1, Subtarget)))) {
19750 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19751 SmallVector<int, 8> BlendMask;
19752 for (unsigned i = 0; i != NumElts; ++i)
19753 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19754 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19755 }
19756
19757 // Get the desired 128-bit vector chunk.
19758 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19759
19760 // Insert the element into the desired chunk.
19761 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19762 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19763
19764 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19765 DAG.getIntPtrConstant(IdxIn128, dl));
19766
19767 // Insert the changed part back into the bigger vector
19768 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19769 }
19770 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19771
19772 // This will be just movw/movd/movq/movsh/movss/movsd.
19773 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19774 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19775 EltVT == MVT::f16 || EltVT == MVT::i64) {
19776 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19777 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19778 }
19779
19780 // We can't directly insert an i8 or i16 into a vector, so zero extend
19781 // it to i32 first.
19782 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19783 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19784 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19786 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19787 return DAG.getBitcast(VT, N1);
19788 }
19789 }
19790
19791 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19792 // argument. SSE41 is required for pinsrb.
19793 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19794 unsigned Opc;
19795 if (VT == MVT::v8i16) {
19796 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19797 Opc = X86ISD::PINSRW;
19798 } else {
19799 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19800 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19801 Opc = X86ISD::PINSRB;
19802 }
19803
19804 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19805 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19806 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19807 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19808 }
19809
19810 if (Subtarget.hasSSE41()) {
19811 if (EltVT == MVT::f32) {
19812 // Bits [7:6] of the constant are the source select. This will always be
19813 // zero here. The DAG Combiner may combine an extract_elt index into
19814 // these bits. For example (insert (extract, 3), 2) could be matched by
19815 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19816 // Bits [5:4] of the constant are the destination select. This is the
19817 // value of the incoming immediate.
19818 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19819 // combine either bitwise AND or insert of float 0.0 to set these bits.
19820
19821 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19822 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19823 // If this is an insertion of 32-bits into the low 32-bits of
19824 // a vector, we prefer to generate a blend with immediate rather
19825 // than an insertps. Blends are simpler operations in hardware and so
19826 // will always have equal or better performance than insertps.
19827 // But if optimizing for size and there's a load folding opportunity,
19828 // generate insertps because blendps does not have a 32-bit memory
19829 // operand form.
19830 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19831 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19832 DAG.getTargetConstant(1, dl, MVT::i8));
19833 }
19834 // Create this as a scalar-to-vector.
19835 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19836 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19837 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19838 }
19839
19840 // PINSR* works with constant index.
19841 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19842 return Op;
19843 }
19844
19845 return SDValue();
19846}
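
// Illustrative, standalone sketch (not part of X86ISelLowering.cpp): packing
// the INSERTPS immediate exactly as the f32 path above describes it -- bits
// [7:6] select the source lane, bits [5:4] the destination lane, and bits
// [3:0] are a per-lane zero mask. insertpsImm is an invented helper.
#include <cstdint>
#include <cstdio>

static uint8_t insertpsImm(unsigned SrcLane, unsigned DstLane, unsigned ZeroMask) {
  return (uint8_t)((SrcLane << 6) | (DstLane << 4) | (ZeroMask & 0xf));
}

int main() {
  // The lowering above emits "IdxVal << 4": source lane 0, destination IdxVal,
  // no lanes zeroed. For IdxVal == 2 that is 0x20.
  std::printf("imm = 0x%02x\n",
              insertpsImm(/*SrcLane=*/0, /*DstLane=*/2, /*ZeroMask=*/0));
  return 0;
}
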
19847
19848static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19849 SelectionDAG &DAG) {
19850 SDLoc dl(Op);
19851 MVT OpVT = Op.getSimpleValueType();
19852
19853 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19854 // further combines.
19855 if (X86::isZeroNode(Op.getOperand(0)))
19856 return getZeroVector(OpVT, Subtarget, DAG, dl);
19857
19858 // If this is a 256-bit vector result, first insert into a 128-bit
19859 // vector and then insert into the 256-bit vector.
19860 if (!OpVT.is128BitVector()) {
19861 // Insert into a 128-bit vector.
19862 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19863 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19864 OpVT.getVectorNumElements() / SizeFactor);
19865
19866 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19867
19868 // Insert the 128-bit vector.
19869 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19870 }
19871 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19872        "Expected an SSE type!");
19873
19874 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19875 // tblgen.
19876 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19877 return Op;
19878
19879 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19880 return DAG.getBitcast(
19881 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19882}
19883
19884// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19885// simple superregister reference or explicit instructions to insert
19886// the upper bits of a vector.
19887static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19888 SelectionDAG &DAG) {
19889 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19890
19891 return insert1BitVector(Op, DAG, Subtarget);
19892}
19893
19894static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19895 SelectionDAG &DAG) {
19896 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19897        "Only vXi1 extract_subvectors need custom lowering");
19898
19899 SDLoc dl(Op);
19900 SDValue Vec = Op.getOperand(0);
19901 uint64_t IdxVal = Op.getConstantOperandVal(1);
19902
19903 if (IdxVal == 0) // the operation is legal
19904 return Op;
19905
19906 MVT VecVT = Vec.getSimpleValueType();
19907 unsigned NumElems = VecVT.getVectorNumElements();
19908
19909 // Extend to natively supported kshift.
19910 MVT WideVecVT = VecVT;
19911 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19912 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19913 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19914 DAG.getUNDEF(WideVecVT), Vec,
19915 DAG.getIntPtrConstant(0, dl));
19916 }
19917
19918 // Shift to the LSB.
19919 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19920 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19921
19922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19923 DAG.getIntPtrConstant(0, dl));
19924}
19925
19926// Returns the appropriate wrapper opcode for a global reference.
19927unsigned X86TargetLowering::getGlobalWrapperKind(
19928 const GlobalValue *GV, const unsigned char OpFlags) const {
19929 // References to absolute symbols are never PC-relative.
19930 if (GV && GV->isAbsoluteSymbolRef())
19931 return X86ISD::Wrapper;
19932
19933 CodeModel::Model M = getTargetMachine().getCodeModel();
19934 if (Subtarget.isPICStyleRIPRel() &&
19935 (M == CodeModel::Small || M == CodeModel::Kernel))
19936 return X86ISD::WrapperRIP;
19937
19938 // GOTPCREL references must always use RIP.
19939 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19940 return X86ISD::WrapperRIP;
19941
19942 return X86ISD::Wrapper;
19943}
19944
19945 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19946 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19947 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19948 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19949 // be used to form an addressing mode. These wrapped nodes will be selected
19950 // into MOV32ri.
19951SDValue
19952X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19953 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19954
19955 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19956 // global base reg.
19957 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19958
19959 auto PtrVT = getPointerTy(DAG.getDataLayout());
19960 SDValue Result = DAG.getTargetConstantPool(
19961 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19962 SDLoc DL(CP);
19963 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19964 // With PIC, the address is actually $g + Offset.
19965 if (OpFlag) {
19966 Result =
19967 DAG.getNode(ISD::ADD, DL, PtrVT,
19968 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19969 }
19970
19971 return Result;
19972}
19973
19974SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19975 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19976
19977 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19978 // global base reg.
19979 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19980
19981 auto PtrVT = getPointerTy(DAG.getDataLayout());
19982 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19983 SDLoc DL(JT);
19984 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19985
19986 // With PIC, the address is actually $g + Offset.
19987 if (OpFlag)
19988 Result =
19989 DAG.getNode(ISD::ADD, DL, PtrVT,
19990 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19991
19992 return Result;
19993}
19994
19995SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19996 SelectionDAG &DAG) const {
19997 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19998}
19999
20000SDValue
20001X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20002 // Create the TargetBlockAddressAddress node.
20003 unsigned char OpFlags =
20004 Subtarget.classifyBlockAddressReference();
20005 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20006 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20007 SDLoc dl(Op);
20008 auto PtrVT = getPointerTy(DAG.getDataLayout());
20009 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20010 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20011
20012 // With PIC, the address is actually $g + Offset.
20013 if (isGlobalRelativeToPICBase(OpFlags)) {
20014 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20015 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20016 }
20017
20018 return Result;
20019}
20020
20021/// Creates target global address or external symbol nodes for calls or
20022/// other uses.
20023SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20024 bool ForCall) const {
20025 // Unpack the global address or external symbol.
20026 const SDLoc &dl = SDLoc(Op);
20027 const GlobalValue *GV = nullptr;
20028 int64_t Offset = 0;
20029 const char *ExternalSym = nullptr;
20030 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20031 GV = G->getGlobal();
20032 Offset = G->getOffset();
20033 } else {
20034 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20035 ExternalSym = ES->getSymbol();
20036 }
20037
20038 // Calculate some flags for address lowering.
20039 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20040 unsigned char OpFlags;
20041 if (ForCall)
20042 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20043 else
20044 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20045 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20046 bool NeedsLoad = isGlobalStubReference(OpFlags);
20047
20048 CodeModel::Model M = DAG.getTarget().getCodeModel();
20049 auto PtrVT = getPointerTy(DAG.getDataLayout());
20050 SDValue Result;
20051
20052 if (GV) {
20053 // Create a target global address if this is a global. If possible, fold the
20054 // offset into the global address reference. Otherwise, ADD it on later.
20055 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20056 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20057 // relocation will compute to a negative value, which is invalid.
20058 int64_t GlobalOffset = 0;
20059 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20060 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20061 std::swap(GlobalOffset, Offset);
20062 }
20063 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20064 } else {
20065 // If this is not a global address, this must be an external symbol.
20066 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20067 }
20068
20069 // If this is a direct call, avoid the wrapper if we don't need to do any
20070 // loads or adds. This allows SDAG ISel to match direct calls.
20071 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20072 return Result;
20073
20074 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20075
20076 // With PIC, the address is actually $g + Offset.
20077 if (HasPICReg) {
20078 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20079 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20080 }
20081
20082 // For globals that require a load from a stub to get the address, emit the
20083 // load.
20084 if (NeedsLoad)
20085 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20086 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20087
20088 // If there was a non-zero offset that we didn't fold, create an explicit
20089 // addition for it.
20090 if (Offset != 0)
20091 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20092 DAG.getConstant(Offset, dl, PtrVT));
20093
20094 return Result;
20095}
20096
20097SDValue
20098X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20099 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20100}
20101
20102static SDValue
20103GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20104 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
20105 unsigned char OperandFlags, bool LocalDynamic = false) {
20106 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20107 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20108 SDLoc dl(GA);
20109 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20110 GA->getValueType(0),
20111 GA->getOffset(),
20112 OperandFlags);
20113
20114 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20115 : X86ISD::TLSADDR;
20116
20117 if (InFlag) {
20118 SDValue Ops[] = { Chain, TGA, *InFlag };
20119 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20120 } else {
20121 SDValue Ops[] = { Chain, TGA };
20122 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20123 }
20124
20125 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
20126 MFI.setAdjustsStack(true);
20127 MFI.setHasCalls(true);
20128
20129 SDValue Flag = Chain.getValue(1);
20130 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
20131}
20132
20133// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20134static SDValue
20135LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20136 const EVT PtrVT) {
20137 SDValue InFlag;
20138 SDLoc dl(GA); // ? function entry point might be better
20139 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20140 DAG.getNode(X86ISD::GlobalBaseReg,
20141 SDLoc(), PtrVT), InFlag);
20142 InFlag = Chain.getValue(1);
20143
20144 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
20145}
20146
20147// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
20148static SDValue
20149LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20150 const EVT PtrVT) {
20151 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20152 X86::RAX, X86II::MO_TLSGD);
20153}
20154
20155// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20156static SDValue
20157LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20158 const EVT PtrVT) {
20159 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20160 X86::EAX, X86II::MO_TLSGD);
20161}
20162
20163static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
20164 SelectionDAG &DAG, const EVT PtrVT,
20165 bool Is64Bit, bool Is64BitLP64) {
20166 SDLoc dl(GA);
20167
20168 // Get the start address of the TLS block for this module.
20169 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
20170 .getInfo<X86MachineFunctionInfo>();
20171 MFI->incNumLocalDynamicTLSAccesses();
20172
20173 SDValue Base;
20174 if (Is64Bit) {
20175 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20176 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
20177 X86II::MO_TLSLD, /*LocalDynamic=*/true);
20178 } else {
20179 SDValue InFlag;
20180 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20181 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
20182 InFlag = Chain.getValue(1);
20183 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
20184 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
20185 }
20186
20187 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20188 // of Base.
20189
20190 // Build x@dtpoff.
20191 unsigned char OperandFlags = X86II::MO_DTPOFF;
20192 unsigned WrapperKind = X86ISD::Wrapper;
20193 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20194 GA->getValueType(0),
20195 GA->getOffset(), OperandFlags);
20196 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20197
20198 // Add x@dtpoff with the base.
20199 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20200}
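
For orientation, a minimal sketch of what the local-dynamic model buys at the source level; the variable names are hypothetical and the model actually chosen depends on compilation flags and variable visibility.

    namespace {
    thread_local int A, B; // hypothetical module-local TLS variables
    } // namespace

    int sumLocals() {
      // Under -fPIC this may use the local-dynamic model:
      //   Base = TLSBASEADDR call (one __tls_get_addr for the whole module)
      //   &A   = Base + A@dtpoff
      //   &B   = Base + B@dtpoff   (no second call; the cleanup pass noted
      //                             above removes redundant Base computations)
      return A + B;
    }
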
20201
20202// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20203static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20204 const EVT PtrVT, TLSModel::Model model,
20205 bool is64Bit, bool isPIC) {
20206 SDLoc dl(GA);
20207
20208 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20209 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
20210 is64Bit ? 257 : 256));
20211
20212 SDValue ThreadPointer =
20213 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20214 MachinePointerInfo(Ptr));
20215
20216 unsigned char OperandFlags = 0;
20217 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20218 // the initial-exec model.
20219 unsigned WrapperKind = X86ISD::Wrapper;
20220 if (model == TLSModel::LocalExec) {
20221 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20222 } else if (model == TLSModel::InitialExec) {
20223 if (is64Bit) {
20224 OperandFlags = X86II::MO_GOTTPOFF;
20225 WrapperKind = X86ISD::WrapperRIP;
20226 } else {
20227 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20228 }
20229 } else {
20230 llvm_unreachable("Unexpected model");
20231 }
20232
20233 // emit "addl x@ntpoff,%eax" (local exec)
20234 // or "addl x@indntpoff,%eax" (initial exec)
20235 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
20236 SDValue TGA =
20237 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20238 GA->getOffset(), OperandFlags);
20239 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20240
20241 if (model == TLSModel::InitialExec) {
20242 if (isPIC && !is64Bit) {
20243 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20244 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20245 Offset);
20246 }
20247
20248 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20249 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20250 }
20251
20252 // The address of the thread local variable is the add of the thread
20253 // pointer with the offset of the variable.
20254 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20255}
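
As a reference point, a small standalone example of the two exec models; the variable is hypothetical and the assembly in the comments is only the typical shape of the output, not taken from this lowering.

    extern thread_local int TlsVar; // hypothetical thread-local variable

    int readTlsVar() {
      // Local-exec (64-bit):    movl %fs:TlsVar@tpoff, %eax
      // Initial-exec (64-bit):  movq TlsVar@gottpoff(%rip), %rcx
      //                         movl %fs:(%rcx), %eax
      // Both compute thread-pointer + offset; they differ only in whether the
      // offset is a link-time constant (@tpoff) or loaded from the GOT.
      return TlsVar;
    }
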
20256
20257SDValue
20258X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20259
20260 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20261
20262 if (DAG.getTarget().useEmulatedTLS())
20263 return LowerToTLSEmulatedModel(GA, DAG);
20264
20265 const GlobalValue *GV = GA->getGlobal();
20266 auto PtrVT = getPointerTy(DAG.getDataLayout());
20267 bool PositionIndependent = isPositionIndependent();
20268
20269 if (Subtarget.isTargetELF()) {
20270 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20271 switch (model) {
20272 case TLSModel::GeneralDynamic:
20273 if (Subtarget.is64Bit()) {
20274 if (Subtarget.isTarget64BitLP64())
20275 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20276 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20277 }
20278 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20279 case TLSModel::LocalDynamic:
20280 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20281 Subtarget.isTarget64BitLP64());
20282 case TLSModel::InitialExec:
20283 case TLSModel::LocalExec:
20284 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20285 PositionIndependent);
20286 }
20287 llvm_unreachable("Unknown TLS model.");
20288 }
20289
20290 if (Subtarget.isTargetDarwin()) {
20291 // Darwin only has one model of TLS. Lower to that.
20292 unsigned char OpFlag = 0;
20293 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
20294 X86ISD::WrapperRIP : X86ISD::Wrapper;
20295
20296 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20297 // global base reg.
20298 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20299 if (PIC32)
20300 OpFlag = X86II::MO_TLVP_PIC_BASE;
20301 else
20302 OpFlag = X86II::MO_TLVP;
20303 SDLoc DL(Op);
20304 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
20305 GA->getValueType(0),
20306 GA->getOffset(), OpFlag);
20307 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20308
20309 // With PIC32, the address is actually $g + Offset.
20310 if (PIC32)
20311 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20312 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20313 Offset);
20314
20315 // Lowering the machine ISD node will make sure everything ends up in the
20316 // right location.
20317 SDValue Chain = DAG.getEntryNode();
20318 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20319 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20320 SDValue Args[] = { Chain, Offset };
20321 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20322 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
20323 DAG.getIntPtrConstant(0, DL, true),
20324 Chain.getValue(1), DL);
20325
20326 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
20327 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20328 MFI.setAdjustsStack(true);
20329
20330 // And our return value (tls address) is in the standard call return value
20331 // location.
20332 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20333 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20334 }
20335
20336 if (Subtarget.isOSWindows()) {
20337 // Just use the implicit TLS architecture
20338 // Need to generate something similar to:
20339 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20340 // ; from TEB
20341 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
20342 // mov rcx, qword [rdx+rcx*8]
20343 // mov eax, .tls$:tlsvar
20344 // [rax+rcx] contains the address
20345 // Windows 64bit: gs:0x58
20346 // Windows 32bit: fs:__tls_array
20347
20348 SDLoc dl(GA);
20349 SDValue Chain = DAG.getEntryNode();
20350
20351 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20352 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20353 // use its literal value of 0x2C.
20354 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
20355 ? Type::getInt8PtrTy(*DAG.getContext(),
20356 256)
20357 : Type::getInt32PtrTy(*DAG.getContext(),
20358 257));
20359
20360 SDValue TlsArray = Subtarget.is64Bit()
20361 ? DAG.getIntPtrConstant(0x58, dl)
20362 : (Subtarget.isTargetWindowsGNU()
20363 ? DAG.getIntPtrConstant(0x2C, dl)
20364 : DAG.getExternalSymbol("_tls_array", PtrVT));
20365
20366 SDValue ThreadPointer =
20367 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20368
20369 SDValue res;
20370 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
20371 res = ThreadPointer;
20372 } else {
20373 // Load the _tls_index variable
20374 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
20375 if (Subtarget.is64Bit())
20376 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20377 MachinePointerInfo(), MVT::i32);
20378 else
20379 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20380
20381 const DataLayout &DL = DAG.getDataLayout();
20382 SDValue Scale =
20383 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20384 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20385
20386 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20387 }
20388
20389 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20390
20391 // Get the offset of start of .tls section
20392 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20393 GA->getValueType(0),
20394 GA->getOffset(), X86II::MO_SECREL);
20395 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20396
20397 // The address of the thread local variable is the add of the thread
20398 // pointer with the offset of the variable.
20399 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20400 }
20401
20402 llvm_unreachable("TLS not implemented for this target.");
20403}
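
A plain C++ restatement of the Windows implicit-TLS address computation sketched in the comment above; the parameter names are illustrative, and the thread pointer and @SECREL offset are assumed to be supplied by the caller.

    #include <cstdint>

    uintptr_t tlsVarAddress(const uintptr_t *threadLocalStoragePointer, // gs:[0x58]
                            unsigned tlsIndex,                          // _tls_index
                            uintptr_t secrelOffset) {                   // var@SECREL
      uintptr_t moduleTlsBase = threadLocalStoragePointer[tlsIndex];
      return moduleTlsBase + secrelOffset; // the final [rax+rcx] in the comment
    }
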
20404
20405/// Lower SRA_PARTS and friends, which return two i32 values
20406/// and take a 2 x i32 value to shift plus a shift amount.
20407/// TODO: Can this be moved to general expansion code?
20408static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
20409 SDValue Lo, Hi;
20410 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20411 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20412}
20413
20414// Try to use a packed vector operation to handle i64 on 32-bit targets when
20415// AVX512DQ is enabled.
20416static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
20417 const X86Subtarget &Subtarget) {
20418  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20419          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20420          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20421          Op.getOpcode() == ISD::UINT_TO_FP) &&
20422         "Unexpected opcode!");
20423 bool IsStrict = Op->isStrictFPOpcode();
20424 unsigned OpNo = IsStrict ? 1 : 0;
20425 SDValue Src = Op.getOperand(OpNo);
20426 MVT SrcVT = Src.getSimpleValueType();
20427 MVT VT = Op.getSimpleValueType();
20428
20429 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20430 (VT != MVT::f32 && VT != MVT::f64))
20431 return SDValue();
20432
20433 // Pack the i64 into a vector, do the operation and extract.
20434
20435 // Using 256-bit to ensure the result is 128 bits for the f32 case.
20436 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20437 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20438 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20439
20440 SDLoc dl(Op);
20441 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20442 if (IsStrict) {
20443 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20444 {Op.getOperand(0), InVec});
20445 SDValue Chain = CvtVec.getValue(1);
20446 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20447 DAG.getIntPtrConstant(0, dl));
20448 return DAG.getMergeValues({Value, Chain}, dl);
20449 }
20450
20451 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20452
20453 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20454 DAG.getIntPtrConstant(0, dl));
20455}
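
A rough intrinsics sketch of the same idea, assuming AVX512DQ is available; it is not the lowering itself, just the pack/convert/extract shape it produces.

    #include <immintrin.h>

    double i64ToDoubleViaVector(long long X) {
      __m512i V = _mm512_set1_epi64(X);  // put the scalar into a 512-bit vector
      __m512d D = _mm512_cvtepi64_pd(V); // packed signed i64 -> f64 (vcvtqq2pd)
      return _mm512_cvtsd_f64(D);        // extract element 0
    }
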
20456
20457// Try to use a packed vector operation to handle i64 on 32-bit targets.
20458static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
20459 const X86Subtarget &Subtarget) {
20460  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20461          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20462          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20463          Op.getOpcode() == ISD::UINT_TO_FP) &&
20464         "Unexpected opcode!");
20465 bool IsStrict = Op->isStrictFPOpcode();
20466 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20467 MVT SrcVT = Src.getSimpleValueType();
20468 MVT VT = Op.getSimpleValueType();
20469
20470 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20471 return SDValue();
20472
20473 // Pack the i64 into a vector, do the operation and extract.
20474
20475  assert(Subtarget.hasFP16() && "Expected FP16");
20476
20477 SDLoc dl(Op);
20478 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20479 if (IsStrict) {
20480 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20481 {Op.getOperand(0), InVec});
20482 SDValue Chain = CvtVec.getValue(1);
20483 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20484 DAG.getIntPtrConstant(0, dl));
20485 return DAG.getMergeValues({Value, Chain}, dl);
20486 }
20487
20488 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20489
20490 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20491 DAG.getIntPtrConstant(0, dl));
20492}
20493
20494static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20495 const X86Subtarget &Subtarget) {
20496 switch (Opcode) {
20497 case ISD::SINT_TO_FP:
20498 // TODO: Handle wider types with AVX/AVX512.
20499 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20500 return false;
20501 // CVTDQ2PS or (V)CVTDQ2PD
20502 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20503
20504 case ISD::UINT_TO_FP:
20505 // TODO: Handle wider types and i64 elements.
20506 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20507 return false;
20508 // VCVTUDQ2PS or VCVTUDQ2PD
20509 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20510
20511 default:
20512 return false;
20513 }
20514}
20515
20516/// Given a scalar cast operation that is extracted from a vector, try to
20517/// vectorize the cast op followed by extraction. This will avoid an expensive
20518/// round-trip between XMM and GPR.
20519static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
20520 const X86Subtarget &Subtarget) {
20521 // TODO: This could be enhanced to handle smaller integer types by peeking
20522 // through an extend.
20523 SDValue Extract = Cast.getOperand(0);
20524 MVT DestVT = Cast.getSimpleValueType();
20525 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20526 !isa<ConstantSDNode>(Extract.getOperand(1)))
20527 return SDValue();
20528
20529 // See if we have a 128-bit vector cast op for this type of cast.
20530 SDValue VecOp = Extract.getOperand(0);
20531 MVT FromVT = VecOp.getSimpleValueType();
20532 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20533 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
20534 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20535 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20536 return SDValue();
20537
20538 // If we are extracting from a non-zero element, first shuffle the source
20539 // vector to allow extracting from element zero.
20540 SDLoc DL(Cast);
20541 if (!isNullConstant(Extract.getOperand(1))) {
20542 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20543 Mask[0] = Extract.getConstantOperandVal(1);
20544 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20545 }
20546 // If the source vector is wider than 128-bits, extract the low part. Do not
20547 // create an unnecessarily wide vector cast op.
20548 if (FromVT != Vec128VT)
20549 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20550
20551 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20552 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20553 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20555 DAG.getIntPtrConstant(0, DL));
20556}
20557
20558/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20559/// try to vectorize the cast ops. This will avoid an expensive round-trip
20560/// between XMM and GPR.
20561static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
20562 const X86Subtarget &Subtarget) {
20563 // TODO: Allow FP_TO_UINT.
20564 SDValue CastToInt = CastToFP.getOperand(0);
20565 MVT VT = CastToFP.getSimpleValueType();
20566 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
20567 return SDValue();
20568
20569 MVT IntVT = CastToInt.getSimpleValueType();
20570 SDValue X = CastToInt.getOperand(0);
20571 MVT SrcVT = X.getSimpleValueType();
20572 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20573 return SDValue();
20574
20575 // See if we have 128-bit vector cast instructions for this type of cast.
20576 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20577 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20578 IntVT != MVT::i32)
20579 return SDValue();
20580
20581 unsigned SrcSize = SrcVT.getSizeInBits();
20582 unsigned IntSize = IntVT.getSizeInBits();
20583 unsigned VTSize = VT.getSizeInBits();
20584 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
20585 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
20586 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
20587
20588 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
20589 unsigned ToIntOpcode =
20590 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20591 unsigned ToFPOpcode =
20592 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20593
20594 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20595 //
20596 // We are not defining the high elements (for example, by zeroing them)
20597 // because that could nullify any performance advantage we hoped to gain
20598 // from this vector op hack. We do not expect any adverse effects (like
20599 // denorm penalties) with cast ops.
20600 SDLoc DL(CastToFP);
20601 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
20602 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20603 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20604 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20605 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20606}
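
The scalar pattern this targets, written out in plain C++ for illustration (valid only when the value fits in the integer type):

    double truncateToIntegerValue(double X) {
      return (double)(int)X; // sint_to_fp (fp_to_sint X), almost an ftrunc
    }

With SSE2 this becomes cvttpd2dq + cvtdq2pd on a 128-bit vector holding X in lane 0, followed by an extract of lane 0, so the intermediate integer never travels through a general-purpose register.
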
20607
20608static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
20609 const X86Subtarget &Subtarget) {
20610 SDLoc DL(Op);
20611 bool IsStrict = Op->isStrictFPOpcode();
20612 MVT VT = Op->getSimpleValueType(0);
20613 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20614
20615 if (Subtarget.hasDQI()) {
20616    assert(!Subtarget.hasVLX() && "Unexpected features");
20617
20618    assert((Src.getSimpleValueType() == MVT::v2i64 ||
20619            Src.getSimpleValueType() == MVT::v4i64) &&
20620           "Unsupported custom type");
20621
20622 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20623    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20624           "Unexpected VT!");
20625 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20626
20627 // Need to concat with zero vector for strict fp to avoid spurious
20628 // exceptions.
20629 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20630 : DAG.getUNDEF(MVT::v8i64);
20631 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20632 DAG.getIntPtrConstant(0, DL));
20633 SDValue Res, Chain;
20634 if (IsStrict) {
20635 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20636 {Op->getOperand(0), Src});
20637 Chain = Res.getValue(1);
20638 } else {
20639 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20640 }
20641
20642 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20643 DAG.getIntPtrConstant(0, DL));
20644
20645 if (IsStrict)
20646 return DAG.getMergeValues({Res, Chain}, DL);
20647 return Res;
20648 }
20649
20650 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20651 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20652 if (VT != MVT::v4f32 || IsSigned)
20653 return SDValue();
20654
20655 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20656 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20657 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20658 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20659 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20660 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20661 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20662 SmallVector<SDValue, 4> SignCvts(4);
20663 SmallVector<SDValue, 4> Chains(4);
20664 for (int i = 0; i != 4; ++i) {
20665 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20666 DAG.getIntPtrConstant(i, DL));
20667 if (IsStrict) {
20668 SignCvts[i] =
20669 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20670 {Op.getOperand(0), Elt});
20671 Chains[i] = SignCvts[i].getValue(1);
20672 } else {
20673 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20674 }
20675 }
20676 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20677
20678 SDValue Slow, Chain;
20679 if (IsStrict) {
20680 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20681 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20682 {Chain, SignCvt, SignCvt});
20683 Chain = Slow.getValue(1);
20684 } else {
20685 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20686 }
20687
20688 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20689 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20690
20691 if (IsStrict)
20692 return DAG.getMergeValues({Cvt, Chain}, DL);
20693
20694 return Cvt;
20695}
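
A scalar sketch of the unsigned path above (the non-DQI case), assuming only signed i64 -> f32 conversion is available; it restates the halve-keep-the-low-bit-then-double trick for a single element.

    #include <cstdint>

    float u64ToF32(uint64_t U) {
      if ((int64_t)U >= 0)
        return (float)(int64_t)U;          // already fits the signed range
      // Halve while keeping the low bit sticky so the final rounding is
      // unchanged, convert as signed, then double the result with an FP add.
      uint64_t Half = (U >> 1) | (U & 1);
      float F = (float)(int64_t)Half;
      return F + F;
    }
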
20696
20697SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20698 SelectionDAG &DAG) const {
20699 bool IsStrict = Op->isStrictFPOpcode();
20700 unsigned OpNo = IsStrict ? 1 : 0;
20701 SDValue Src = Op.getOperand(OpNo);
20702 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20703 MVT SrcVT = Src.getSimpleValueType();
20704 MVT VT = Op.getSimpleValueType();
20705 SDLoc dl(Op);
20706
20707 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20708 return LowerWin64_INT128_TO_FP(Op, DAG);
20709
20710 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20711 return Extract;
20712
20713 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20714 return R;
20715
20716 if (SrcVT.isVector()) {
20717 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20718 // Note: since v2f64 is a legal type, we don't need to zero-extend the
20719 // source for strict FP.
20720 if (IsStrict)
20721 return DAG.getNode(
20722 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20723 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20724 DAG.getUNDEF(SrcVT))});
20725 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20726 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20727 DAG.getUNDEF(SrcVT)));
20728 }
20729 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20730 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20731
20732 return SDValue();
20733 }
20734
20735  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20736         "Unknown SINT_TO_FP to lower!");
20737
20738 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20739
20740 // These are really Legal; return the operand so the caller accepts it as
20741 // Legal.
20742 if (SrcVT == MVT::i32 && UseSSEReg)
20743 return Op;
20744 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20745 return Op;
20746
20747 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20748 return V;
20749 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20750 return V;
20751
20752 // SSE doesn't have an i16 conversion so we need to promote.
20753 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20754 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20755 if (IsStrict)
20756 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20757 {Chain, Ext});
20758
20759 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20760 }
20761
20762 if (VT == MVT::f128)
20763 return SDValue();
20764
20765 SDValue ValueToStore = Src;
20766 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20767 // Bitcasting to f64 here allows us to do a single 64-bit store from
20768 // an SSE register, avoiding the store forwarding penalty that would come
20769 // with two 32-bit stores.
20770 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20771
20772 unsigned Size = SrcVT.getStoreSize();
20773 Align Alignment(Size);
20774 MachineFunction &MF = DAG.getMachineFunction();
20775 auto PtrVT = getPointerTy(MF.getDataLayout());
20776 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20777 MachinePointerInfo MPI =
20778 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20779 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20780 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20781 std::pair<SDValue, SDValue> Tmp =
20782 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20783
20784 if (IsStrict)
20785 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20786
20787 return Tmp.first;
20788}
20789
20790std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20791 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20792 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20793 // Build the FILD
20794 SDVTList Tys;
20795 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20796 if (useSSE)
20797 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20798 else
20799 Tys = DAG.getVTList(DstVT, MVT::Other);
20800
20801 SDValue FILDOps[] = {Chain, Pointer};
20802 SDValue Result =
20803 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20804 Alignment, MachineMemOperand::MOLoad);
20805 Chain = Result.getValue(1);
20806
20807 if (useSSE) {
20808 MachineFunction &MF = DAG.getMachineFunction();
20809 unsigned SSFISize = DstVT.getStoreSize();
20810 int SSFI =
20811 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20812 auto PtrVT = getPointerTy(MF.getDataLayout());
20813 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20814 Tys = DAG.getVTList(MVT::Other);
20815 SDValue FSTOps[] = {Chain, Result, StackSlot};
20816 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20817 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20818 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20819
20820 Chain =
20821 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20822 Result = DAG.getLoad(
20823 DstVT, DL, Chain, StackSlot,
20824 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20825 Chain = Result.getValue(1);
20826 }
20827
20828 return { Result, Chain };
20829}
20830
20831/// Horizontal vector math instructions may be slower than normal math with
20832/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20833/// implementation, and likely shuffle complexity of the alternate sequence.
20834static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20835 const X86Subtarget &Subtarget) {
20836 bool IsOptimizingSize = DAG.shouldOptForSize();
20837 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20838 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20839}
20840
20841/// 64-bit unsigned integer to double expansion.
20842static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20843 const X86Subtarget &Subtarget) {
20844 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20845 // when converting 0 while rounding toward negative infinity. The caller will
20846 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20847  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20848 // This algorithm is not obvious. Here is what we're trying to output:
20849 /*
20850 movq %rax, %xmm0
20851 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20852 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20853 #ifdef __SSE3__
20854 haddpd %xmm0, %xmm0
20855 #else
20856 pshufd $0x4e, %xmm0, %xmm1
20857 addpd %xmm1, %xmm0
20858 #endif
20859 */
20860
20861 SDLoc dl(Op);
20862 LLVMContext *Context = DAG.getContext();
20863
20864 // Build some magic constants.
20865 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20866 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20867 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20868 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20869
20870 SmallVector<Constant*,2> CV1;
20871 CV1.push_back(
20872 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20873 APInt(64, 0x4330000000000000ULL))));
20874 CV1.push_back(
20875 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20876 APInt(64, 0x4530000000000000ULL))));
20877 Constant *C1 = ConstantVector::get(CV1);
20878 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20879
20880 // Load the 64-bit value into an XMM register.
20881 SDValue XR1 =
20882 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20883 SDValue CLod0 = DAG.getLoad(
20884 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20885 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20886 SDValue Unpck1 =
20887 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20888
20889 SDValue CLod1 = DAG.getLoad(
20890 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20891 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20892 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20893 // TODO: Are there any fast-math-flags to propagate here?
20894 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20895 SDValue Result;
20896
20897 if (Subtarget.hasSSE3() &&
20898 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20899 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20900 } else {
20901 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20902 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20903 }
20904 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20905 DAG.getIntPtrConstant(0, dl));
20906 return Result;
20907}
20908
20909/// 32-bit unsigned integer to float expansion.
20910static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20911 const X86Subtarget &Subtarget) {
20912 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20913 SDLoc dl(Op);
20914 // FP constant to bias correct the final result.
20915 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20916 MVT::f64);
20917
20918 // Load the 32-bit value into an XMM register.
20919 SDValue Load =
20920 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20921
20922 // Zero out the upper parts of the register.
20923 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20924
20925 // Or the load with the bias.
20926 SDValue Or = DAG.getNode(
20927 ISD::OR, dl, MVT::v2i64,
20928 DAG.getBitcast(MVT::v2i64, Load),
20929 DAG.getBitcast(MVT::v2i64,
20930 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20931 Or =
20932 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20933 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20934
20935 if (Op.getNode()->isStrictFPOpcode()) {
20936 // Subtract the bias.
20937 // TODO: Are there any fast-math-flags to propagate here?
20938 SDValue Chain = Op.getOperand(0);
20939 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20940 {Chain, Or, Bias});
20941
20942 if (Op.getValueType() == Sub.getValueType())
20943 return Sub;
20944
20945 // Handle final rounding.
20946 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20947 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20948
20949 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20950 }
20951
20952 // Subtract the bias.
20953 // TODO: Are there any fast-math-flags to propagate here?
20954 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20955
20956 // Handle final rounding.
20957 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20958}
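
The same kind of bias trick for the 32-bit case, restated as scalar C++; the lowering above then handles the final rounding to the destination type.

    #include <cstdint>
    #include <cstring>

    double u32ToDouble(uint32_t U) {
      uint64_t Bits = 0x4330000000000000ULL | U; // encodes 2^52 + U exactly
      uint64_t BiasBits = 0x4330000000000000ULL; // 2^52
      double D, Bias;
      std::memcpy(&D, &Bits, sizeof(double));
      std::memcpy(&Bias, &BiasBits, sizeof(double));
      return D - Bias; // exactly U, as a double
    }
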
20959
20960static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20961 const X86Subtarget &Subtarget,
20962 const SDLoc &DL) {
20963 if (Op.getSimpleValueType() != MVT::v2f64)
20964 return SDValue();
20965
20966 bool IsStrict = Op->isStrictFPOpcode();
20967
20968 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20969  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20970
20971 if (Subtarget.hasAVX512()) {
20972 if (!Subtarget.hasVLX()) {
20973 // Let generic type legalization widen this.
20974 if (!IsStrict)
20975 return SDValue();
20976 // Otherwise pad the integer input with 0s and widen the operation.
20977 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20978 DAG.getConstant(0, DL, MVT::v2i32));
20979 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20980 {Op.getOperand(0), N0});
20981 SDValue Chain = Res.getValue(1);
20982 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20983 DAG.getIntPtrConstant(0, DL));
20984 return DAG.getMergeValues({Res, Chain}, DL);
20985 }
20986
20987 // Legalize to v4i32 type.
20988 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20989 DAG.getUNDEF(MVT::v2i32));
20990 if (IsStrict)
20991 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20992 {Op.getOperand(0), N0});
20993 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20994 }
20995
20996 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20997 // This gives us the floating point equivalent of 2^52 + the i32 integer
20998 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20999 // point leaving just our i32 integers in double format.
21000 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21001 SDValue VBias =
21002 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
21003 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21004 DAG.getBitcast(MVT::v2i64, VBias));
21005 Or = DAG.getBitcast(MVT::v2f64, Or);
21006
21007 if (IsStrict)
21008 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21009 {Op.getOperand(0), Or, VBias});
21010 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21011}
21012
21013static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21014 const X86Subtarget &Subtarget) {
21015 SDLoc DL(Op);
21016 bool IsStrict = Op->isStrictFPOpcode();
21017 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21018 MVT VecIntVT = V.getSimpleValueType();
21019  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21020         "Unsupported custom type");
21021
21022 if (Subtarget.hasAVX512()) {
21023 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21024    assert(!Subtarget.hasVLX() && "Unexpected features");
21025 MVT VT = Op->getSimpleValueType(0);
21026
21027 // v8i32->v8f64 is legal with AVX512 so just return it.
21028 if (VT == MVT::v8f64)
21029 return Op;
21030
21031    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21032           "Unexpected VT!");
21033 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21034 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21035 // Need to concat with zero vector for strict fp to avoid spurious
21036 // exceptions.
21037 SDValue Tmp =
21038 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21039 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21040 DAG.getIntPtrConstant(0, DL));
21041 SDValue Res, Chain;
21042 if (IsStrict) {
21043 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21044 {Op->getOperand(0), V});
21045 Chain = Res.getValue(1);
21046 } else {
21047 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21048 }
21049
21050 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21051 DAG.getIntPtrConstant(0, DL));
21052
21053 if (IsStrict)
21054 return DAG.getMergeValues({Res, Chain}, DL);
21055 return Res;
21056 }
21057
21058 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21059 Op->getSimpleValueType(0) == MVT::v4f64) {
21060 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21061 Constant *Bias = ConstantFP::get(
21062 *DAG.getContext(),
21063 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21064 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21065 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21066 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21067 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21068 SDValue VBias = DAG.getMemIntrinsicNode(
21069 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21070 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21071 MachineMemOperand::MOLoad);
21072
21073 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21074 DAG.getBitcast(MVT::v4i64, VBias));
21075 Or = DAG.getBitcast(MVT::v4f64, Or);
21076
21077 if (IsStrict)
21078 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21079 {Op.getOperand(0), Or, VBias});
21080 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21081 }
21082
21083 // The algorithm is the following:
21084 // #ifdef __SSE4_1__
21085 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21086 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21087 // (uint4) 0x53000000, 0xaa);
21088 // #else
21089 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21090 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21091 // #endif
21092 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21093 // return (float4) lo + fhi;
21094
21095 bool Is128 = VecIntVT == MVT::v4i32;
21096 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21097 // If we convert to something else than the supported type, e.g., to v4f64,
21098 // abort early.
21099 if (VecFloatVT != Op->getSimpleValueType(0))
21100 return SDValue();
21101
21102 // In the #ifdef/#else code, we have in common:
21103 // - The vector of constants:
21104 // -- 0x4b000000
21105 // -- 0x53000000
21106 // - A shift:
21107 // -- v >> 16
21108
21109 // Create the splat vector for 0x4b000000.
21110 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21111 // Create the splat vector for 0x53000000.
21112 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21113
21114 // Create the right shift.
21115 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21116 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21117
21118 SDValue Low, High;
21119 if (Subtarget.hasSSE41()) {
21120 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21121 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21122 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21123 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21124 // Low will be bitcasted right away, so do not bother bitcasting back to its
21125 // original type.
21126 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21127 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21128 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21129 // (uint4) 0x53000000, 0xaa);
21130 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21131 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21132 // High will be bitcasted right away, so do not bother bitcasting back to
21133 // its original type.
21134 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21135 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21136 } else {
21137 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21138 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21139 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21140 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21141
21142 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21143 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21144 }
21145
21146 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21147 SDValue VecCstFSub = DAG.getConstantFP(
21148 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21149
21150 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21151 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21152 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
21153 // enabled. See PR24512.
21154 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21155 // TODO: Are there any fast-math-flags to propagate here?
21156 // (float4) lo;
21157 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21158 // return (float4) lo + fhi;
21159 if (IsStrict) {
21160 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21161 {Op.getOperand(0), HighBitcast, VecCstFSub});
21162 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21163 {FHigh.getValue(1), LowBitcast, FHigh});
21164 }
21165
21166 SDValue FHigh =
21167 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21168 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21169}
21170
21171static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
21172 const X86Subtarget &Subtarget) {
21173 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21174 SDValue N0 = Op.getOperand(OpNo);
21175 MVT SrcVT = N0.getSimpleValueType();
21176 SDLoc dl(Op);
21177
21178 switch (SrcVT.SimpleTy) {
21179 default:
21180 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21181 case MVT::v2i32:
21182 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
21183 case MVT::v4i32:
21184 case MVT::v8i32:
21185 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
21186 case MVT::v2i64:
21187 case MVT::v4i64:
21188 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21189 }
21190}
21191
21192SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21193 SelectionDAG &DAG) const {
21194 bool IsStrict = Op->isStrictFPOpcode();
21195 unsigned OpNo = IsStrict ? 1 : 0;
21196 SDValue Src = Op.getOperand(OpNo);
21197 SDLoc dl(Op);
21198 auto PtrVT = getPointerTy(DAG.getDataLayout());
21199 MVT SrcVT = Src.getSimpleValueType();
21200 MVT DstVT = Op->getSimpleValueType(0);
21201 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21202
21203 if (DstVT == MVT::f128)
21204 return SDValue();
21205
21206 if (DstVT.isVector())
21207 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
21208
21209 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21210 return LowerWin64_INT128_TO_FP(Op, DAG);
21211
21212 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21213 return Extract;
21214
21215 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21216 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21217 // Conversions from unsigned i32 to f32/f64 are legal,
21218 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21219 return Op;
21220 }
21221
21222 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21223 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21224 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21225 if (IsStrict)
21226 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21227 {Chain, Src});
21228 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21229 }
21230
21231 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21232 return V;
21233 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21234 return V;
21235
21236 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21237 // infinity. It produces -0.0, so disable under strictfp.
21238 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21239 !IsStrict)
21240 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
21241 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21242 // negative infinity. So disable under strictfp. Using FILD instead.
21243 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21244 !IsStrict)
21245 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
21246 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21247 (DstVT == MVT::f32 || DstVT == MVT::f64))
21248 return SDValue();
21249
21250 // Make a 64-bit buffer, and use it to build an FILD.
21251 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21252 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21253 Align SlotAlign(8);
21254 MachinePointerInfo MPI =
21255 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21256 if (SrcVT == MVT::i32) {
21257 SDValue OffsetSlot =
21258 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
21259 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21260 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21261 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21262 std::pair<SDValue, SDValue> Tmp =
21263 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21264 if (IsStrict)
21265 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21266
21267 return Tmp.first;
21268 }
21269
21270 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21271 SDValue ValueToStore = Src;
21272 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21273 // Bitcasting to f64 here allows us to do a single 64-bit store from
21274 // an SSE register, avoiding the store forwarding penalty that would come
21275 // with two 32-bit stores.
21276 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21277 }
21278 SDValue Store =
21279 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21280 // For i64 source, we need to add the appropriate power of 2 if the input
21281 // was negative. We must be careful to do the computation in x87 extended
21282 // precision, not in SSE.
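// [Editor's note] Self-contained scalar model of the fixup implemented below;
// not part of X86ISelLowering.cpp and the helper name is hypothetical. FILD
// interprets the 64-bit pattern as signed, so when the sign bit was set the
// x87 result is exactly 2^64 too small; the f32 constant 0x5F800000 loaded
// from the constant pool below is that 2^64 fudge value.
#include <cstdint>
static long double Uint64ToFP80(uint64_t V) {
  long double R = (long double)(int64_t)V;  // what FILD produces
  if ((int64_t)V < 0)                       // sign bit set?
    R += 18446744073709551616.0L;           // add 2^64 in x87 extended precision
  return R;                                 // caller rounds down to f32/f64 (FP_ROUND)
}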
21283 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21284 SDValue Ops[] = { Store, StackSlot };
21285 SDValue Fild =
21286 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21287 SlotAlign, MachineMemOperand::MOLoad);
21288 Chain = Fild.getValue(1);
21289
21290
21291 // Check whether the sign bit is set.
21292 SDValue SignSet = DAG.getSetCC(
21293 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21294 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21295
21296 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21297 APInt FF(64, 0x5F80000000000000ULL);
21298 SDValue FudgePtr = DAG.getConstantPool(
21299 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21300 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21301
21302 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21303 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21304 SDValue Four = DAG.getIntPtrConstant(4, dl);
21305 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21306 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21307
21308 // Load the value out, extending it from f32 to f80.
21309 SDValue Fudge = DAG.getExtLoad(
21310 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21311 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
21312 CPAlignment);
21313 Chain = Fudge.getValue(1);
21314 // Extend everything to 80 bits to force it to be done on x87.
21315 // TODO: Are there any fast-math-flags to propagate here?
21316 if (IsStrict) {
21317 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
21318 {Chain, Fild, Fudge});
21319 // STRICT_FP_ROUND can't handle equal types.
21320 if (DstVT == MVT::f80)
21321 return Add;
21322 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21323 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
21324 }
21325 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
21326 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21327 DAG.getIntPtrConstant(0, dl));
21328}
21329
21330// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21331// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21332// just return an SDValue().
21333// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21334// to i16, i32 or i64, and we lower it to a legal sequence and return the
21335// result.
21336SDValue
21337X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21338 bool IsSigned, SDValue &Chain) const {
21339 bool IsStrict = Op->isStrictFPOpcode();
21340 SDLoc DL(Op);
21341
21342 EVT DstTy = Op.getValueType();
21343 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21344 EVT TheVT = Value.getValueType();
21345 auto PtrVT = getPointerTy(DAG.getDataLayout());
21346
21347 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21348 // f16 must be promoted before using the lowering in this routine.
21349 // fp128 does not use this lowering.
21350 return SDValue();
21351 }
21352
21353 // If using FIST to compute an unsigned i64, we'll need some fixup
21354 // to handle values above the maximum signed i64. A FIST is always
21355 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21356 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21357
21358 // FIXME: This does not generate an invalid exception if the input does not
21359 // fit in i32. PR44019
21360 if (!IsSigned && DstTy != MVT::i64) {
21361 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21362 // The low 32 bits of the fist result will have the correct uint32 result.
21363 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21364 DstTy = MVT::i64;
21365 }
21366
21367 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21368        DstTy.getSimpleVT() >= MVT::i16 &&
21369        "Unknown FP_TO_INT to lower!");
21370
21371 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21372 // stack slot.
21373 MachineFunction &MF = DAG.getMachineFunction();
21374 unsigned MemSize = DstTy.getStoreSize();
21375 int SSFI =
21376 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21377 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21378
21379 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21380
21381 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21382
21383 if (UnsignedFixup) {
21384 //
21385 // Conversion to unsigned i64 is implemented with a select,
21386 // depending on whether the source value fits in the range
21387 // of a signed i64. Let Thresh be the FP equivalent of
21388 // 0x8000000000000000ULL.
21389 //
21390 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21391 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
21392 // FistSrc = (Value - FltOfs);
21393 // Fist-to-mem64 FistSrc
21394 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21395 // to XOR'ing the high 32 bits with Adjust.
21396 //
21397 // Being a power of 2, Thresh is exactly representable in all FP formats.
21398 // For X87 we'd like to use the smallest FP type for this constant, but
21399 // for DAG type consistency we have to match the FP operand type.
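// [Editor's note] Self-contained scalar model of the Adjust/FltOfs scheme
// described above; not part of X86ISelLowering.cpp, helper name hypothetical.
// (It truncates like a FIST under round-toward-zero; the real lowering honors
// the current rounding mode.)
#include <cstdint>
static uint64_t DoubleToUint64ViaFist(double Value) {
  const double Thresh = 9223372036854775808.0;                 // 2^63, exact in every FP format
  uint64_t Adjust = (Value >= Thresh) ? 0x8000000000000000ULL : 0;
  double FistSrc = (Value >= Thresh) ? Value - Thresh : Value; // now fits in a signed i64
  return (uint64_t)(int64_t)FistSrc ^ Adjust;                  // XOR puts the high bit back
}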
21400
21401 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21402 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
21403 bool LosesInfo = false;
21404 if (TheVT == MVT::f64)
21405 // The rounding mode is irrelevant as the conversion should be exact.
21406 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21407 &LosesInfo);
21408 else if (TheVT == MVT::f80)
21409 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21410 APFloat::rmNearestTiesToEven, &LosesInfo);
21411
21412 assert(Status == APFloat::opOK && !LosesInfo &&
21413        "FP conversion should have been exact");
21414
21415 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21416
21417 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21418 *DAG.getContext(), TheVT);
21419 SDValue Cmp;
21420 if (IsStrict) {
21421 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21422 /*IsSignaling*/ true);
21423 Chain = Cmp.getValue(1);
21424 } else {
21425 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21426 }
21427
21428 // Our preferred lowering of
21429 //
21430 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21431 //
21432 // is
21433 //
21434 // (Value >= Thresh) << 63
21435 //
21436 // but since we can get here after LegalOperations, DAGCombine might do the
21437 // wrong thing if we create a select. So, directly create the preferred
21438 // version.
21439 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21440 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21441 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21442
21443 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21444 DAG.getConstantFP(0.0, DL, TheVT));
21445
21446 if (IsStrict) {
21447 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21448 { Chain, Value, FltOfs });
21449 Chain = Value.getValue(1);
21450 } else
21451 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21452 }
21453
21454 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21455
21456 // FIXME This causes a redundant load/store if the SSE-class value is already
21457 // in memory, such as if it is on the callstack.
21458 if (isScalarFPTypeInSSEReg(TheVT)) {
21459 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21460 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21461 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21462 SDValue Ops[] = { Chain, StackSlot };
21463
21464 unsigned FLDSize = TheVT.getStoreSize();
21465 assert(FLDSize <= MemSize && "Stack slot not big enough");
21466 MachineMemOperand *MMO = MF.getMachineMemOperand(
21467 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21468 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21469 Chain = Value.getValue(1);
21470 }
21471
21472 // Build the FP_TO_INT*_IN_MEM
21473 MachineMemOperand *MMO = MF.getMachineMemOperand(
21474 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21475 SDValue Ops[] = { Chain, Value, StackSlot };
21476 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21477 DAG.getVTList(MVT::Other),
21478 Ops, DstTy, MMO);
21479
21480 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
21481 Chain = Res.getValue(1);
21482
21483 // If we need an unsigned fixup, XOR the result with adjust.
21484 if (UnsignedFixup)
21485 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21486
21487 return Res;
21488}
21489
21490static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
21491 const X86Subtarget &Subtarget) {
21492 MVT VT = Op.getSimpleValueType();
21493 SDValue In = Op.getOperand(0);
21494 MVT InVT = In.getSimpleValueType();
21495 SDLoc dl(Op);
21496 unsigned Opc = Op.getOpcode();
21497
21498 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21499 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
21500        "Unexpected extension opcode");
21501 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21502        "Expected same number of elements");
21503 assert((VT.getVectorElementType() == MVT::i16 ||
21504         VT.getVectorElementType() == MVT::i32 ||
21505         VT.getVectorElementType() == MVT::i64) &&
21506        "Unexpected element type");
21507 assert((InVT.getVectorElementType() == MVT::i8 ||
21508         InVT.getVectorElementType() == MVT::i16 ||
21509         InVT.getVectorElementType() == MVT::i32) &&
21510        "Unexpected element type");
21511
21512 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
21513
21514 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21515 assert(InVT == MVT::v32i8 && "Unexpected VT!");
21516 return splitVectorIntUnary(Op, DAG);
21517 }
21518
21519 if (Subtarget.hasInt256())
21520 return Op;
21521
21522 // Optimize vectors in AVX mode:
21523 //
21524 // v8i16 -> v8i32
21525 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
21526 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
21527 // Concat upper and lower parts.
21528 //
21529 // v4i32 -> v4i64
21530 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
21531 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
21532 // Concat upper and lower parts.
21533 //
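// [Editor's note] Self-contained intrinsic-level sketch of the v8i16 -> v8i32
// zero-extend strategy described above (AVX1 without Int256); not part of
// X86ISelLowering.cpp, helper name hypothetical. Compile with -mavx.
#include <immintrin.h>
static __m256i ZExt_v8i16_to_v8i32(__m128i V) {
  __m128i Lo = _mm_cvtepu16_epi32(V);                      // vpmovzxwd: 4 lower elements
  __m128i Hi = _mm_unpackhi_epi16(V, _mm_setzero_si128()); // vpunpckhwd with zero: 4 upper
  return _mm256_set_m128i(Hi, Lo);                         // concat upper and lower parts
}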
21534 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21535 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21536
21537 // Short-circuit if we can determine that each 128-bit half is the same value.
21538 // Otherwise, this is difficult to match and optimize.
21539 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21540 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21541 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21542
21543 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21544 SDValue Undef = DAG.getUNDEF(InVT);
21545 bool NeedZero = Opc == ISD::ZERO_EXTEND;
21546 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21547 OpHi = DAG.getBitcast(HalfVT, OpHi);
21548
21549 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21550}
21551
21552// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21553static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21554 const SDLoc &dl, SelectionDAG &DAG) {
21555 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21556 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21557 DAG.getIntPtrConstant(0, dl));
21558 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21559 DAG.getIntPtrConstant(8, dl));
21560 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21561 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21562 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21563 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21564}
21565
21566static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
21567 const X86Subtarget &Subtarget,
21568 SelectionDAG &DAG) {
21569 MVT VT = Op->getSimpleValueType(0);
21570 SDValue In = Op->getOperand(0);
21571 MVT InVT = In.getSimpleValueType();
21572 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21573 SDLoc DL(Op);
21574 unsigned NumElts = VT.getVectorNumElements();
21575
21576 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
21577 // avoids a constant pool load.
21578 if (VT.getVectorElementType() != MVT::i8) {
21579 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21580 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21581 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21582 }
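// [Editor's note] Self-contained scalar model of the sign_extend + shift pair
// emitted just above for a single i1 mask bit; not part of X86ISelLowering.cpp,
// helper name hypothetical.
#include <cstdint>
static uint32_t ZeroExtendMaskBit(bool B) {
  uint32_t SExt = B ? 0xFFFFFFFFu : 0u; // SIGN_EXTEND of i1 yields 0 or all-ones
  return SExt >> 31;                    // SRL by (bits - 1) leaves 0 or 1
}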
21583
21584 // Extend VT if BWI is not supported.
21585 MVT ExtVT = VT;
21586 if (!Subtarget.hasBWI()) {
21587 // If v16i32 is to be avoided, we'll need to split and concatenate.
21588 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21589 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21590
21591 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21592 }
21593
21594 // Widen to 512-bits if VLX is not supported.
21595 MVT WideVT = ExtVT;
21596 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21597 NumElts *= 512 / ExtVT.getSizeInBits();
21598 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21599 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
21600 In, DAG.getIntPtrConstant(0, DL));
21601 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
21602 NumElts);
21603 }
21604
21605 SDValue One = DAG.getConstant(1, DL, WideVT);
21606 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21607
21608 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21609
21610 // Truncate if we had to extend above.
21611 if (VT != ExtVT) {
21612 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21613 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21614 }
21615
21616 // Extract back to 128/256-bit if we widened.
21617 if (WideVT != VT)
21618 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21619 DAG.getIntPtrConstant(0, DL));
21620
21621 return SelectedVal;
21622}
21623
21624static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21625 SelectionDAG &DAG) {
21626 SDValue In = Op.getOperand(0);
21627 MVT SVT = In.getSimpleValueType();
21628
21629 if (SVT.getVectorElementType() == MVT::i1)
21630 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
21631
21632 assert(Subtarget.hasAVX() && "Expected AVX support");
21633 return LowerAVXExtend(Op, DAG, Subtarget);
21634}
21635
21636/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21637/// It makes use of the fact that vectors with enough leading sign/zero bits
21638/// prevent the PACKSS/PACKUS from saturating the results.
21639/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21640/// within each 128-bit lane.
21641static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21642 const SDLoc &DL, SelectionDAG &DAG,
21643 const X86Subtarget &Subtarget) {
21644 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21645        "Unexpected PACK opcode");
21646 assert(DstVT.isVector() && "VT not a vector?");
21647
21648 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21649 if (!Subtarget.hasSSE2())
21650 return SDValue();
21651
21652 EVT SrcVT = In.getValueType();
21653
21654 // No truncation required, we might get here due to recursive calls.
21655 if (SrcVT == DstVT)
21656 return In;
21657
21658 // We only support vector truncation to 64bits or greater from a
21659 // 128bits or greater source.
21660 unsigned DstSizeInBits = DstVT.getSizeInBits();
21661 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21662 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
21663 return SDValue();
21664
21665 unsigned NumElems = SrcVT.getVectorNumElements();
21666 if (!isPowerOf2_32(NumElems))
21667 return SDValue();
21668
21669 LLVMContext &Ctx = *DAG.getContext();
21670 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21671 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21672
21673 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21674
21675 // Pack to the largest type possible:
21676 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21677 EVT InVT = MVT::i16, OutVT = MVT::i8;
21678 if (SrcVT.getScalarSizeInBits() > 16 &&
21679 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21680 InVT = MVT::i32;
21681 OutVT = MVT::i16;
21682 }
21683
21684 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
21685 if (SrcVT.is128BitVector()) {
21686 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21687 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21688 In = DAG.getBitcast(InVT, In);
21689 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21690 Res = extractSubVector(Res, 0, DAG, DL, 64);
21691 return DAG.getBitcast(DstVT, Res);
21692 }
21693
21694 // Split lower/upper subvectors.
21695 SDValue Lo, Hi;
21696 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21697
21698 unsigned SubSizeInBits = SrcSizeInBits / 2;
21699 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21700 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21701
21702 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21703 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21704 Lo = DAG.getBitcast(InVT, Lo);
21705 Hi = DAG.getBitcast(InVT, Hi);
21706 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21707 return DAG.getBitcast(DstVT, Res);
21708 }
21709
21710 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21711 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21712 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21713 Lo = DAG.getBitcast(InVT, Lo);
21714 Hi = DAG.getBitcast(InVT, Hi);
21715 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21716
21717 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21718 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21719 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21720 SmallVector<int, 64> Mask;
21721 int Scale = 64 / OutVT.getScalarSizeInBits();
21722 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21723 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21724
21725 if (DstVT.is256BitVector())
21726 return DAG.getBitcast(DstVT, Res);
21727
21728 // If 512bit -> 128bit truncate another stage.
21729 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21730 Res = DAG.getBitcast(PackedVT, Res);
21731 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21732 }
21733
21734 // Recursively pack lower/upper subvectors, concat result and pack again.
21735 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21736 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21737 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21738 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21739
21740 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21741 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21742 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21743}
21744
21745static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21746 const X86Subtarget &Subtarget) {
21747
21748 SDLoc DL(Op);
21749 MVT VT = Op.getSimpleValueType();
21750 SDValue In = Op.getOperand(0);
21751 MVT InVT = In.getSimpleValueType();
21752
21753 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21754
21755 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
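// [Editor's note] Self-contained scalar model of the "shift the LSB into the
// sign bit, then do a signed compare against zero" trick used below; not part
// of X86ISelLowering.cpp, helper name hypothetical. (The uint32 -> int32
// conversion relies on two's-complement wrapping, guaranteed since C++20.)
#include <cstdint>
static bool TruncLaneToI1(uint32_t Lane) {
  uint32_t Shifted = Lane << 31;        // move bit 0 into the sign position
  return 0 > (int32_t)Shifted;          // SETGT(0, X) is true iff the sign bit is set
}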
21756 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21757 if (InVT.getScalarSizeInBits() <= 16) {
21758 if (Subtarget.hasBWI()) {
21759 // legal, will go to VPMOVB2M, VPMOVW2M
21760 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21761 // We need to shift to get the lsb into sign position.
21762 // Shift packed bytes not supported natively, bitcast to word
21763 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21764 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21765 DAG.getBitcast(ExtVT, In),
21766 DAG.getConstant(ShiftInx, DL, ExtVT));
21767 In = DAG.getBitcast(InVT, In);
21768 }
21769 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21770 In, ISD::SETGT);
21771 }
21772 // Use TESTD/Q, extended vector to packed dword/qword.
21773 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21774        "Unexpected vector type.");
21775 unsigned NumElts = InVT.getVectorNumElements();
21776 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21777 // We need to change to a wider element type that we have support for.
21778 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21779 // For 16 element vectors we extend to v16i32 unless we are explicitly
21780 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21781 // we need to split into two 8 element vectors which we can extend to v8i32,
21782 // truncate and concat the results. There's an additional complication if
21783 // the original type is v16i8. In that case we can't split the v16i8
21784 // directly, so we need to shuffle high elements to low and use
21785 // sign_extend_vector_inreg.
21786 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21787 SDValue Lo, Hi;
21788 if (InVT == MVT::v16i8) {
21789 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21790 Hi = DAG.getVectorShuffle(
21791 InVT, DL, In, In,
21792 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21793 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21794 } else {
21795 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21796 Lo = extract128BitVector(In, 0, DAG, DL);
21797 Hi = extract128BitVector(In, 8, DAG, DL);
21798 }
21799 // We're split now, just emit two truncates and a concat. The two
21800 // truncates will trigger legalization to come back to this function.
21801 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21802 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21803 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21804 }
21805 // We either have 8 elements or we're allowed to use 512-bit vectors.
21806 // If we have VLX, we want to use the narrowest vector that can get the
21807 // job done so we use vXi32.
21808 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21809 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21810 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21811 InVT = ExtVT;
21812 ShiftInx = InVT.getScalarSizeInBits() - 1;
21813 }
21814
21815 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21816 // We need to shift to get the lsb into sign position.
21817 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21818 DAG.getConstant(ShiftInx, DL, InVT));
21819 }
21820 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21821 if (Subtarget.hasDQI())
21822 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21823 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21824}
21825
21826SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21827 SDLoc DL(Op);
21828 MVT VT = Op.getSimpleValueType();
21829 SDValue In = Op.getOperand(0);
21830 MVT InVT = In.getSimpleValueType();
21831 unsigned InNumEltBits = InVT.getScalarSizeInBits();
21832
21833 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21834        "Invalid TRUNCATE operation");
21835
21836 // If we're called by the type legalizer, handle a few cases.
21837 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21838 if (!TLI.isTypeLegal(InVT)) {
21839 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21840 VT.is128BitVector()) {
21841 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21842        "Unexpected subtarget!");
21843 // The default behavior is to truncate one step, concatenate, and then
21844 // truncate the remainder. We'd rather produce two 64-bit results and
21845 // concatenate those.
21846 SDValue Lo, Hi;
21847 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21848
21849 EVT LoVT, HiVT;
21850 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21851
21852 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21853 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21854 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21855 }
21856
21857 // Otherwise let default legalization handle it.
21858 return SDValue();
21859 }
21860
21861 if (VT.getVectorElementType() == MVT::i1)
21862 return LowerTruncateVecI1(Op, DAG, Subtarget);
21863
21864 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21865 if (Subtarget.hasAVX512()) {
21866 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21867 assert(VT == MVT::v32i8 && "Unexpected VT!");
21868 return splitVectorIntUnary(Op, DAG);
21869 }
21870
21871 // word to byte only under BWI. Otherwise we have to promote to v16i32
21872 // and then truncate that. But we should only do that if we haven't been
21873 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21874 // handled by isel patterns.
21875 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21876 Subtarget.canExtendTo512DQ())
21877 return Op;
21878 }
21879
21880 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21881 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21882
21883 // Truncate with PACKUS if we are truncating a vector with leading zero bits
21884 // that extend all the way to the packed/truncated value.
21885 // Pre-SSE41 we can only use PACKUSWB.
21886 KnownBits Known = DAG.computeKnownBits(In);
21887 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21888 if (SDValue V =
21889 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21890 return V;
21891
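// [Editor's note] Self-contained intrinsic-level illustration of the PACKUS
// case above; not part of X86ISelLowering.cpp, helper name hypothetical.
// When every 32-bit lane is already known to fit in 16 bits, PACKUSDW's
// unsigned saturation can never fire, so the pack is an exact truncation.
// Requires SSE4.1 (e.g. compile with -msse4.1).
#include <immintrin.h>
static __m128i Trunc_v8i32_to_v8i16(__m128i LoLanes, __m128i HiLanes) {
  return _mm_packus_epi32(LoLanes, HiLanes); // 2 x v4i32 -> v8i16, exact if lanes < 2^16
}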
21892 // Truncate with PACKSS if we are truncating a vector with sign-bits that
21893 // extend all the way to the packed/truncated value.
21894 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21895 if (SDValue V =
21896 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21897 return V;
21898
21899 // Handle truncation of V256 to V128 using shuffles.
21900 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21901
21902 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21903 In = DAG.getBitcast(MVT::v8i32, In);
21904
21905 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21906 if (Subtarget.hasInt256()) {
21907 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21908 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21909 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21910 DAG.getIntPtrConstant(0, DL));
21911 }
21912
21913 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21914 DAG.getIntPtrConstant(0, DL));
21915 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21916 DAG.getIntPtrConstant(4, DL));
21917 static const int ShufMask[] = {0, 2, 4, 6};
21918 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21919 }
21920
21921 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21922 In = DAG.getBitcast(MVT::v32i8, In);
21923
21924 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21925 if (Subtarget.hasInt256()) {
21926 // The PSHUFB mask:
21927 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21928 -1, -1, -1, -1, -1, -1, -1, -1,
21929 16, 17, 20, 21, 24, 25, 28, 29,
21930 -1, -1, -1, -1, -1, -1, -1, -1 };
21931 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21932 In = DAG.getBitcast(MVT::v4i64, In);
21933
21934 static const int ShufMask2[] = {0, 2, -1, -1};
21935 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21936 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21937 DAG.getBitcast(MVT::v16i16, In),
21938 DAG.getIntPtrConstant(0, DL));
21939 }
21940
21941 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21942 DAG.getIntPtrConstant(0, DL));
21943 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21944 DAG.getIntPtrConstant(16, DL));
21945
21946 // The PSHUFB mask:
21947 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21948 -1, -1, -1, -1, -1, -1, -1, -1};
21949
21950 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21951 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21952
21953 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21954 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21955
21956 // The MOVLHPS Mask:
21957 static const int ShufMask2[] = {0, 1, 4, 5};
21958 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21959 return DAG.getBitcast(MVT::v8i16, res);
21960 }
21961
21962 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21963 // Use an AND to zero upper bits for PACKUS.
21964 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21965
21966 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21967 DAG.getIntPtrConstant(0, DL));
21968 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21969 DAG.getIntPtrConstant(8, DL));
21970 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21971 }
21972
21973 llvm_unreachable("All 256->128 cases should have been handled above!");
21974}
21975
21976// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21977// behaves on out of range inputs to generate optimized conversions.
21978static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21979 SelectionDAG &DAG,
21980 const X86Subtarget &Subtarget) {
21981 MVT SrcVT = Src.getSimpleValueType();
21982 unsigned DstBits = VT.getScalarSizeInBits();
21983 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21984
21985 // Calculate the converted result for values in the range 0 to
21986 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21987 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21988 SDValue Big =
21989 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21990 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21991 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21992
21993 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21994 // and only if the value was out of range. So we can use that
21995 // as our indicator that we should rather use "Big" instead of "Small".
21996 //
21997 // Use "Small" if "IsOverflown" has all bits cleared
21998 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21999
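// [Editor's note] Self-contained scalar model of the selection logic
// implemented below; not part of X86ISelLowering.cpp and the helper names are
// hypothetical. (Small >> 31 is the arithmetic shift VSRAI performs;
// well-defined for signed types since C++20.)
#include <cstdint>
static int32_t ModelCvttps2si(float V) {          // models the instruction's overflow result
  if (!(V >= -2147483648.0f && V < 2147483648.0f))
    return INT32_MIN;                             // "integer indefinite": 0x80000000
  return (int32_t)V;
}
static uint32_t ModelFloatToUint32(float V) {
  int32_t Small = ModelCvttps2si(V);
  int32_t Big = ModelCvttps2si(V - 2147483648.0f);
  int32_t IsOverflown = Small >> 31;              // all-ones iff Small overflowed
  return (uint32_t)(Small | (Big & IsOverflown));
  // e.g. V = 3000000000.0f: Small = 0x80000000, Big = 852516352, result = 3000000000
}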
22000 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22001 // use the slightly slower blendv select instead.
22002 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22003 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22004 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22005 }
22006
22007 SDValue IsOverflown =
22008 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22009 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22010 return DAG.getNode(ISD::OR, dl, VT, Small,
22011 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22012}
22013
22014SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22015 bool IsStrict = Op->isStrictFPOpcode();
22016 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22017 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22018 MVT VT = Op->getSimpleValueType(0);
22019 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22020 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22021 MVT SrcVT = Src.getSimpleValueType();
22022 SDLoc dl(Op);
22023
22024 SDValue Res;
22025 if (VT.isVector()) {
22026 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22027 MVT ResVT = MVT::v4i32;
22028 MVT TruncVT = MVT::v4i1;
22029 unsigned Opc;
22030 if (IsStrict)
22031 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22032 else
22033 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22034
22035 if (!IsSigned && !Subtarget.hasVLX()) {
22036 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22037 // Widen to 512-bits.
22038 ResVT = MVT::v8i32;
22039 TruncVT = MVT::v8i1;
22040 Opc = Op.getOpcode();
22041 // Need to concat with zero vector for strict fp to avoid spurious
22042 // exceptions.
22043 // TODO: Should we just do this for non-strict as well?
22044 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22045 : DAG.getUNDEF(MVT::v8f64);
22046 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22047 DAG.getIntPtrConstant(0, dl));
22048 }
22049 if (IsStrict) {
22050 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22051 Chain = Res.getValue(1);
22052 } else {
22053 Res = DAG.getNode(Opc, dl, ResVT, Src);
22054 }
22055
22056 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22057 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22058 DAG.getIntPtrConstant(0, dl));
22059 if (IsStrict)
22060 return DAG.getMergeValues({Res, Chain}, dl);
22061 return Res;
22062 }
22063
22064 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
22065 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
22066 return Op;
22067
22068 MVT ResVT = VT;
22069 MVT EleVT = VT.getVectorElementType();
22070 if (EleVT != MVT::i64)
22071 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
22072
22073 if (SrcVT != MVT::v8f16) {
22074 SDValue Tmp =
22075 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
22076 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
22077 Ops[0] = Src;
22078 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
22079 }
22080
22081 if (IsStrict) {
22082 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
22083 : X86ISD::STRICT_CVTTP2UI,
22084 dl, {ResVT, MVT::Other}, {Chain, Src});
22085 Chain = Res.getValue(1);
22086 } else {
22087 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
22088 ResVT, Src);
22089 }
22090
22091 // TODO: Need to add exception check code for strict FP.
22092 if (EleVT.getSizeInBits() < 16) {
22093 ResVT = MVT::getVectorVT(EleVT, 8);
22094 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
22095 }
22096
22097 if (ResVT != VT)
22098 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22099 DAG.getIntPtrConstant(0, dl));
22100
22101 if (IsStrict)
22102 return DAG.getMergeValues({Res, Chain}, dl);
22103 return Res;
22104 }
22105
22106 if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
22107 if (IsStrict) {
22108 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
22109 : ISD::STRICT_FP_TO_UINT,
22110 dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
22111 Chain = Res.getValue(1);
22112 } else {
22113 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
22114 MVT::v8i32, Src);
22115 }
22116
22117 // TODO: Need to add exception check code for strict FP.
22118 Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
22119
22120 if (IsStrict)
22121 return DAG.getMergeValues({Res, Chain}, dl);
22122 return Res;
22123 }
22124
22125 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
22126 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
22127 assert(!IsSigned && "Expected unsigned conversion!");
22128 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
22129 return Op;
22130 }
22131
22132 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
22133 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
22134 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
22135 Subtarget.useAVX512Regs()) {
22136 assert(!IsSigned && "Expected unsigned conversion!");
22137 assert(!Subtarget.hasVLX() && "Unexpected features!");
22138 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22139 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22140 // Need to concat with zero vector for strict fp to avoid spurious
22141 // exceptions.
22142 // TODO: Should we just do this for non-strict as well?
22143 SDValue Tmp =
22144 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22145 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22146 DAG.getIntPtrConstant(0, dl));
22147
22148 if (IsStrict) {
22149 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
22150 {Chain, Src});
22151 Chain = Res.getValue(1);
22152 } else {
22153 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
22154 }
22155
22156 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22157 DAG.getIntPtrConstant(0, dl));
22158
22159 if (IsStrict)
22160 return DAG.getMergeValues({Res, Chain}, dl);
22161 return Res;
22162 }
22163
22164 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
22165 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
22166 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
22167 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
22168 assert(!Subtarget.hasVLX() && "Unexpected features!");
22169 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
22170 // Need to concat with zero vector for strict fp to avoid spurious
22171 // exceptions.
22172 // TODO: Should we just do this for non-strict as well?
22173 SDValue Tmp =
22174 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22175 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22176 DAG.getIntPtrConstant(0, dl));
22177
22178 if (IsStrict) {
22179 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22180 {Chain, Src});
22181 Chain = Res.getValue(1);
22182 } else {
22183 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
22184 }
22185
22186 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22187 DAG.getIntPtrConstant(0, dl));
22188
22189 if (IsStrict)
22190 return DAG.getMergeValues({Res, Chain}, dl);
22191 return Res;
22192 }
22193
22194 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
22195 if (!Subtarget.hasVLX()) {
22196 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
22197 // legalizer and then widened again by vector op legalization.
22198 if (!IsStrict)
22199 return SDValue();
22200
22201 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
22202 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
22203 {Src, Zero, Zero, Zero});
22204 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22205 {Chain, Tmp});
22206 SDValue Chain = Tmp.getValue(1);
22207 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
22208 DAG.getIntPtrConstant(0, dl));
22209 return DAG.getMergeValues({Tmp, Chain}, dl);
22210 }
22211
22212 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
22213 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
22214 DAG.getUNDEF(MVT::v2f32));
22215 if (IsStrict) {
22216 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
22217 : X86ISD::STRICT_CVTTP2UI;
22218 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
22219 }
22220 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22221 return DAG.getNode(Opc, dl, VT, Tmp);
22222 }
22223
22224 // Generate optimized instructions for pre-AVX512 unsigned conversions from
22225 // vXf32/vXf64 to vXi32.
22226 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
22227 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
22228 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
22229 assert(!IsSigned && "Expected unsigned conversion!");
22230 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
22231 }
22232
22233 return SDValue();
22234 }
22235
22236 assert(!VT.isVector());
22237
22238 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
22239
22240 if (!IsSigned && UseSSEReg) {
22241 // Conversions from f32/f64 with AVX512 should be legal.
22242 if (Subtarget.hasAVX512())
22243 return Op;
22244
22245 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
22246 // behaves on out of range inputs to generate optimized conversions.
22247 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
22248 (VT == MVT::i64 && Subtarget.is64Bit()))) {
22249 unsigned DstBits = VT.getScalarSizeInBits();
22250 APInt UIntLimit = APInt::getSignMask(DstBits);
22251 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
22252 DAG.getConstant(UIntLimit, dl, VT));
22253 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
22254
22255 // Calculate the converted result for values in the range:
22256 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22257 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
22258 SDValue Small =
22259 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
22260 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
22261 SDValue Big = DAG.getNode(
22262 X86ISD::CVTTS2SI, dl, VT,
22263 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
22264 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
22265
22266 // The "CVTTS2SI" instruction conveniently sets the sign bit if
22267 // and only if the value was out of range. So we can use that
22268 // as our indicator that we rather use "Big" instead of "Small".
22269 //
22270 // Use "Small" if "IsOverflown" has all bits cleared
22271 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22272 SDValue IsOverflown = DAG.getNode(
22273 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
22274 return DAG.getNode(ISD::OR, dl, VT, Small,
22275 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22276 }
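// Illustrative walk-through for the i32 case: with Src = 3.0e9f,
// FloatOffset = 2147483648.0f (2^31). Small = cvttss2si(3.0e9f) = 0x80000000
// (the integer indefinite value, sign bit set), and
// Big = cvttss2si(3.0e9f - 2^31) = 852516352. IsOverflown =
// 0x80000000 >> 31 (arithmetic) = 0xFFFFFFFF, so the final OR yields
// 0x80000000 | 852516352 = 3000000000, the exact unsigned result. For an
// in-range input such as 100.0f, Small = 100, IsOverflown = 0, and the OR
// leaves Small unchanged.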
22277
22278 // Use default expansion for i64.
22279 if (VT == MVT::i64)
22280 return SDValue();
22281
22282 assert(VT == MVT::i32 && "Unexpected VT!");
22283
22284 // Promote i32 to i64 and use a signed operation on 64-bit targets.
22285 // FIXME: This does not generate an invalid exception if the input does not
22286 // fit in i32. PR44019
22287 if (Subtarget.is64Bit()) {
22288 if (IsStrict) {
22289 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
22290 {Chain, Src});
22291 Chain = Res.getValue(1);
22292 } else
22293 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
22294
22295 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22296 if (IsStrict)
22297 return DAG.getMergeValues({Res, Chain}, dl);
22298 return Res;
22299 }
22300
22301 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
22302 // use fisttp which will be handled later.
22303 if (!Subtarget.hasSSE3())
22304 return SDValue();
22305 }
22306
22307 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
22308 // FIXME: This does not generate an invalid exception if the input does not
22309 // fit in i16. PR44019
22310 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
22311 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
22312 if (IsStrict) {
22313 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
22314 {Chain, Src});
22315 Chain = Res.getValue(1);
22316 } else
22317 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
22318
22319 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22320 if (IsStrict)
22321 return DAG.getMergeValues({Res, Chain}, dl);
22322 return Res;
22323 }
22324
22325 // If this is a FP_TO_SINT using SSEReg we're done.
22326 if (UseSSEReg && IsSigned)
22327 return Op;
22328
22329 // fp128 needs to use a libcall.
22330 if (SrcVT == MVT::f128) {
22331 RTLIB::Libcall LC;
22332 if (IsSigned)
22333 LC = RTLIB::getFPTOSINT(SrcVT, VT);
22334 else
22335 LC = RTLIB::getFPTOUINT(SrcVT, VT);
22336
22337 MakeLibCallOptions CallOptions;
22338 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
22339 SDLoc(Op), Chain);
22340
22341 if (IsStrict)
22342 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
22343
22344 return Tmp.first;
22345 }
22346
22347 // Fall back to X87.
22348 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
22349 if (IsStrict)
22350 return DAG.getMergeValues({V, Chain}, dl);
22351 return V;
22352 }
22353
22354 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22354)
;
22355}
22356
22357SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
22358 SelectionDAG &DAG) const {
22359 SDValue Src = Op.getOperand(0);
22360 MVT SrcVT = Src.getSimpleValueType();
22361
22362 // If the source is in an SSE register, the node is Legal.
22363 if (isScalarFPTypeInSSEReg(SrcVT))
22364 return Op;
22365
22366 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22367}
22368
22369SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22370 SelectionDAG &DAG) const {
22371 EVT DstVT = N->getValueType(0);
22372 SDValue Src = N->getOperand(0);
22373 EVT SrcVT = Src.getValueType();
22374
22375 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22376 // f16 must be promoted before using the lowering in this routine.
22377 // fp128 does not use this lowering.
22378 return SDValue();
22379 }
22380
22381 SDLoc DL(N);
22382 SDValue Chain = DAG.getEntryNode();
22383
22384 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22385
22386 // If we're converting from SSE, the stack slot needs to hold both types.
22387 // Otherwise it only needs to hold the DstVT.
22388 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22389 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22390 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22391 MachinePointerInfo MPI =
22392 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22393
22394 if (UseSSE) {
22395 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22396 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22397 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22398 SDValue Ops[] = { Chain, StackPtr };
22399
22400 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22401 /*Align*/ None, MachineMemOperand::MOLoad);
22402 Chain = Src.getValue(1);
22403 }
22404
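// At this point Src is a value that will be materialized in an x87 register
// (either the original operand or the f80 FLD reload above). X86ISD::FIST
// stores it to the stack slot as a DstVT integer using the x87 control
// word's current rounding mode, which matches the lrint/llrint semantics;
// the plain load below then yields the integer result.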
22405 SDValue StoreOps[] = { Chain, Src, StackPtr };
22406 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22407 StoreOps, DstVT, MPI, /*Align*/ None,
22408 MachineMemOperand::MOStore);
22409
22410 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22411}
22412
22413SDValue
22414X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22415 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22416 // but making use of X86 specifics to produce better instruction sequences.
22417 SDNode *Node = Op.getNode();
22418 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22419 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22420 SDLoc dl(SDValue(Node, 0));
22421 SDValue Src = Node->getOperand(0);
22422
22423 // There are three types involved here: SrcVT is the source floating point
22424 // type, DstVT is the type of the result, and TmpVT is the result of the
22425 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22426 // DstVT).
22427 EVT SrcVT = Src.getValueType();
22428 EVT DstVT = Node->getValueType(0);
22429 EVT TmpVT = DstVT;
22430
22431 // This code is only for floats and doubles. Fall back to generic code for
22432 // anything else.
22433 if (!isScalarFPTypeInSSEReg(SrcVT))
22434 return SDValue();
22435
22436 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22437 unsigned SatWidth = SatVT.getScalarSizeInBits();
22438 unsigned DstWidth = DstVT.getScalarSizeInBits();
22439 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22440 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22441 "Expected saturation width smaller than result width");
22442
22443 // Promote result of FP_TO_*INT to at least 32 bits.
22444 if (TmpWidth < 32) {
22445 TmpVT = MVT::i32;
22446 TmpWidth = 32;
22447 }
22448
22449 // Promote conversions to unsigned 32-bit results to 64-bit, because that
22450 // allows us to use a native signed conversion instead.
22451 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22452 TmpVT = MVT::i64;
22453 TmpWidth = 64;
22454 }
22455
22456 // If the saturation width is smaller than the size of the temporary result,
22457 // we can always use signed conversion, which is native.
22458 if (SatWidth < TmpWidth)
22459 FpToIntOpcode = ISD::FP_TO_SINT;
22460
22461 // Determine minimum and maximum integer values and their corresponding
22462 // floating-point values.
22463 APInt MinInt, MaxInt;
22464 if (IsSigned) {
22465 MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
22466 MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
22467 } else {
22468 MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
22469 MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
22470 }
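// For example, a saturating f32 -> i8 conversion has SatWidth = 8, giving
// MinInt = -128 and MaxInt = 127; both bounds convert exactly to -128.0f and
// 127.0f, so the min+max+fptoi path below is taken, and a NaN input becomes
// INDVAL whose set top bit is discarded by the final truncate, yielding 0.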
22471
22472 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22473 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22474
22475 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22476 MinInt, IsSigned, APFloat::rmTowardZero);
22477 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22478 MaxInt, IsSigned, APFloat::rmTowardZero);
22479 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22480 && !(MaxStatus & APFloat::opStatus::opInexact);
22481
22482 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22483 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22484
22485 // If the integer bounds are exactly representable as floats, emit a
22486 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22487 if (AreExactFloatBounds) {
22488 if (DstVT != TmpVT) {
22489 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22490 SDValue MinClamped = DAG.getNode(
22491 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22492 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22493 SDValue BothClamped = DAG.getNode(
22494 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22495 // Convert clamped value to integer.
22496 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22497
22498 // NaN will become INDVAL, with the top bit set and the rest zero.
22499 // Truncation will discard the top bit, resulting in zero.
22500 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22501 }
22502
22503 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22504 SDValue MinClamped = DAG.getNode(
22505 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22506 // Clamp by MaxFloat from above. NaN cannot occur.
22507 SDValue BothClamped = DAG.getNode(
22508 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22509 // Convert clamped value to integer.
22510 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22511
22512 if (!IsSigned) {
22513 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22514 // which is zero.
22515 return FpToInt;
22516 }
22517
22518 // Otherwise, select zero if Src is NaN.
22519 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22520 return DAG.getSelectCC(
22521 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22522 }
22523
22524 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22525 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22526
22527 // Result of direct conversion, which may be selected away.
22528 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22529
22530 if (DstVT != TmpVT) {
22531 // NaN will become INDVAL, with the top bit set and the rest zero.
22532 // Truncation will discard the top bit, resulting in zero.
22533 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22534 }
22535
22536 SDValue Select = FpToInt;
22537 // For signed conversions where we saturate to the same size as the
22538 // result type of the fptoi instructions, INDVAL coincides with integer
22539 // minimum, so we don't need to explicitly check it.
22540 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22541 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22542 // MinInt if Src is NaN.
22543 Select = DAG.getSelectCC(
22544 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22545 }
22546
22547 // If Src OGT MaxFloat, select MaxInt.
22548 Select = DAG.getSelectCC(
22549 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22550
22551 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22552 // is already zero. The promoted case was already handled above.
22553 if (!IsSigned || DstVT != TmpVT) {
22554 return Select;
22555 }
22556
22557 // Otherwise, select 0 if Src is NaN.
22558 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22559 return DAG.getSelectCC(
22560 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22561}
22562
22563SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22564 bool IsStrict = Op->isStrictFPOpcode();
22565
22566 SDLoc DL(Op);
22567 MVT VT = Op.getSimpleValueType();
22568 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22569 MVT SVT = In.getSimpleValueType();
22570
22571 if (VT == MVT::f128)
22572 return SDValue();
22573
22574 if (VT == MVT::f80) {
22575 if (SVT == MVT::f16) {
22576 assert(Subtarget.hasFP16() && "Unexpected features!");
22577 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
22578 MakeLibCallOptions CallOptions;
22579 std::pair<SDValue, SDValue> Tmp =
22580 makeLibCall(DAG, LC, VT, In, CallOptions, DL,
22581 IsStrict ? Op.getOperand(0) : SDValue());
22582 if (IsStrict)
22583 return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
22584 else
22585 return Tmp.first;
22586 }
22587 return Op;
22588 }
22589
22590 if (SVT.getVectorElementType() == MVT::f16) {
22591 assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
22592 if (SVT == MVT::v2f16)
22593 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22594 DAG.getUNDEF(MVT::v2f16));
22595 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22596 DAG.getUNDEF(MVT::v4f16));
22597 if (IsStrict)
22598 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22599 {Op->getOperand(0), Res});
22600 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22601 }
22602
22603 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22604
22605 SDValue Res =
22606 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22607 if (IsStrict)
22608 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22609 {Op->getOperand(0), Res});
22610 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22611}
22612
22613SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22614 bool IsStrict = Op->isStrictFPOpcode();
22615 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22616 MVT VT = Op.getSimpleValueType();
22617 MVT SVT = In.getSimpleValueType();
22618
22619 // It's legal except when f128 is involved or we're converting f80->f16.
22620 if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
22621 return Op;
22622
22623 return SDValue();
22624}
22625
22626static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22627 bool IsStrict = Op->isStrictFPOpcode();
22628 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22629 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22630 "Unexpected VT!");
22631
22632 SDLoc dl(Op);
22633 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22634 DAG.getConstant(0, dl, MVT::v8i16), Src,
22635 DAG.getIntPtrConstant(0, dl));
22636
22637 SDValue Chain;
22638 if (IsStrict) {
22639 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22640 {Op.getOperand(0), Res});
22641 Chain = Res.getValue(1);
22642 } else {
22643 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22644 }
22645
22646 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22647 DAG.getIntPtrConstant(0, dl));
22648
22649 if (IsStrict)
22650 return DAG.getMergeValues({Res, Chain}, dl);
22651
22652 return Res;
22653}
22654
22655static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22656 bool IsStrict = Op->isStrictFPOpcode();
22657 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22658 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22659 "Unexpected VT!");
22660
22661 SDLoc dl(Op);
22662 SDValue Res, Chain;
22663 if (IsStrict) {
22664 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22665 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22666 DAG.getIntPtrConstant(0, dl));
22667 Res = DAG.getNode(
22668 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22669 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22670 Chain = Res.getValue(1);
22671 } else {
22672 // FIXME: Should we use zeros for upper elements for non-strict?
22673 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22674 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22675 DAG.getTargetConstant(4, dl, MVT::i32));
22676 }
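// Note: the immediate 4 (0b100) passed to (STRICT_)CVTPS2PH above sets bit 2
// of the vcvtps2ph rounding-control byte, which selects the current MXCSR
// rounding mode instead of a statically encoded one.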
22677
22678 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22679 DAG.getIntPtrConstant(0, dl));
22680
22681 if (IsStrict)
22682 return DAG.getMergeValues({Res, Chain}, dl);
22683
22684 return Res;
22685}
22686
22687/// Depending on uarch and/or optimizing for size, we might prefer to use a
22688/// vector operation in place of the typical scalar operation.
22689static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
22690 const X86Subtarget &Subtarget) {
22691 // If both operands have other uses, this is probably not profitable.
22692 SDValue LHS = Op.getOperand(0);
22693 SDValue RHS = Op.getOperand(1);
22694 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22695 return Op;
22696
22697 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22698 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22699 if (IsFP && !Subtarget.hasSSE3())
22700 return Op;
22701 if (!IsFP && !Subtarget.hasSSSE3())
22702 return Op;
22703
22704 // Extract from a common vector.
22705 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22706 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22707 LHS.getOperand(0) != RHS.getOperand(0) ||
22708 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22709 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22710 !shouldUseHorizontalOp(true, DAG, Subtarget))
22711 return Op;
22712
22713 // Allow commuted 'hadd' ops.
22714 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22715 unsigned HOpcode;
22716 switch (Op.getOpcode()) {
22717 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22718 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22719 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22720 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22721 default:
22722 llvm_unreachable("Trying to lower unsupported opcode to horizontal op")::llvm::llvm_unreachable_internal("Trying to lower unsupported opcode to horizontal op"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22722)
;
22723 }
22724 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22725 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22726 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22727 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22728 std::swap(LExtIndex, RExtIndex);
22729
22730 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22731 return Op;
22732
22733 SDValue X = LHS.getOperand(0);
22734 EVT VecVT = X.getValueType();
22735 unsigned BitWidth = VecVT.getSizeInBits();
22736 unsigned NumLanes = BitWidth / 128;
22737 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22738 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22739 "Not expecting illegal vector widths here");
22740
22741 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22742 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22743 SDLoc DL(Op);
22744 if (BitWidth == 256 || BitWidth == 512) {
22745 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22746 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22747 LExtIndex %= NumEltsPerLane;
22748 }
22749
22750 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22751 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22752 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22753 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22754 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22755 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22756 DAG.getIntPtrConstant(LExtIndex / 2, DL));
22757}
22758
22759/// Depending on uarch and/or optimizing for size, we might prefer to use a
22760/// vector operation in place of the typical scalar operation.
22761SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22762 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22763 "Only expecting float/double");
22764 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
22765}
22766
22767/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22768/// This mode isn't supported in hardware on X86. But as long as we aren't
22769/// compiling with trapping math, we can emulate this with
22770/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22771static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22772 SDValue N0 = Op.getOperand(0);
22773 SDLoc dl(Op);
22774 MVT VT = Op.getSimpleValueType();
22775
22776 // N0 += copysign(nextafter(0.5, 0.0), N0)
22777 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22778 bool Ignored;
22779 APFloat Point5Pred = APFloat(0.5f);
22780 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22781 Point5Pred.next(/*nextDown*/true);
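// Why the predecessor of 0.5 rather than 0.5 itself (assuming the default
// round-to-nearest-even mode): for f32, Point5Pred = 0.5 - 2^-25. With
// N0 = 0.5 - 2^-25 (the largest float below 0.5), N0 + Point5Pred =
// 1.0 - 2^-24 exactly, which truncates to 0.0 as required, whereas
// N0 + 0.5 = 1.0 - 2^-25 would round up to 1.0 and truncate to 1.0. For
// N0 = 0.5 itself, 0.5 + Point5Pred rounds up to 1.0, giving the expected
// ties-away-from-zero result.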
22782
22783 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22784 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22785 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22786
22787 // Truncate the result to remove fraction.
22788 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22789}
22790
22791/// The only differences between FABS and FNEG are the mask and the logic op.
22792/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22793static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22794 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22795 "Wrong opcode for lowering FABS or FNEG.");
22796
22797 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22798
22799 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22800 // into an FNABS. We'll lower the FABS after that if it is still in use.
22801 if (IsFABS)
22802 for (SDNode *User : Op->uses())
22803 if (User->getOpcode() == ISD::FNEG)
22804 return Op;
22805
22806 SDLoc dl(Op);
22807 MVT VT = Op.getSimpleValueType();
22808
22809 bool IsF128 = (VT == MVT::f128);
22810 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22811 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22812 "Unexpected type in LowerFABSorFNEG");
22813
22814 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22815 // decide if we should generate a 16-byte constant mask when we only need 4 or
22816 // 8 bytes for the scalar case.
22817
22818 // There are no scalar bitwise logical SSE/AVX instructions, so we
22819 // generate a 16-byte vector constant and logic op even for the scalar case.
22820 // Using a 16-byte mask allows folding the load of the mask with
22821 // the logic op, so it can save (~4 bytes) on code size.
22822 bool IsFakeVector = !VT.isVector() && !IsF128;
22823 MVT LogicVT = VT;
22824 if (IsFakeVector)
22825 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22826 : (VT == MVT::f32) ? MVT::v4f32
22827 : MVT::v8f16;
22828
22829 unsigned EltBits = VT.getScalarSizeInBits();
22830 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22831 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22832 APInt::getSignMask(EltBits);
22833 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22834 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22835
22836 SDValue Op0 = Op.getOperand(0);
22837 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22838 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22839 IsFNABS ? X86ISD::FOR :
22840 X86ISD::FXOR;
22841 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
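// Concretely, for f32 this becomes: FABS -> FAND with a 0x7fffffff splat,
// FNEG -> FXOR with a 0x80000000 splat, and FNEG(FABS(x)) -> FOR with a
// 0x80000000 splat (an FNABS).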
22842
22843 if (VT.isVector() || IsF128)
22844 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22845
22846 // For the scalar case extend to a 128-bit vector, perform the logic op,
22847 // and extract the scalar result back out.
22848 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22849 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22850 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22851 DAG.getIntPtrConstant(0, dl));
22852}
22853
22854static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22855 SDValue Mag = Op.getOperand(0);
22856 SDValue Sign = Op.getOperand(1);
22857 SDLoc dl(Op);
22858
22859 // If the sign operand is smaller, extend it first.
22860 MVT VT = Op.getSimpleValueType();
22861 if (Sign.getSimpleValueType().bitsLT(VT))
22862 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22863
22864 // And if it is bigger, shrink it first.
22865 if (Sign.getSimpleValueType().bitsGT(VT))
22866 Sign =
22867 DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22868
22869 // At this point the operands and the result should have the same
22870 // type, and that won't be f80 since that is not custom lowered.
22871 bool IsF128 = (VT == MVT::f128);
22872 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22873 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22874 "Unexpected type in LowerFCOPYSIGN");
22875
22876 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22877
22878 // Perform all scalar logic operations as 16-byte vectors because there are no
22879 // scalar FP logic instructions in SSE.
22880 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22881 // unnecessary splats, but we might miss load folding opportunities. Should
22882 // this decision be based on OptimizeForSize?
22883 bool IsFakeVector = !VT.isVector() && !IsF128;
22884 MVT LogicVT = VT;
22885 if (IsFakeVector)
22886 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22887 : (VT == MVT::f32) ? MVT::v4f32
22888 : MVT::v8f16;
22889
22890 // The mask constants are automatically splatted for vector types.
22891 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22892 SDValue SignMask = DAG.getConstantFP(
22893 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22894 SDValue MagMask = DAG.getConstantFP(
22895 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
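// In scalar terms this computes copysign for f32 as
// (Mag & 0x7fffffff) | (Sign & 0x80000000), with both masks splatted across
// the 16-byte logic type.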
22896
22897 // First, clear all bits but the sign bit from the second operand (sign).
22898 if (IsFakeVector)
22899 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22900 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22901
22902 // Next, clear the sign bit from the first operand (magnitude).
22903 // TODO: If we had general constant folding for FP logic ops, this check
22904 // wouldn't be necessary.
22905 SDValue MagBits;
22906 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22907 APFloat APF = Op0CN->getValueAPF();
22908 APF.clearSign();
22909 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22910 } else {
22911 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22912 if (IsFakeVector)
22913 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22914 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22915 }
22916
22917 // OR the magnitude value with the sign bit.
22918 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22919 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22920 DAG.getIntPtrConstant(0, dl));
22921}
22922
22923static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22924 SDValue N0 = Op.getOperand(0);
22925 SDLoc dl(Op);
22926 MVT VT = Op.getSimpleValueType();
22927
22928 MVT OpVT = N0.getSimpleValueType();
22929 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22930 "Unexpected type for FGETSIGN");
22931
22932 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22933 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22934 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22935 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22936 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22937 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22938 return Res;
22939}
22940
22941/// Helper for attempting to create a X86ISD::BT node.
22942static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22943 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22944 // instruction. Since the shift amount is in-range-or-undefined, we know
22945 // that doing a bittest on the i32 value is ok. We extend to i32 because
22946 // the encoding for the i16 version is larger than the i32 version.
22947 // Also promote i16 to i32 for performance / code size reason.
22948 if (Src.getValueType().getScalarSizeInBits() < 32)
22949 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22950
22951 // No legal type found, give up.
22952 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22953 return SDValue();
22954
22955 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22956 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22957 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22958 // known to be zero.
22959 if (Src.getValueType() == MVT::i64 &&
22960 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22961 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22962
22963 // If the operand types disagree, extend the shift amount to match. Since
22964 // BT ignores high bits (like shifts) we can use anyextend.
22965 if (Src.getValueType() != BitNo.getValueType())
22966 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22967
22968 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22969}
22970
22971/// Helper for creating a X86ISD::SETCC node.
22972static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22973 SelectionDAG &DAG) {
22974 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22975 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22976}
22977
22978/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22979/// style scalarized (associative) reduction patterns. Partial reductions
22980/// are supported when the pointer SrcMask is non-null.
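/// For example, (or (extractelt X, 0), (or (extractelt X, 1),
/// (or (extractelt X, 2), (extractelt X, 3)))) over a 4-element vector X is
/// matched with SrcOps = {X} and, when SrcMask is supplied, a mask of 0b1111.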
22981/// TODO - move this to SelectionDAG?
22982static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22983 SmallVectorImpl<SDValue> &SrcOps,
22984 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22985 SmallVector<SDValue, 8> Opnds;
22986 DenseMap<SDValue, APInt> SrcOpMap;
22987 EVT VT = MVT::Other;
22988
22989 // Recognize a special case where a vector is cast into a wide integer to
22990 // test all 0s.
22991 assert(Op.getOpcode() == unsigned(BinOp) &&
22992 "Unexpected bit reduction opcode");
22993 Opnds.push_back(Op.getOperand(0));
22994 Opnds.push_back(Op.getOperand(1));
22995
22996 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22997 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22998 // BFS traverse all BinOp operands.
22999 if (I->getOpcode() == unsigned(BinOp)) {
23000 Opnds.push_back(I->getOperand(0));
23001 Opnds.push_back(I->getOperand(1));
23002 // Re-evaluate the number of nodes to be traversed.
23003 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23004 continue;
23005 }
23006
23007 // Quit if a non-EXTRACT_VECTOR_ELT
23008 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23009 return false;
23010
23011 // Quit if without a constant index.
23012 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23013 if (!Idx)
23014 return false;
23015
23016 SDValue Src = I->getOperand(0);
23017 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23018 if (M == SrcOpMap.end()) {
23019 VT = Src.getValueType();
23020 // Quit if not the same type.
23021 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23022 return false;
23023 unsigned NumElts = VT.getVectorNumElements();
23024 APInt EltCount = APInt::getZero(NumElts);
23025 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23026 SrcOps.push_back(Src);
23027 }
23028
23029 // Quit if element already used.
23030 unsigned CIdx = Idx->getZExtValue();
23031 if (M->second[CIdx])
23032 return false;
23033 M->second.setBit(CIdx);
23034 }
23035
23036 if (SrcMask) {
23037 // Collect the source partial masks.
23038 for (SDValue &SrcOp : SrcOps)
23039 SrcMask->push_back(SrcOpMap[SrcOp]);
23040 } else {
23041 // Quit if not all elements are used.
23042 for (const auto &I : SrcOpMap)
23043 if (!I.second.isAllOnes())
23044 return false;
23045 }
23046
23047 return true;
23048}
23049
23050// Helper function for comparing all bits of a vector against zero.
23051static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
23052 const APInt &Mask,
23053 const X86Subtarget &Subtarget,
23054 SelectionDAG &DAG, X86::CondCode &X86CC) {
23055 EVT VT = V.getValueType();
23056 unsigned ScalarSize = VT.getScalarSizeInBits();
23057 if (Mask.getBitWidth() != ScalarSize) {
23058 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23059 return SDValue();
23060 }
23061
23062 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23063 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23064
23065 auto MaskBits = [&](SDValue Src) {
23066 if (Mask.isAllOnes())
23067 return Src;
23068 EVT SrcVT = Src.getValueType();
23069 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23070 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23071 };
23072
23073 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23074 if (VT.getSizeInBits() < 128) {
23075 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23076 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
23077 return SDValue();
23078 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23079 DAG.getBitcast(IntVT, MaskBits(V)),
23080 DAG.getConstant(0, DL, IntVT));
23081 }
23082
23083 // Quit if not splittable to 128/256-bit vector.
23084 if (!isPowerOf2_32(VT.getSizeInBits()))
23085 return SDValue();
23086
23087 // Split down to 128/256-bit vector.
23088 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
23089 while (VT.getSizeInBits() > TestSize) {
23090 auto Split = DAG.SplitVector(V, DL);
23091 VT = Split.first.getValueType();
23092 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23093 }
23094
23095 bool UsePTEST = Subtarget.hasSSE41();
23096 if (UsePTEST) {
23097 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
23098 V = DAG.getBitcast(TestVT, MaskBits(V));
23099 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23100 }
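// PTEST with both operands equal sets ZF precisely when all 128/256 bits
// are zero, so the (optional) masking AND plus a single ptest suffices here.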
23101
23102 // Without PTEST, a masked v2i64 or-reduction is not faster than
23103 // scalarization.
23104 if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
23105 return SDValue();
23106
23107 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
23108 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
23109 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
23110 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
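// PMOVMSKB collects the 16 byte sign bits, so the value equals 0xFFFF
// exactly when every byte of V compared equal to zero above.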
23111 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23112 DAG.getConstant(0xFFFF, DL, MVT::i32));
23113}
23114
23115 // Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
23116// CMP(MOVMSK(PCMPEQB(X,0))).
23117static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
23118 const SDLoc &DL,
23119 const X86Subtarget &Subtarget,
23120 SelectionDAG &DAG, SDValue &X86CC) {
23121 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23122
23123 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23124 return SDValue();
23125
23126 // Check whether we're masking/truncating an OR-reduction result, in which
23127 // case track the masked bits.
23128 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23129 switch (Op.getOpcode()) {
23130 case ISD::TRUNCATE: {
23131 SDValue Src = Op.getOperand(0);
23132 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23133 Op.getScalarValueSizeInBits());
23134 Op = Src;
23135 break;
23136 }
23137 case ISD::AND: {
23138 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23139 Mask = Cst->getAPIntValue();
23140 Op = Op.getOperand(0);
23141 }
23142 break;
23143 }
23144 }
23145
23146 SmallVector<SDValue, 8> VecIns;
23147 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
23148 EVT VT = VecIns[0].getValueType();
23149 assert(llvm::all_of(VecIns,
23150 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23151 "Reduction source vector mismatch");
23152
23153 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
23154 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
23155 return SDValue();
23156
23157 // If more than one full vector is evaluated, OR them first before PTEST.
23158 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23159 Slot += 2, e += 1) {
23160 // Each iteration will OR 2 nodes and append the result until there is
23161 // only 1 node left, i.e. the final OR'd value of all vectors.
23162 SDValue LHS = VecIns[Slot];
23163 SDValue RHS = VecIns[Slot + 1];
23164 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
23165 }
23166
23167 X86::CondCode CCode;
23168 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
23169 DAG, CCode)) {
23170 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23171 return V;
23172 }
23173 }
23174
23175 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23176 ISD::NodeType BinOp;
23177 if (SDValue Match =
23178 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
23179 X86::CondCode CCode;
23180 if (SDValue V =
23181 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
23182 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23183 return V;
23184 }
23185 }
23186 }
23187
23188 return SDValue();
23189}
23190
23191 /// Return true if \c Op has a use that doesn't just read flags.
23192static bool hasNonFlagsUse(SDValue Op) {
23193 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
23194 ++UI) {
23195 SDNode *User = *UI;
23196 unsigned UOpNo = UI.getOperandNo();
23197 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23198 // Look past the truncate.
23199 UOpNo = User->use_begin().getOperandNo();
23200 User = *User->use_begin();
23201 }
23202
23203 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23204 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23205 return true;
23206 }
23207 return false;
23208}
23209
23210// Transform to an x86-specific ALU node with flags if there is a chance of
23211// using an RMW op or only the flags are used. Otherwise, leave
23212// the node alone and emit a 'cmp' or 'test' instruction.
23213static bool isProfitableToUseFlagOp(SDValue Op) {
23214 for (SDNode *U : Op->uses())
23215 if (U->getOpcode() != ISD::CopyToReg &&
23216 U->getOpcode() != ISD::SETCC &&
23217 U->getOpcode() != ISD::STORE)
23218 return false;
23219
23220 return true;
23221}
23222
23223/// Emit nodes that will be selected as "test Op0,Op0", or something
23224/// equivalent.
23225static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23226 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23227 // CF and OF aren't always set the way we want. Determine which
23228 // of these we need.
23229 bool NeedCF = false;
23230 bool NeedOF = false;
23231 switch (X86CC) {
23232 default: break;
23233 case X86::COND_A: case X86::COND_AE:
23234 case X86::COND_B: case X86::COND_BE:
23235 NeedCF = true;
23236 break;
23237 case X86::COND_G: case X86::COND_GE:
23238 case X86::COND_L: case X86::COND_LE:
23239 case X86::COND_O: case X86::COND_NO: {
23240 // Check if we really need to set the
23241       // Overflow flag. If NoSignedWrap is present,
23242       // it is not actually needed.
23243 switch (Op->getOpcode()) {
23244 case ISD::ADD:
23245 case ISD::SUB:
23246 case ISD::MUL:
23247 case ISD::SHL:
23248 if (Op.getNode()->getFlags().hasNoSignedWrap())
23249 break;
23250       LLVM_FALLTHROUGH;
23251 default:
23252 NeedOF = true;
23253 break;
23254 }
23255 break;
23256 }
23257 }
23258 // See if we can use the EFLAGS value from the operand instead of
23259 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23260 // we prove that the arithmetic won't overflow, we can't use OF or CF.
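  // For example, a caller asking for X86::COND_B wants the CF of "Op compared
  // with zero"; the CF/OF produced by the ADD/SUB that computed Op describe
  // that arithmetic, not the comparison, so in the NeedCF/NeedOF case we fall
  // back to the explicit CMP-with-zero below.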
23261 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23262 // Emit a CMP with 0, which is the TEST pattern.
23263 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23264 DAG.getConstant(0, dl, Op.getValueType()));
23265 }
23266 unsigned Opcode = 0;
23267 unsigned NumOperands = 0;
23268
23269 SDValue ArithOp = Op;
23270
23271 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23272 // which may be the result of a CAST. We use the variable 'Op', which is the
23273 // non-casted variable when we check for possible users.
23274 switch (ArithOp.getOpcode()) {
23275 case ISD::AND:
23276 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23277 // because a TEST instruction will be better.
23278 if (!hasNonFlagsUse(Op))
23279 break;
23280
23281     LLVM_FALLTHROUGH;
23282 case ISD::ADD:
23283 case ISD::SUB:
23284 case ISD::OR:
23285 case ISD::XOR:
23286 if (!isProfitableToUseFlagOp(Op))
23287 break;
23288
23289 // Otherwise use a regular EFLAGS-setting instruction.
23290 switch (ArithOp.getOpcode()) {
23291     default: llvm_unreachable("unexpected operator!");
23292 case ISD::ADD: Opcode = X86ISD::ADD; break;
23293 case ISD::SUB: Opcode = X86ISD::SUB; break;
23294 case ISD::XOR: Opcode = X86ISD::XOR; break;
23295 case ISD::AND: Opcode = X86ISD::AND; break;
23296 case ISD::OR: Opcode = X86ISD::OR; break;
23297 }
23298
23299 NumOperands = 2;
23300 break;
23301 case X86ISD::ADD:
23302 case X86ISD::SUB:
23303 case X86ISD::OR:
23304 case X86ISD::XOR:
23305 case X86ISD::AND:
23306 return SDValue(Op.getNode(), 1);
23307 case ISD::SSUBO:
23308 case ISD::USUBO: {
23309     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23310 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23311 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23312 Op->getOperand(1)).getValue(1);
23313 }
23314 default:
23315 break;
23316 }
23317
23318 if (Opcode == 0) {
23319 // Emit a CMP with 0, which is the TEST pattern.
23320 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23321 DAG.getConstant(0, dl, Op.getValueType()));
23322 }
23323 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23324 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23325
23326 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23327 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23328 return SDValue(New.getNode(), 1);
23329}
23330
23331/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23332/// equivalent.
23333static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23334 const SDLoc &dl, SelectionDAG &DAG,
23335 const X86Subtarget &Subtarget) {
23336 if (isNullConstant(Op1))
23337 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23338
23339 EVT CmpVT = Op0.getValueType();
23340
23341   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23342           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23343
23344 // Only promote the compare up to I32 if it is a 16 bit operation
23345 // with an immediate. 16 bit immediates are to be avoided.
23346 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
23347 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23348 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
23349 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
23350 // Don't do this if the immediate can fit in 8-bits.
23351 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23352 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23353 unsigned ExtendOp =
23354 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23355 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23356         // For equality comparisons, try to use SIGN_EXTEND if the input was
23357         // truncated from something with enough sign bits.
23358 if (Op0.getOpcode() == ISD::TRUNCATE) {
23359 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23360 ExtendOp = ISD::SIGN_EXTEND;
23361 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23362 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23363 ExtendOp = ISD::SIGN_EXTEND;
23364 }
23365 }
23366
23367 CmpVT = MVT::i32;
23368 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23369 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23370 }
23371 }
23372
23373 // Try to shrink i64 compares if the input has enough zero bits.
23374 // FIXME: Do this for non-constant compares for constant on LHS?
23375 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
23376 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23377 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
23378 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23379 CmpVT = MVT::i32;
23380 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23381 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23382 }
23383
23384 // 0-x == y --> x+y == 0
23385 // 0-x != y --> x+y != 0
23386 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23387 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23388 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23389 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23390 return Add.getValue(1);
23391 }
23392
23393 // x == 0-y --> x+y == 0
23394 // x != 0-y --> x+y != 0
23395 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23396 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23397 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23398 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23399 return Add.getValue(1);
23400 }
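  // Both folds rely on two's-complement wraparound preserving equality: for
  // n-bit values, (0 - x) == y exactly when x + y == 0 (mod 2^n), e.g. for i8
  // x = 5, y = -5 we have 0 - 5 == -5 and 5 + (-5) == 0. The X86ISD::ADD's
  // second result is the EFLAGS value consumed by the E/NE condition.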
23401
23402 // Use SUB instead of CMP to enable CSE between SUB and CMP.
23403 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23404 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23405 return Sub.getValue(1);
23406}
23407
23408/// Check if replacement of SQRT with RSQRT should be disabled.
23409bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23410 EVT VT = Op.getValueType();
23411
23412 // We don't need to replace SQRT with RSQRT for half type.
23413 if (VT.getScalarType() == MVT::f16)
23414 return true;
23415
23416 // We never want to use both SQRT and RSQRT instructions for the same input.
23417 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23418 return false;
23419
23420 if (VT.isVector())
23421 return Subtarget.hasFastVectorFSQRT();
23422 return Subtarget.hasFastScalarFSQRT();
23423}
23424
23425/// The minimum architected relative accuracy is 2^-12. We need one
23426/// Newton-Raphson step to have a good float result (24 bits of precision).
23427SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23428 SelectionDAG &DAG, int Enabled,
23429 int &RefinementSteps,
23430 bool &UseOneConstNR,
23431 bool Reciprocal) const {
23432 SDLoc DL(Op);
23433 EVT VT = Op.getValueType();
23434
23435 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23436 // It is likely not profitable to do this for f64 because a double-precision
23437 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23438 // instructions: convert to single, rsqrtss, convert back to double, refine
23439 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23440 // along with FMA, this could be a throughput win.
23441 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23442 // after legalize types.
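  // As a reminder of what one refinement step buys: starting from an estimate
  // e ~= 1/sqrt(a) with roughly 12 correct bits, a single Newton-Raphson
  // iteration
  //   e' = e * (1.5 - 0.5 * a * e * e)
  // about doubles the number of correct bits, which is why RefinementSteps
  // defaults to 1 for the f32 cases below.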
23443 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23444 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23445 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23446 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23447 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23448 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23449 RefinementSteps = 1;
23450
23451 UseOneConstNR = false;
23452     // There is no FRSQRT for 512-bits, but there is RSQRT14.
23453 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23454 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23455 if (RefinementSteps == 0 && !Reciprocal)
23456 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23457 return Estimate;
23458 }
23459
23460 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23461 Subtarget.hasFP16()) {
23462     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23463 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23464 RefinementSteps = 0;
23465
23466 if (VT == MVT::f16) {
23467 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23468 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23469 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23470 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23471 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23472 }
23473
23474 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23475 }
23476 return SDValue();
23477}
23478
23479/// The minimum architected relative accuracy is 2^-12. We need one
23480/// Newton-Raphson step to have a good float result (24 bits of precision).
23481SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23482 int Enabled,
23483 int &RefinementSteps) const {
23484 SDLoc DL(Op);
23485 EVT VT = Op.getValueType();
23486
23487 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23488 // It is likely not profitable to do this for f64 because a double-precision
23489 // reciprocal estimate with refinement on x86 prior to FMA requires
23490 // 15 instructions: convert to single, rcpss, convert back to double, refine
23491 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23492 // along with FMA, this could be a throughput win.
23493
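  // Likewise, one Newton-Raphson step for a reciprocal estimate e ~= 1/a is
  //   e' = e * (2 - a * e)
  // which roughly doubles the number of correct bits, so a single step on top
  // of the ~12-bit hardware estimate suffices for f32.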
23494 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23495 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23496 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23497 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23498 // Enable estimate codegen with 1 refinement step for vector division.
23499 // Scalar division estimates are disabled because they break too much
23500 // real-world code. These defaults are intended to match GCC behavior.
23501 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23502 return SDValue();
23503
23504 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23505 RefinementSteps = 1;
23506
23507     // There is no FRCP for 512-bits, but there is RCP14.
23508 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23509 return DAG.getNode(Opcode, DL, VT, Op);
23510 }
23511
23512 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23513 Subtarget.hasFP16()) {
23514 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23515 RefinementSteps = 0;
23516
23517 if (VT == MVT::f16) {
23518 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23519 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23520 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23521 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23522 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23523 }
23524
23525 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23526 }
23527 return SDValue();
23528}
23529
23530/// If we have at least two divisions that use the same divisor, convert to
23531/// multiplication by a reciprocal. This may need to be adjusted for a given
23532/// CPU if a division's cost is not at least twice the cost of a multiplication.
23533/// This is because we still need one division to calculate the reciprocal and
23534/// then we need two multiplies by that reciprocal as replacements for the
23535/// original divisions.
23536unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23537 return 2;
23538}
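// For instance, "a / d; b / d" can be rewritten as
//   r = 1.0 / d;  a * r;  b * r;
// i.e. one divide plus two multiplies instead of two divides, which pays off
// once a divide costs at least twice as much as a multiply.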
23539
23540SDValue
23541X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23542 SelectionDAG &DAG,
23543 SmallVectorImpl<SDNode *> &Created) const {
23544 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23545 if (isIntDivCheap(N->getValueType(0), Attr))
23546 return SDValue(N,0); // Lower SDIV as SDIV
23547
23548   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23549          "Unexpected divisor!");
23550
23551   // Only perform this transform if CMOV is supported; otherwise the select
23552 // below will become a branch.
23553 if (!Subtarget.canUseCMOV())
23554 return SDValue();
23555
23556 // fold (sdiv X, pow2)
23557 EVT VT = N->getValueType(0);
23558 // FIXME: Support i8.
23559 if (VT != MVT::i16 && VT != MVT::i32 &&
23560 !(Subtarget.is64Bit() && VT == MVT::i64))
23561 return SDValue();
23562
23563 unsigned Lg2 = Divisor.countTrailingZeros();
23564
23565 // If the divisor is 2 or -2, the default expansion is better.
23566 if (Lg2 == 1)
23567 return SDValue();
23568
23569 SDLoc DL(N);
23570 SDValue N0 = N->getOperand(0);
23571 SDValue Zero = DAG.getConstant(0, DL, VT);
23572 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
23573 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
23574
23575 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
23576 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
23577 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
23578 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
23579
23580 Created.push_back(Cmp.getNode());
23581 Created.push_back(Add.getNode());
23582 Created.push_back(CMov.getNode());
23583
23584 // Divide by pow2.
23585 SDValue SRA =
23586 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
23587
23588 // If we're dividing by a positive value, we're done. Otherwise, we must
23589 // negate the result.
23590 if (Divisor.isNonNegative())
23591 return SRA;
23592
23593 Created.push_back(SRA.getNode());
23594 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
23595}
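// Worked example (sdiv by 8, i.e. Lg2 == 3): for a negative dividend such as
// x = -10 the sequence computes (-10 + 7) >> 3 = -3 >> 3 = -1, matching C's
// truncating division -10 / 8 == -1; for non-negative x the CMOV keeps x and
// the arithmetic shift alone is already exact.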
23596
23597/// Result of 'and' is compared against zero. Change to a BT node if possible.
23598/// Returns the BT node and the condition code needed to use it.
23599static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23600 SelectionDAG &DAG, X86::CondCode &X86CC) {
23601   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23602 SDValue Op0 = And.getOperand(0);
23603 SDValue Op1 = And.getOperand(1);
23604 if (Op0.getOpcode() == ISD::TRUNCATE)
23605 Op0 = Op0.getOperand(0);
23606 if (Op1.getOpcode() == ISD::TRUNCATE)
23607 Op1 = Op1.getOperand(0);
23608
23609 SDValue Src, BitNo;
23610 if (Op1.getOpcode() == ISD::SHL)
23611 std::swap(Op0, Op1);
23612 if (Op0.getOpcode() == ISD::SHL) {
23613 if (isOneConstant(Op0.getOperand(0))) {
23614 // If we looked past a truncate, check that it's only truncating away
23615 // known zeros.
23616 unsigned BitWidth = Op0.getValueSizeInBits();
23617 unsigned AndBitWidth = And.getValueSizeInBits();
23618 if (BitWidth > AndBitWidth) {
23619 KnownBits Known = DAG.computeKnownBits(Op0);
23620 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23621 return SDValue();
23622 }
23623 Src = Op1;
23624 BitNo = Op0.getOperand(1);
23625 }
23626 } else if (Op1.getOpcode() == ISD::Constant) {
23627 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23628 uint64_t AndRHSVal = AndRHS->getZExtValue();
23629 SDValue AndLHS = Op0;
23630
23631 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23632 Src = AndLHS.getOperand(0);
23633 BitNo = AndLHS.getOperand(1);
23634 } else {
23635 // Use BT if the immediate can't be encoded in a TEST instruction or we
23636       // are optimizing for size and the immediate won't fit in a byte.
23637 bool OptForSize = DAG.shouldOptForSize();
23638 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23639 isPowerOf2_64(AndRHSVal)) {
23640 Src = AndLHS;
23641 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23642 Src.getValueType());
23643 }
23644 }
23645 }
23646
23647 // No patterns found, give up.
23648 if (!Src.getNode())
23649 return SDValue();
23650
23651 // Remove any bit flip.
23652 if (isBitwiseNot(Src)) {
23653 Src = Src.getOperand(0);
23654 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23655 }
23656
23657 // Attempt to create the X86ISD::BT node.
23658 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23659 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23660 return BT;
23661 }
23662
23663 return SDValue();
23664}
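// E.g. "(x & (1 << n)) != 0" becomes BT(x, n) read through X86::COND_B
// (CF holds the tested bit), and the "== 0" form uses X86::COND_AE instead;
// the "(x >> n) & 1" shapes matched above lower the same way.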
23665
23666// Check if pre-AVX condcode can be performed by a single FCMP op.
23667static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23668 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23669}
23670
23671/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23672/// CMPs.
23673static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23674 SDValue &Op1, bool &IsAlwaysSignaling) {
23675 unsigned SSECC;
23676 bool Swap = false;
23677
23678 // SSE Condition code mapping:
23679 // 0 - EQ
23680 // 1 - LT
23681 // 2 - LE
23682 // 3 - UNORD
23683 // 4 - NEQ
23684 // 5 - NLT
23685 // 6 - NLE
23686 // 7 - ORD
23687 switch (SetCCOpcode) {
23688   default: llvm_unreachable("Unexpected SETCC condition");
23689 case ISD::SETOEQ:
23690 case ISD::SETEQ: SSECC = 0; break;
23691 case ISD::SETOGT:
23692   case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
23693 case ISD::SETLT:
23694 case ISD::SETOLT: SSECC = 1; break;
23695 case ISD::SETOGE:
23696   case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
23697 case ISD::SETLE:
23698 case ISD::SETOLE: SSECC = 2; break;
23699 case ISD::SETUO: SSECC = 3; break;
23700 case ISD::SETUNE:
23701 case ISD::SETNE: SSECC = 4; break;
23702   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
23703 case ISD::SETUGE: SSECC = 5; break;
23704   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
23705 case ISD::SETUGT: SSECC = 6; break;
23706 case ISD::SETO: SSECC = 7; break;
23707 case ISD::SETUEQ: SSECC = 8; break;
23708 case ISD::SETONE: SSECC = 12; break;
23709 }
23710 if (Swap)
23711 std::swap(Op0, Op1);
23712
23713 switch (SetCCOpcode) {
23714 default:
23715 IsAlwaysSignaling = true;
23716 break;
23717 case ISD::SETEQ:
23718 case ISD::SETOEQ:
23719 case ISD::SETUEQ:
23720 case ISD::SETNE:
23721 case ISD::SETONE:
23722 case ISD::SETUNE:
23723 case ISD::SETO:
23724 case ISD::SETUO:
23725 IsAlwaysSignaling = false;
23726 break;
23727 }
23728
23729 return SSECC;
23730}
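// E.g. SETOGT is handled by swapping the operands and using predicate 1 (LT),
// since "a > b" is "b < a"; SETUEQ and SETONE map to the extended predicates
// 8 (EQ_UQ) and 12 (NEQ_OQ), which only exist in the AVX VCMP encoding - this
// is exactly what cheapX86FSETCC_SSE() screens out for pre-AVX targets.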
23731
23732 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23733 /// concatenate the result back.
23734static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23735 ISD::CondCode Cond, SelectionDAG &DAG,
23736 const SDLoc &dl) {
23737   assert(VT.isInteger() && VT == LHS.getValueType() &&
23738          VT == RHS.getValueType() && "Unsupported VTs!");
23739
23740 SDValue CC = DAG.getCondCode(Cond);
23741
23742 // Extract the LHS Lo/Hi vectors
23743 SDValue LHS1, LHS2;
23744 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23745
23746 // Extract the RHS Lo/Hi vectors
23747 SDValue RHS1, RHS2;
23748 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23749
23750 // Issue the operation on the smaller types and concatenate the result back
23751 EVT LoVT, HiVT;
23752 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23753 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23754 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23755 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23756}
23757
23758static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23759
23760 SDValue Op0 = Op.getOperand(0);
23761 SDValue Op1 = Op.getOperand(1);
23762 SDValue CC = Op.getOperand(2);
23763 MVT VT = Op.getSimpleValueType();
23764 SDLoc dl(Op);
23765
23766   assert(VT.getVectorElementType() == MVT::i1 &&
23767          "Cannot set masked compare for this operation");
23768
23769 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23770
23771 // Prefer SETGT over SETLT.
23772 if (SetCCOpcode == ISD::SETLT) {
23773 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23774 std::swap(Op0, Op1);
23775 }
23776
23777 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23778}
23779
23780/// Given a buildvector constant, return a new vector constant with each element
23781/// incremented or decremented. If incrementing or decrementing would result in
23782/// unsigned overflow or underflow or this is not a simple vector constant,
23783/// return an empty value.
23784static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
23785 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23786 if (!BV)
23787 return SDValue();
23788
23789 MVT VT = V.getSimpleValueType();
23790 MVT EltVT = VT.getVectorElementType();
23791 unsigned NumElts = VT.getVectorNumElements();
23792 SmallVector<SDValue, 8> NewVecC;
23793 SDLoc DL(V);
23794 for (unsigned i = 0; i < NumElts; ++i) {
23795 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23796 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23797 return SDValue();
23798
23799 // Avoid overflow/underflow.
23800 const APInt &EltC = Elt->getAPIntValue();
23801 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23802 return SDValue();
23803
23804 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23805 }
23806
23807 return DAG.getBuildVector(VT, DL, NewVecC);
23808}
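// Usage sketch: for V = <4 x i32> <7, 7, 7, 7>, IsInc == false yields
// <6, 6, 6, 6>, while a splat of 0 returns an empty SDValue because the
// decrement would wrap. Callers below use this to rewrite e.g. "x <u C" as
// "x <=u C-1" without changing the comparison result.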
23809
23810/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23811/// Op0 u<= Op1:
23812/// t = psubus Op0, Op1
23813/// pcmpeq t, <0..0>
23814static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23815 ISD::CondCode Cond, const SDLoc &dl,
23816 const X86Subtarget &Subtarget,
23817 SelectionDAG &DAG) {
23818 if (!Subtarget.hasSSE2())
23819 return SDValue();
23820
23821 MVT VET = VT.getVectorElementType();
23822 if (VET != MVT::i8 && VET != MVT::i16)
23823 return SDValue();
23824
23825 switch (Cond) {
23826 default:
23827 return SDValue();
23828 case ISD::SETULT: {
23829     // If the comparison is against a constant, we can turn this into a
23830     // setule. With psubus, setule does not require a swap. This is
23831     // beneficial because the constant in the register is no longer
23832     // clobbered as the destination, so it can be hoisted out of a loop.
23833     // Only do this pre-AVX, since vpcmp* is no longer destructive.
23834 if (Subtarget.hasAVX())
23835 return SDValue();
23836 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
23837 if (!ULEOp1)
23838 return SDValue();
23839 Op1 = ULEOp1;
23840 break;
23841 }
23842 case ISD::SETUGT: {
23843 // If the comparison is against a constant, we can turn this into a setuge.
23844 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23845 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23846 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23847 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23848 if (!UGEOp1)
23849 return SDValue();
23850 Op1 = Op0;
23851 Op0 = UGEOp1;
23852 break;
23853 }
23854 // Psubus is better than flip-sign because it requires no inversion.
23855 case ISD::SETUGE:
23856 std::swap(Op0, Op1);
23857 break;
23858 case ISD::SETULE:
23859 break;
23860 }
23861
23862 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23863 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23864 DAG.getConstant(0, dl, VT));
23865}
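// The identity being used: for unsigned values, usubsat(a, b) == 0 exactly
// when a <=u b, since the subtraction saturates to zero whenever it would
// underflow. The SETULT/SETUGT cases above first nudge the constant operand
// (via incDecVectorConstant) so they can reuse this same <=u / >=u form.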
23866
23867static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23868 SelectionDAG &DAG) {
23869 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23870 Op.getOpcode() == ISD::STRICT_FSETCCS;
23871 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23872 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23873 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23874 MVT VT = Op->getSimpleValueType(0);
23875 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23876 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23877 SDLoc dl(Op);
23878
23879 if (isFP) {
23880#ifndef NDEBUG
23881 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23882     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23883#endif
23884
23885 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23886 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23887
23888 // If we have a strict compare with a vXi1 result and the input is 128/256
23889 // bits we can't use a masked compare unless we have VLX. If we use a wider
23890 // compare like we do for non-strict, we might trigger spurious exceptions
23891     // from the upper elements. Instead emit an AVX compare and convert to mask.
23892 unsigned Opc;
23893 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23894 (!IsStrict || Subtarget.hasVLX() ||
23895 Op0.getSimpleValueType().is512BitVector())) {
23896#ifndef NDEBUG
23897 unsigned Num = VT.getVectorNumElements();
23898       assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23899#endif
23900 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23901 } else {
23902 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23903 // The SSE/AVX packed FP comparison nodes are defined with a
23904 // floating-point vector result that matches the operand type. This allows
23905 // them to work with an SSE1 target (integer vector types are not legal).
23906 VT = Op0.getSimpleValueType();
23907 }
23908
23909 SDValue Cmp;
23910 bool IsAlwaysSignaling;
23911 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23912 if (!Subtarget.hasAVX()) {
23913       // TODO: We could use the following steps to handle a quiet compare with
23914 // signaling encodings.
23915 // 1. Get ordered masks from a quiet ISD::SETO
23916 // 2. Use the masks to mask potential unordered elements in operand A, B
23917 // 3. Get the compare results of masked A, B
23918       // 4. Calculate the final result using the mask and the result from 3
23919 // But currently, we just fall back to scalar operations.
23920 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23921 return SDValue();
23922
23923 // Insert an extra signaling instruction to raise exception.
23924 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23925 SDValue SignalCmp = DAG.getNode(
23926 Opc, dl, {VT, MVT::Other},
23927 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23928 // FIXME: It seems we need to update the flags of all new strict nodes.
23929 // Otherwise, mayRaiseFPException in MI will return false due to
23930 // NoFPExcept = false by default. However, I didn't find it in other
23931 // patches.
23932 SignalCmp->setFlags(Op->getFlags());
23933 Chain = SignalCmp.getValue(1);
23934 }
23935
23936 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23937 // emit two comparisons and a logic op to tie them together.
23938 if (!cheapX86FSETCC_SSE(Cond)) {
23939 // LLVM predicate is SETUEQ or SETONE.
23940 unsigned CC0, CC1;
23941 unsigned CombineOpc;
23942 if (Cond == ISD::SETUEQ) {
23943 CC0 = 3; // UNORD
23944 CC1 = 0; // EQ
23945 CombineOpc = X86ISD::FOR;
23946 } else {
23947         assert(Cond == ISD::SETONE);
23948 CC0 = 7; // ORD
23949 CC1 = 4; // NEQ
23950 CombineOpc = X86ISD::FAND;
23951 }
23952
23953 SDValue Cmp0, Cmp1;
23954 if (IsStrict) {
23955 Cmp0 = DAG.getNode(
23956 Opc, dl, {VT, MVT::Other},
23957 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23958 Cmp1 = DAG.getNode(
23959 Opc, dl, {VT, MVT::Other},
23960 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23961 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23962 Cmp1.getValue(1));
23963 } else {
23964 Cmp0 = DAG.getNode(
23965 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23966 Cmp1 = DAG.getNode(
23967 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23968 }
23969 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23970 } else {
23971 if (IsStrict) {
23972 Cmp = DAG.getNode(
23973 Opc, dl, {VT, MVT::Other},
23974 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23975 Chain = Cmp.getValue(1);
23976 } else
23977 Cmp = DAG.getNode(
23978 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23979 }
23980 } else {
23981 // Handle all other FP comparisons here.
23982 if (IsStrict) {
23983 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23984 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23985 Cmp = DAG.getNode(
23986 Opc, dl, {VT, MVT::Other},
23987 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23988 Chain = Cmp.getValue(1);
23989 } else
23990 Cmp = DAG.getNode(
23991 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23992 }
23993
23994 if (VT.getFixedSizeInBits() >
23995 Op.getSimpleValueType().getFixedSizeInBits()) {
23996 // We emitted a compare with an XMM/YMM result. Finish converting to a
23997 // mask register using a vptestm.
23998 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23999 Cmp = DAG.getBitcast(CastVT, Cmp);
24000 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24001 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24002 } else {
24003 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24004 // the result type of SETCC. The bitcast is expected to be optimized
24005 // away during combining/isel.
24006 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24007 }
24008
24009 if (IsStrict)
24010 return DAG.getMergeValues({Cmp, Chain}, dl);
24011
24012 return Cmp;
24013 }
24014
24015   assert(!IsStrict && "Strict SETCC only handles FP operands.");
24016
24017 MVT VTOp0 = Op0.getSimpleValueType();
24018 (void)VTOp0;
24019   assert(VTOp0 == Op1.getSimpleValueType() &&
24020          "Expected operands with same type!");
24021   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24022          "Invalid number of packed elements for source and destination!");
24023
24024 // The non-AVX512 code below works under the assumption that source and
24025 // destination types are the same.
24026   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24027          "Value types for source and destination must be the same!");
24028
24029 // The result is boolean, but operands are int/float
24030 if (VT.getVectorElementType() == MVT::i1) {
24031     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
24032     // but there is no compare instruction for i8 and i16 elements in KNL.
24033     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24034            "Unexpected operand type");
24035 return LowerIntVSETCC_AVX512(Op, DAG);
24036 }
24037
24038 // Lower using XOP integer comparisons.
24039 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24040 // Translate compare code to XOP PCOM compare mode.
24041 unsigned CmpMode = 0;
24042 switch (Cond) {
24043     default: llvm_unreachable("Unexpected SETCC condition");
24044 case ISD::SETULT:
24045 case ISD::SETLT: CmpMode = 0x00; break;
24046 case ISD::SETULE:
24047 case ISD::SETLE: CmpMode = 0x01; break;
24048 case ISD::SETUGT:
24049 case ISD::SETGT: CmpMode = 0x02; break;
24050 case ISD::SETUGE:
24051 case ISD::SETGE: CmpMode = 0x03; break;
24052 case ISD::SETEQ: CmpMode = 0x04; break;
24053 case ISD::SETNE: CmpMode = 0x05; break;
24054 }
24055
24056 // Are we comparing unsigned or signed integers?
24057 unsigned Opc =
24058 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24059
24060 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24061 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24062 }
24063
24064 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24065 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24066 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24067 SDValue BC0 = peekThroughBitcasts(Op0);
24068 if (BC0.getOpcode() == ISD::AND) {
24069 APInt UndefElts;
24070 SmallVector<APInt, 64> EltBits;
24071 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
24072 VT.getScalarSizeInBits(), UndefElts,
24073 EltBits, false, false)) {
24074 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
24075 Cond = ISD::SETEQ;
24076 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24077 }
24078 }
24079 }
24080 }
24081
24082 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
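  // E.g. with i8 elements and C == 4 (bit 2): ShiftAmt = 8 - 2 - 1 = 5, so the
  // SHL moves bit 2 into the sign bit and the SRA by 7 broadcasts it, giving
  // an all-ones lane exactly where (X & 4) == 4 held.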
24083 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24084 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24085 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24086 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24087 unsigned BitWidth = VT.getScalarSizeInBits();
24088 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24089
24090 SDValue Result = Op0.getOperand(0);
24091 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24092 DAG.getConstant(ShiftAmt, dl, VT));
24093 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24094 DAG.getConstant(BitWidth - 1, dl, VT));
24095 return Result;
24096 }
24097 }
24098
24099 // Break 256-bit integer vector compare into smaller ones.
24100 if (VT.is256BitVector() && !Subtarget.hasInt256())
24101 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24102
24103 // Break 512-bit integer vector compare into smaller ones.
24104 // TODO: Try harder to use VPCMPx + VPMOV2x?
24105 if (VT.is512BitVector())
24106 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24107
24108 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24109 // not-of-PCMPEQ:
24110 // X != INT_MIN --> X >s INT_MIN
24111 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24112 // +X != 0 --> +X >s 0
24113 APInt ConstValue;
24114 if (Cond == ISD::SETNE &&
24115 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24116 if (ConstValue.isMinSignedValue())
24117 Cond = ISD::SETGT;
24118 else if (ConstValue.isMaxSignedValue())
24119 Cond = ISD::SETLT;
24120 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24121 Cond = ISD::SETGT;
24122 }
24123
24124 // If both operands are known non-negative, then an unsigned compare is the
24125 // same as a signed compare and there's no need to flip signbits.
24126 // TODO: We could check for more general simplifications here since we're
24127 // computing known bits.
24128 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24129 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24130
24131 // Special case: Use min/max operations for unsigned compares.
24132 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24133 if (ISD::isUnsignedIntSetCC(Cond) &&
24134 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24135 TLI.isOperationLegal(ISD::UMIN, VT)) {
24136 // If we have a constant operand, increment/decrement it and change the
24137 // condition to avoid an invert.
24138 if (Cond == ISD::SETUGT) {
24139 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24140 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
24141 Op1 = UGTOp1;
24142 Cond = ISD::SETUGE;
24143 }
24144 }
24145 if (Cond == ISD::SETULT) {
24146 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24147 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
24148 Op1 = ULTOp1;
24149 Cond = ISD::SETULE;
24150 }
24151 }
24152 bool Invert = false;
24153 unsigned Opc;
24154 switch (Cond) {
24155     default: llvm_unreachable("Unexpected condition code");
24156     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
24157 case ISD::SETULE: Opc = ISD::UMIN; break;
24158     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
24159 case ISD::SETUGE: Opc = ISD::UMAX; break;
24160 }
24161
24162 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24163 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24164
24165 // If the logical-not of the result is required, perform that now.
24166 if (Invert)
24167 Result = DAG.getNOT(dl, Result, VT);
24168
24169 return Result;
24170 }
24171
24172 // Try to use SUBUS and PCMPEQ.
24173 if (FlipSigns)
24174 if (SDValue V =
24175 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24176 return V;
24177
24178 // We are handling one of the integer comparisons here. Since SSE only has
24179 // GT and EQ comparisons for integer, swapping operands and multiple
24180 // operations may be required for some comparisons.
24181 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24182 : X86ISD::PCMPGT;
24183 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24184 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24185 bool Invert = Cond == ISD::SETNE ||
24186 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24187
24188 if (Swap)
24189 std::swap(Op0, Op1);
24190
24191 // Check that the operation in question is available (most are plain SSE2,
24192 // but PCMPGTQ and PCMPEQQ have different requirements).
24193 if (VT == MVT::v2i64) {
24194 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24195       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24196
24197 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24198 // the odd elements over the even elements.
24199 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24200 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24201 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24202
24203 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24204 static const int MaskHi[] = { 1, 1, 3, 3 };
24205 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24206
24207 return DAG.getBitcast(VT, Result);
24208 }
24209
24210 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24211 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24212 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
24213
24214 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24215 static const int MaskHi[] = { 1, 1, 3, 3 };
24216 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24217
24218 return DAG.getBitcast(VT, Result);
24219 }
24220
24221 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24222 // bits of the inputs before performing those operations. The lower
24223 // compare is always unsigned.
24224 SDValue SB;
24225 if (FlipSigns) {
24226 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
24227 } else {
24228 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
24229 }
24230 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24231 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24232
24233 // Cast everything to the right type.
24234 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24235 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24236
24237 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
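      // That is, a 64-bit "greater than" decomposed into 32-bit halves: the
      // result is set if the high halves already compare greater, or if they
      // are equal and the (unsigned) low halves compare greater. The XOR with
      // SB above flips the low halves' sign bits so the signed PCMPGTD acts
      // as the unsigned compare this formula needs for the low 32 bits.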
24238 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24239 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24240
24241 // Create masks for only the low parts/high parts of the 64 bit integers.
24242 static const int MaskHi[] = { 1, 1, 3, 3 };
24243 static const int MaskLo[] = { 0, 0, 2, 2 };
24244 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24245 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24246 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24247
24248 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24249 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24250
24251 if (Invert)
24252 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24253
24254 return DAG.getBitcast(VT, Result);
24255 }
24256
24257 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24258 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24259 // pcmpeqd + pshufd + pand.
24260       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24261
24262 // First cast everything to the right type.
24263 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24264 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24265
24266 // Do the compare.
24267 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24268
24269 // Make sure the lower and upper halves are both all-ones.
24270 static const int Mask[] = { 1, 0, 3, 2 };
24271 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24272 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24273
24274 if (Invert)
24275 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24276
24277 return DAG.getBitcast(VT, Result);
24278 }
24279 }
24280
24281 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24282 // bits of the inputs before performing those operations.
24283 if (FlipSigns) {
24284 MVT EltVT = VT.getVectorElementType();
24285 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24286 VT);
24287 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24288 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24289 }
24290
24291 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24292
24293 // If the logical-not of the result is required, perform that now.
24294 if (Invert)
24295 Result = DAG.getNOT(dl, Result, VT);
24296
24297 return Result;
24298}
24299
24300// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24301static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24302 const SDLoc &dl, SelectionDAG &DAG,
24303 const X86Subtarget &Subtarget,
24304 SDValue &X86CC) {
24305 // Only support equality comparisons.
24306 if (CC != ISD::SETEQ && CC != ISD::SETNE)
24307 return SDValue();
24308
24309 // Must be a bitcast from vXi1.
24310 if (Op0.getOpcode() != ISD::BITCAST)
24311 return SDValue();
24312
24313 Op0 = Op0.getOperand(0);
24314 MVT VT = Op0.getSimpleValueType();
24315 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24316 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24317 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24318 return SDValue();
24319
24320 X86::CondCode X86Cond;
24321 if (isNullConstant(Op1)) {
24322 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24323 } else if (isAllOnesConstant(Op1)) {
24324     // The carry flag is set when the mask is all ones.
24325 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24326 } else
24327 return SDValue();
24328
24329   // If the input is an AND, we can combine its operands into the KTEST.
24330 bool KTestable = false;
24331 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24332 KTestable = true;
24333 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24334 KTestable = true;
24335 if (!isNullConstant(Op1))
24336 KTestable = false;
24337 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24338 SDValue LHS = Op0.getOperand(0);
24339 SDValue RHS = Op0.getOperand(1);
24340 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24341 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24342 }
24343
24344   // If the input is an OR, we can combine its operands into the KORTEST.
24345 SDValue LHS = Op0;
24346 SDValue RHS = Op0;
24347 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24348 LHS = Op0.getOperand(0);
24349 RHS = Op0.getOperand(1);
24350 }
24351
24352 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24353 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24354}
24355
24356/// Emit flags for the given setcc condition and operands. Also returns the
24357/// corresponding X86 condition code constant in X86CC.
24358SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24359 ISD::CondCode CC, const SDLoc &dl,
24360 SelectionDAG &DAG,
24361 SDValue &X86CC) const {
24362 // Optimize to BT if possible.
24363 // Lower (X & (1 << N)) == 0 to BT(X, N).
24364 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24365 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24366 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
24367 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24368 X86::CondCode X86CondCode;
24369 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24370 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24371 return BT;
24372 }
24373 }
24374
24375   // Try to use PTEST/PMOVMSKB for a tree of ORs compared for equality with 0.
24376 // TODO: We could do AND tree with all 1s as well by using the C flag.
24377 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
24378 if (SDValue CmpZ =
24379 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
24380 return CmpZ;
24381
24382 // Try to lower using KORTEST or KTEST.
24383 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24384 return Test;
24385
24386 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
24387 // these.
24388 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
24389 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24390 // If the input is a setcc, then reuse the input setcc or use a new one with
24391 // the inverted condition.
24392 if (Op0.getOpcode() == X86ISD::SETCC) {
24393 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24394
24395 X86CC = Op0.getOperand(0);
24396 if (Invert) {
24397 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24398 CCode = X86::GetOppositeBranchCondition(CCode);
24399 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24400 }
24401
24402 return Op0.getOperand(1);
24403 }
24404 }
24405
24406   // Try to use the carry flag from the add in place of a separate CMP for:
24407 // (seteq (add X, -1), -1). Similar for setne.
24408 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24409 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24410 if (isProfitableToUseFlagOp(Op0)) {
24411 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24412
24413 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24414 Op0.getOperand(1));
24415 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24416 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24417 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24418 return SDValue(New.getNode(), 1);
24419 }
24420 }
24421
24422 X86::CondCode CondCode =
24423 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24424   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24425
24426 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24427 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24428 return EFLAGS;
24429}
24430
24431SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24432
24433 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24434 Op.getOpcode() == ISD::STRICT_FSETCCS;
24435 MVT VT = Op->getSimpleValueType(0);
24436
24437 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24438
24439   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24440 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24441 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24442 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24443 SDLoc dl(Op);
24444 ISD::CondCode CC =
24445 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24446
24447 // Handle f128 first, since one possible outcome is a normal integer
24448 // comparison which gets handled by emitFlagsForSetcc.
24449 if (Op0.getValueType() == MVT::f128) {
24450 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24451 Op.getOpcode() == ISD::STRICT_FSETCCS);
24452
24453 // If softenSetCCOperands returned a scalar, use it.
24454 if (!Op1.getNode()) {
24455       assert(Op0.getValueType() == Op.getValueType() &&
24456              "Unexpected setcc expansion!");
24457 if (IsStrict)
24458 return DAG.getMergeValues({Op0, Chain}, dl);
24459 return Op0;
24460 }
24461 }
24462
24463 if (Op0.getSimpleValueType().isInteger()) {
24464 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24465 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
24466 // this may translate to fewer uops depending on uarch implementation. The
24467 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24468 // canonicalize to that CondCode.
24469 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24470 // encoding size - so it must either already be an i8 or i32 immediate, or it
24471 // shrinks down to that. We don't do this for any i64's to avoid additional
24472 // constant materializations.
24473 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24474 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24475 const APInt &Op1Val = Op1C->getAPIntValue();
24476 if (!Op1Val.isZero()) {
24477 // Ensure the constant+1 doesn't overflow.
24478 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24479 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24480 APInt Op1ValPlusOne = Op1Val + 1;
24481 if (Op1ValPlusOne.isSignedIntN(32) &&
24482 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24483 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24484 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24485 : ISD::CondCode::SETUGE;
24486 }
24487 }
24488 }
24489 }
24490
24491 SDValue X86CC;
24492 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24493 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24494 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24495 }
24496
24497 // Handle floating point.
24498 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24499 if (CondCode == X86::COND_INVALID)
24500 return SDValue();
24501
24502 SDValue EFLAGS;
24503 if (IsStrict) {
24504 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24505 EFLAGS =
24506 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24507 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24508 Chain = EFLAGS.getValue(1);
24509 } else {
24510 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24511 }
24512
24513 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24514 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24515 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24516}
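The SGT/UGT canonicalization in the integer path above boils down to rewriting "x > C" as "x >= C + 1" whenever C + 1 cannot overflow and the new immediate stays in the same i8/i32 encoding class. A hedged scalar sketch of that check for the signed case (canonicalizeSGT is a hypothetical helper; the real code performs the same tests on APInt values inside the DAG):

#include <cassert>
#include <cstdint>

// Rewrite "x > C" as "x >= C + 1" only when it is safe and free.
static bool canonicalizeSGT(int64_t C, int64_t &NewC) {
  if (C == INT64_MAX) // C + 1 would overflow
    return false;
  int64_t CPlusOne = C + 1;
  bool FitsI32 = CPlusOne >= INT32_MIN && CPlusOne <= INT32_MAX;
  bool WasI8 = C >= INT8_MIN && C <= INT8_MAX;
  bool IsI8 = CPlusOne >= INT8_MIN && CPlusOne <= INT8_MAX;
  if (FitsI32 && (!WasI8 || IsI8)) { // don't grow an i8 imm into an i32 imm
    NewC = CPlusOne;                 // and switch SETGT to SETGE
    return true;
  }
  return false;
}

int main() {
  int64_t NewC;
  assert(canonicalizeSGT(5, NewC) && NewC == 6); // x > 5  ->  x >= 6
  assert(!canonicalizeSGT(INT64_MAX, NewC));     // would overflow
  assert(!canonicalizeSGT(127, NewC));           // 128 leaves the i8 class
  return 0;
}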
24517
24518SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24519 SDValue LHS = Op.getOperand(0);
24520 SDValue RHS = Op.getOperand(1);
24521 SDValue Carry = Op.getOperand(2);
24522 SDValue Cond = Op.getOperand(3);
24523 SDLoc DL(Op);
24524
24525 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24526 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24527
24528 // Recreate the carry if needed.
24529 EVT CarryVT = Carry.getValueType();
24530 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24531 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24532
24533 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24534 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24535 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24536}
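One common client of SETCCCARRY is a wide unsigned compare that legalization has split into halves: the low halves are subtracted first and the resulting borrow feeds an SBB of the high halves, so the final borrow answers the full-width "less than". A rough model under that assumption (ult64ViaSbb is illustrative, not part of this lowering):

#include <cassert>
#include <cstdint>

// 64-bit unsigned "less than" built from a 32-bit SUB followed by an SBB.
static bool ult64ViaSbb(uint64_t L, uint64_t R) {
  uint32_t LLo = uint32_t(L), LHi = uint32_t(L >> 32);
  uint32_t RLo = uint32_t(R), RHi = uint32_t(R >> 32);
  bool Borrow = LLo < RLo;                              // SUB of the low halves
  uint64_t Hi = uint64_t(LHi) - uint64_t(RHi) - Borrow; // SBB of the high halves
  return (Hi >> 63) & 1;                                // borrow out == COND_B
}

int main() {
  assert(ult64ViaSbb(1, 2));
  assert(!ult64ViaSbb(0x100000000ull, 0xFFFFFFFFull));
  assert(ult64ViaSbb(0x1FFFFFFFFull, 0x200000000ull));
  return 0;
}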
24537
24538// This function returns three things: the arithmetic computation itself
24539// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24540// flag and the condition code define the case in which the arithmetic
24541// computation overflows.
24542static std::pair<SDValue, SDValue>
24543getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24544 assert(Op.getResNo() == 0 && "Unexpected result number!");
24545 SDValue Value, Overflow;
24546 SDValue LHS = Op.getOperand(0);
24547 SDValue RHS = Op.getOperand(1);
24548 unsigned BaseOp = 0;
24549 SDLoc DL(Op);
24550 switch (Op.getOpcode()) {
24551 default: llvm_unreachable("Unknown ovf instruction!");
24552 case ISD::SADDO:
24553 BaseOp = X86ISD::ADD;
24554 Cond = X86::COND_O;
24555 break;
24556 case ISD::UADDO:
24557 BaseOp = X86ISD::ADD;
24558 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24559 break;
24560 case ISD::SSUBO:
24561 BaseOp = X86ISD::SUB;
24562 Cond = X86::COND_O;
24563 break;
24564 case ISD::USUBO:
24565 BaseOp = X86ISD::SUB;
24566 Cond = X86::COND_B;
24567 break;
24568 case ISD::SMULO:
24569 BaseOp = X86ISD::SMUL;
24570 Cond = X86::COND_O;
24571 break;
24572 case ISD::UMULO:
24573 BaseOp = X86ISD::UMUL;
24574 Cond = X86::COND_O;
24575 break;
24576 }
24577
24578 if (BaseOp) {
24579 // Also sets EFLAGS.
24580 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24581 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24582 Overflow = Value.getValue(1);
24583 }
24584
24585 return std::make_pair(Value, Overflow);
24586}
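The switch above records which EFLAGS bit signals overflow for each node: the signed forms test OF (COND_O), while unsigned add normally tests CF (COND_B). A small builtin-based model of those two flags for 32-bit addition (the function names are illustrative):

#include <cassert>
#include <cstdint>

static bool signedAddOverflows(int32_t A, int32_t B) {   // OF after ADD
  int32_t R;
  return __builtin_add_overflow(A, B, &R);
}

static bool unsignedAddCarries(uint32_t A, uint32_t B) { // CF after ADD
  uint32_t R;
  return __builtin_add_overflow(A, B, &R);
}

int main() {
  assert(signedAddOverflows(INT32_MAX, 1));
  assert(!unsignedAddCarries(1u, 2u));
  assert(unsignedAddCarries(UINT32_MAX, 1u));
  return 0;
}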
24587
24588static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24589 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24590 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24591 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24592 // has only one use.
24593 SDLoc DL(Op);
24594 X86::CondCode Cond;
24595 SDValue Value, Overflow;
24596 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24597
24598 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24599 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24600 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24601}
24602
24603/// Return true if opcode is a X86 logical comparison.
24604static bool isX86LogicalCmp(SDValue Op) {
24605 unsigned Opc = Op.getOpcode();
24606 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24607 Opc == X86ISD::FCMP)
24608 return true;
24609 if (Op.getResNo() == 1 &&
24610 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24611 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24612 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24613 return true;
24614
24615 return false;
24616}
24617
24618static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24619 if (V.getOpcode() != ISD::TRUNCATE)
24620 return false;
24621
24622 SDValue VOp0 = V.getOperand(0);
24623 unsigned InBits = VOp0.getValueSizeInBits();
24624 unsigned Bits = V.getValueSizeInBits();
24625 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24626}
24627
24628SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24629 bool AddTest = true;
24630 SDValue Cond = Op.getOperand(0);
24631 SDValue Op1 = Op.getOperand(1);
24632 SDValue Op2 = Op.getOperand(2);
24633 SDLoc DL(Op);
24634 MVT VT = Op1.getSimpleValueType();
24635 SDValue CC;
24636
24637 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24638 // are available or VBLENDV if AVX is available.
24639 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24640 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24641 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24642 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24643 bool IsAlwaysSignaling;
24644 unsigned SSECC =
24645 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24646 CondOp0, CondOp1, IsAlwaysSignaling);
24647
24648 if (Subtarget.hasAVX512()) {
24649 SDValue Cmp =
24650 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24651 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24652 assert(!VT.isVector() && "Not a scalar type?");
24653 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24654 }
24655
24656 if (SSECC < 8 || Subtarget.hasAVX()) {
24657 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24658 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24659
24660 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24661 // of 3 logic instructions for size savings and potentially speed.
24662 // Unfortunately, there is no scalar form of VBLENDV.
24663
24664 // If either operand is a +0.0 constant, don't try this. We can expect to
24665 // optimize away at least one of the logic instructions later in that
24666 // case, so that sequence would be faster than a variable blend.
24667
24668 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24669 // uses XMM0 as the selection register. That may need just as many
24670 // instructions as the AND/ANDN/OR sequence due to register moves, so
24671 // don't bother.
24672 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24673 !isNullFPConstant(Op2)) {
24674 // Convert to vectors, do a VSELECT, and convert back to scalar.
24675 // All of the conversions should be optimized away.
24676 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24677 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24678 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24679 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24680
24681 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24682 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24683
24684 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24685
24686 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24687 VSel, DAG.getIntPtrConstant(0, DL));
24688 }
24689 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24690 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24691 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24692 }
24693 }
24694
24695 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24696 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24697 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24698 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24699 }
24700
24701 if (Cond.getOpcode() == ISD::SETCC) {
24702 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24703 Cond = NewCond;
24704 // If the condition was updated, it's possible that the operands of the
24705 // select were also updated (for example, EmitTest has a RAUW). Refresh
24706 // the local references to the select operands in case they got stale.
24707 Op1 = Op.getOperand(1);
24708 Op2 = Op.getOperand(2);
24709 }
24710 }
24711
24712 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24713 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24714 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24715 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24716 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24717 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24718 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24719 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24720 if (Cond.getOpcode() == X86ISD::SETCC &&
24721 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24722 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24723 SDValue Cmp = Cond.getOperand(1);
24724 SDValue CmpOp0 = Cmp.getOperand(0);
24725 unsigned CondCode = Cond.getConstantOperandVal(0);
24726
24727 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24728 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24729 // handling to keep the CMP with 0. This should be removed by
24730 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24731 // cttz_zero_undef.
24732 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24733 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24734 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24735 };
24736 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24737 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24738 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24739 // Keep Cmp.
24740 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24741 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24742 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24743 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24744
24745 // 'X - 1' sets the carry flag if X == 0.
24746 // '0 - X' sets the carry flag if X != 0.
24747 // Convert the carry flag to a -1/0 mask with sbb:
24748 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24749 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24750 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24751 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24752 SDValue Sub;
24753 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24754 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24755 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24756 } else {
24757 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24758 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24759 }
24760 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24761 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24762 Sub.getValue(1));
24763 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24764 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24765 Cmp.getOperand(0).getOpcode() == ISD::AND &&
24766 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
24767 SDValue Src1, Src2;
24768 // Returns true if Op2 is an XOR or OR operator and one of its operands
24769 // is equal to Op1, i.e. the pattern is
24770 // (a, a op b) || (b, a op b)
24771 auto isOrXorPattern = [&]() {
24772 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24773 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24774 Src1 =
24775 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24776 Src2 = Op1;
24777 return true;
24778 }
24779 return false;
24780 };
24781
24782 if (isOrXorPattern()) {
24783 SDValue Neg;
24784 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24785 // We need a mask of all zeros or all ones with the same size as the other
24786 // operands.
24787 if (CmpSz > VT.getSizeInBits())
24788 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24789 else if (CmpSz < VT.getSizeInBits())
24790 Neg = DAG.getNode(ISD::AND, DL, VT,
24791 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24792 DAG.getConstant(1, DL, VT));
24793 else
24794 Neg = CmpOp0;
24795 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
24796 Neg); // -(and (x, 0x1))
24797 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24798 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24799 }
24800 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24801 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24802 ((CondCode == X86::COND_S) || // smin(x, 0)
24803 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24804 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24805 //
24806 // If the comparison is testing for a positive value, we have to invert
24807 // the sign bit mask, so only do that transform if the target has a
24808 // bitwise 'and not' instruction (the invert is free).
24809 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24810 unsigned ShCt = VT.getSizeInBits() - 1;
24811 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24812 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24813 if (CondCode == X86::COND_G)
24814 Shift = DAG.getNOT(DL, Shift, VT);
24815 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24816 }
24817 }
24818
24819 // Look past (and (setcc_carry (cmp ...)), 1).
24820 if (Cond.getOpcode() == ISD::AND &&
24821 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24822 isOneConstant(Cond.getOperand(1)))
24823 Cond = Cond.getOperand(0);
24824
24825 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24826 // setting operand in place of the X86ISD::SETCC.
24827 unsigned CondOpcode = Cond.getOpcode();
24828 if (CondOpcode == X86ISD::SETCC ||
24829 CondOpcode == X86ISD::SETCC_CARRY) {
24830 CC = Cond.getOperand(0);
24831
24832 SDValue Cmp = Cond.getOperand(1);
24833 bool IllegalFPCMov = false;
24834 if (VT.isFloatingPoint() && !VT.isVector() &&
24835 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24836 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24837
24838 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24839 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24840 Cond = Cmp;
24841 AddTest = false;
24842 }
24843 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24844 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24845 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24846 SDValue Value;
24847 X86::CondCode X86Cond;
24848 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24849
24850 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24851 AddTest = false;
24852 }
24853
24854 if (AddTest) {
24855 // Look past the truncate if the high bits are known zero.
24856 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24857 Cond = Cond.getOperand(0);
24858
24859 // We know the result of AND is compared against zero. Try to match
24860 // it to BT.
24861 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24862 X86::CondCode X86CondCode;
24863 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24864 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24865 Cond = BT;
24866 AddTest = false;
24867 }
24868 }
24869 }
24870
24871 if (AddTest) {
24872 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24873 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24874 }
24875
24876 // a < b ? -1 : 0 -> RES = ~setcc_carry
24877 // a < b ? 0 : -1 -> RES = setcc_carry
24878 // a >= b ? -1 : 0 -> RES = setcc_carry
24879 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24880 if (Cond.getOpcode() == X86ISD::SUB) {
24881 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24882
24883 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24884 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24885 (isNullConstant(Op1) || isNullConstant(Op2))) {
24886 SDValue Res =
24887 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24888 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24889 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24890 return DAG.getNOT(DL, Res, Res.getValueType());
24891 return Res;
24892 }
24893 }
24894
24895 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24896 // widen the cmov and push the truncate through. This avoids introducing a new
24897 // branch during isel and doesn't add any extensions.
24898 if (Op.getValueType() == MVT::i8 &&
24899 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24900 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24901 if (T1.getValueType() == T2.getValueType() &&
24902 // Exclude CopyFromReg to avoid partial register stalls.
24903 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24904 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24905 CC, Cond);
24906 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24907 }
24908 }
24909
24910 // Or finally, promote i8 cmovs if we have CMOV,
24911 // or i16 cmovs if it won't prevent folding a load.
24912 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24913 // legal, but EmitLoweredSelect() cannot deal with these extensions
24914 // being inserted between two CMOV's. (in i16 case too TBN)
24915 // https://bugs.llvm.org/show_bug.cgi?id=40974
24916 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24917 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24918 !X86::mayFoldLoad(Op2, Subtarget))) {
24919 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24920 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24921 SDValue Ops[] = { Op2, Op1, CC, Cond };
24922 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24923 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24924 }
24925
24926 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24927 // condition is true.
24928 SDValue Ops[] = { Op2, Op1, CC, Cond };
24929 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24930}
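The SBB branch of LowerSELECT above turns "select (X == 0), -1, Y" into pure arithmetic: X - 1 borrows exactly when X == 0, SETCC_CARRY materializes that borrow as an all-ones or all-zeros value, and an OR folds it into Y. A scalar model of the resulting computation (selectAllOnesOrY is a hypothetical name):

#include <cassert>
#include <cstdint>

static uint32_t selectAllOnesOrY(uint32_t X, uint32_t Y) {
  bool CF = (X == 0);                   // borrow out of X - 1
  uint32_t Mask = CF ? 0xFFFFFFFFu : 0; // SETCC_CARRY / sbb reg, reg
  return Mask | Y;                      // or (sbb), Y
}

int main() {
  const uint32_t Xs[] = {0u, 1u, 7u};
  const uint32_t Ys[] = {0u, 5u, 0xFFFFFFFFu};
  for (uint32_t X : Xs)
    for (uint32_t Y : Ys)
      assert(selectAllOnesOrY(X, Y) == (X == 0 ? 0xFFFFFFFFu : Y));
  return 0;
}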
24931
24932static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24933 const X86Subtarget &Subtarget,
24934 SelectionDAG &DAG) {
24935 MVT VT = Op->getSimpleValueType(0);
24936 SDValue In = Op->getOperand(0);
24937 MVT InVT = In.getSimpleValueType();
24938 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24939 MVT VTElt = VT.getVectorElementType();
24940 SDLoc dl(Op);
24941
24942 unsigned NumElts = VT.getVectorNumElements();
24943
24944 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24945 MVT ExtVT = VT;
24946 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24947 // If v16i32 is to be avoided, we'll need to split and concatenate.
24948 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24949 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24950
24951 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24952 }
24953
24954 // Widen to 512-bits if VLX is not supported.
24955 MVT WideVT = ExtVT;
24956 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24957 NumElts *= 512 / ExtVT.getSizeInBits();
24958 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24959 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24960 In, DAG.getIntPtrConstant(0, dl));
24961 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24962 }
24963
24964 SDValue V;
24965 MVT WideEltVT = WideVT.getVectorElementType();
24966 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24967 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24968 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24969 } else {
24970 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24971 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24972 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24973 }
24974
24975 // Truncate if we had to extend i16/i8 above.
24976 if (VT != ExtVT) {
24977 WideVT = MVT::getVectorVT(VTElt, NumElts);
24978 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24979 }
24980
24981 // Extract back to 128/256-bit if we widened.
24982 if (WideVT != VT)
24983 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24984 DAG.getIntPtrConstant(0, dl));
24985
24986 return V;
24987}
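When neither the DQI nor the BWI fast path applies, the mask extension above falls back to a plain vector select: each set mask bit becomes an all-ones lane and each clear bit a zero lane. A per-lane model of that fallback (signExtendMaskBit is an illustrative stand-in for DAG.getSelect(dl, WideVT, In, NegOne, Zero)):

#include <cassert>
#include <cstdint>

static int32_t signExtendMaskBit(bool MaskBit) {
  return MaskBit ? -1 : 0; // select(mask, NegOne, Zero) per lane
}

int main() {
  assert(signExtendMaskBit(true) == -1);
  assert(signExtendMaskBit(false) == 0);
  return 0;
}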
24988
24989static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24990 SelectionDAG &DAG) {
24991 SDValue In = Op->getOperand(0);
24992 MVT InVT = In.getSimpleValueType();
24993
24994 if (InVT.getVectorElementType() == MVT::i1)
24995 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24996
24997 assert(Subtarget.hasAVX() && "Expected AVX support");
24998 return LowerAVXExtend(Op, DAG, Subtarget);
24999}
25000
25001// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25002// For sign extend this needs to handle all vector sizes and SSE4.1 and
25003// non-SSE4.1 targets. For zero extend this should only handle inputs of
25004// MVT::v64i8 when BWI is not supported, but AVX512 is.
25005static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25006 const X86Subtarget &Subtarget,
25007 SelectionDAG &DAG) {
25008 SDValue In = Op->getOperand(0);
25009 MVT VT = Op->getSimpleValueType(0);
25010 MVT InVT = In.getSimpleValueType();
25011
25012 MVT SVT = VT.getVectorElementType();
25013 MVT InSVT = InVT.getVectorElementType();
25014 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25015
25016 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25017 return SDValue();
25018 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25019 return SDValue();
25020 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25021 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25022 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25023 return SDValue();
25024
25025 SDLoc dl(Op);
25026 unsigned Opc = Op.getOpcode();
25027 unsigned NumElts = VT.getVectorNumElements();
25028
25029 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25030 // For 512-bit vectors, we need 128-bits or 256-bits.
25031 if (InVT.getSizeInBits() > 128) {
25032 // Input needs to be at least the same number of elements as output, and
25033 // at least 128-bits.
25034 int InSize = InSVT.getSizeInBits() * NumElts;
25035 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25036 InVT = In.getSimpleValueType();
25037 }
25038
25039 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25040 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25041 // need to be handled here for 256/512-bit results.
25042 if (Subtarget.hasInt256()) {
25043 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25044
25045 if (InVT.getVectorNumElements() != NumElts)
25046 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25047
25048 // FIXME: Apparently we create inreg operations that could be regular
25049 // extends.
25050 unsigned ExtOpc =
25051 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25052 : ISD::ZERO_EXTEND;
25053 return DAG.getNode(ExtOpc, dl, VT, In);
25054 }
25055
25056 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25057 if (Subtarget.hasAVX()) {
25058 assert(VT.is256BitVector() && "256-bit vector expected");
25059 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25060 int HalfNumElts = HalfVT.getVectorNumElements();
25061
25062 unsigned NumSrcElts = InVT.getVectorNumElements();
25063 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25064 for (int i = 0; i != HalfNumElts; ++i)
25065 HiMask[i] = HalfNumElts + i;
25066
25067 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25068 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25069 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25070 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25071 }
25072
25073 // We should only get here for sign extend.
25074 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25075 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25076
25077 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25078 SDValue Curr = In;
25079 SDValue SignExt = Curr;
25080
25081 // As SRAI is only available on i16/i32 types, we expand only up to i32
25082 // and handle i64 separately.
25083 if (InVT != MVT::v4i32) {
25084 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25085
25086 unsigned DestWidth = DestVT.getScalarSizeInBits();
25087 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25088
25089 unsigned InNumElts = InVT.getVectorNumElements();
25090 unsigned DestElts = DestVT.getVectorNumElements();
25091
25092 // Build a shuffle mask that takes each input element and places it in the
25093 // MSBs of the new element size.
25094 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25095 for (unsigned i = 0; i != DestElts; ++i)
25096 Mask[i * Scale + (Scale - 1)] = i;
25097
25098 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25099 Curr = DAG.getBitcast(DestVT, Curr);
25100
25101 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25102 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25103 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25104 }
25105
25106 if (VT == MVT::v2i64) {
25107 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25108 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25109 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25110 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25111 SignExt = DAG.getBitcast(VT, SignExt);
25112 }
25113
25114 return SignExt;
25115}
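The pre-SSE4.1 tail of LowerEXTEND_VECTOR_INREG sign-extends without a dedicated instruction: the shuffle parks each narrow element in the most-significant bits of the wider element, and VSRAI then shifts the sign bit back down. The same trick for a single i8 -> i16 lane (signExtendViaShift is an illustrative scalar stand-in):

#include <cassert>
#include <cstdint>

static int16_t signExtendViaShift(int8_t B) {
  uint16_t Placed = uint16_t(uint8_t(B)) << 8; // shuffle the byte into the MSBs
  return int16_t(Placed) >> 8;                 // arithmetic shift by DestWidth - SrcWidth
}

int main() {
  assert(signExtendViaShift(-5) == -5);
  assert(signExtendViaShift(127) == 127);
  assert(signExtendViaShift(-128) == -128);
  return 0;
}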
25116
25117static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25118 SelectionDAG &DAG) {
25119 MVT VT = Op->getSimpleValueType(0);
25120 SDValue In = Op->getOperand(0);
25121 MVT InVT = In.getSimpleValueType();
25122 SDLoc dl(Op);
25123
25124 if (InVT.getVectorElementType() == MVT::i1)
25125 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25126
25127 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25128 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25129 "Expected same number of elements");
25130 assert((VT.getVectorElementType() == MVT::i16 ||
25131 VT.getVectorElementType() == MVT::i32 ||
25132 VT.getVectorElementType() == MVT::i64) &&
25133 "Unexpected element type");
25134 assert((InVT.getVectorElementType() == MVT::i8 ||
25135 InVT.getVectorElementType() == MVT::i16 ||
25136 InVT.getVectorElementType() == MVT::i32) &&
25137 "Unexpected element type");
25138
25139 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25140 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25141 return splitVectorIntUnary(Op, DAG);
25142 }
25143
25144 if (Subtarget.hasInt256())
25145 return Op;
25146
25147 // Optimize vectors in AVX mode
25148 // Sign extend v8i16 to v8i32 and
25149 // v4i32 to v4i64
25150 //
25151 // Divide input vector into two parts
25152 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25153 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25154 // concat the vectors to original VT
25155 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25156 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25157
25158 unsigned NumElems = InVT.getVectorNumElements();
25159 SmallVector<int,8> ShufMask(NumElems, -1);
25160 for (unsigned i = 0; i != NumElems/2; ++i)
25161 ShufMask[i] = i + NumElems/2;
25162
25163 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25164 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25165
25166 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25167}
25168
25169/// Change a vector store into a pair of half-size vector stores.
25170static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25171 SDValue StoredVal = Store->getValue();
25172 assert((StoredVal.getValueType().is256BitVector() ||
25173 StoredVal.getValueType().is512BitVector()) &&
25174 "Expecting 256/512-bit op");
25175
25176 // Splitting volatile memory ops is not allowed unless the operation was not
25177 // legal to begin with. Assume the input store is legal (this transform is
25178 // only used for targets with AVX). Note: It is possible that we have an
25179 // illegal type like v2i128, and so we could allow splitting a volatile store
25180 // in that case if that is important.
25181 if (!Store->isSimple())
25182 return SDValue();
25183
25184 SDLoc DL(Store);
25185 SDValue Value0, Value1;
25186 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25187 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25188 SDValue Ptr0 = Store->getBasePtr();
25189 SDValue Ptr1 =
25190 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
25191 SDValue Ch0 =
25192 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25193 Store->getOriginalAlign(),
25194 Store->getMemOperand()->getFlags());
25195 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25196 Store->getPointerInfo().getWithOffset(HalfOffset),
25197 Store->getOriginalAlign(),
25198 Store->getMemOperand()->getFlags());
25199 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25200}
25201
25202/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25203/// type.
25204static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25205 SelectionDAG &DAG) {
25206 SDValue StoredVal = Store->getValue();
25207 assert(StoreVT.is128BitVector() &&
25208 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25209 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25210
25211 // Splitting volatile memory ops is not allowed unless the operation was not
25212 // legal to begin with. We are assuming the input op is legal (this transform
25213 // is only used for targets with AVX).
25214 if (!Store->isSimple())
25215 return SDValue();
25216
25217 MVT StoreSVT = StoreVT.getScalarType();
25218 unsigned NumElems = StoreVT.getVectorNumElements();
25219 unsigned ScalarSize = StoreSVT.getStoreSize();
25220
25221 SDLoc DL(Store);
25222 SmallVector<SDValue, 4> Stores;
25223 for (unsigned i = 0; i != NumElems; ++i) {
25224 unsigned Offset = i * ScalarSize;
25225 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25226 TypeSize::Fixed(Offset), DL);
25227 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25228 DAG.getIntPtrConstant(i, DL));
25229 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25230 Store->getPointerInfo().getWithOffset(Offset),
25231 Store->getOriginalAlign(),
25232 Store->getMemOperand()->getFlags());
25233 Stores.push_back(Ch);
25234 }
25235 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25236}
25237
25238static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25239 SelectionDAG &DAG) {
25240 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25241 SDLoc dl(St);
25242 SDValue StoredVal = St->getValue();
25243
25244 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25245 if (StoredVal.getValueType().isVector() &&
25246 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25247 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25248 assert(NumElts <= 8 && "Unexpected VT");
25249 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25250 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25251 "Expected AVX512F without AVX512DQI");
25252
25253 // We must pad with zeros to ensure we store zeroes to any unused bits.
25254 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25255 DAG.getUNDEF(MVT::v16i1), StoredVal,
25256 DAG.getIntPtrConstant(0, dl));
25257 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25258 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25259 // Make sure we store zeros in the extra bits.
25260 if (NumElts < 8)
25261 StoredVal = DAG.getZeroExtendInReg(
25262 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25263
25264 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25265 St->getPointerInfo(), St->getOriginalAlign(),
25266 St->getMemOperand()->getFlags());
25267 }
25268
25269 if (St->isTruncatingStore())
25270 return SDValue();
25271
25272 // If this is a 256-bit store of concatenated ops, we are better off splitting
25273 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25274 // and each half can execute independently. Some cores would split the op into
25275 // halves anyway, so the concat (vinsertf128) is purely an extra op.
25276 MVT StoreVT = StoredVal.getSimpleValueType();
25277 if (StoreVT.is256BitVector() ||
25278 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
25279 !Subtarget.hasBWI())) {
25280 SmallVector<SDValue, 4> CatOps;
25281 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
25282 return splitVectorStore(St, DAG);
25283 return SDValue();
25284 }
25285
25286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25287 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
25288 "Unexpected VT");
25289 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25290 TargetLowering::TypeWidenVector && "Unexpected type action!");
25291
25292 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25293 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25294 DAG.getUNDEF(StoreVT));
25295
25296 if (Subtarget.hasSSE2()) {
25297 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25298 // and store it.
25299 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25300 MVT CastVT = MVT::getVectorVT(StVT, 2);
25301 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25302 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25303 DAG.getIntPtrConstant(0, dl));
25304
25305 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25306 St->getPointerInfo(), St->getOriginalAlign(),
25307 St->getMemOperand()->getFlags());
25308 }
25309 assert(Subtarget.hasSSE1() && "Expected SSE");
25310 SDVTList Tys = DAG.getVTList(MVT::Other);
25311 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25312 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25313 St->getMemOperand());
25314}
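For the v*i1 store path above, the net effect is that the mask is widened to 16 bits, truncated to one byte, and the lanes beyond NumElts are zeroed so the stored byte is fully defined. A byte-level model of that packing (packMaskForStore is a hypothetical helper):

#include <cassert>
#include <cstdint>

static uint8_t packMaskForStore(uint16_t WideMask, unsigned NumElts) {
  uint8_t Byte = uint8_t(WideMask);       // bitcast to i16, truncate to i8
  if (NumElts < 8)
    Byte &= uint8_t((1u << NumElts) - 1); // zero the unused high bits
  return Byte;
}

int main() {
  assert(packMaskForStore(0xBD, 4) == 0x0D); // only the low 4 lanes survive
  return 0;
}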
25315
25316// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25317// may emit an illegal shuffle but the expansion is still better than scalar
25318// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25319 // we'll emit a shuffle and an arithmetic shift.
25320// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25321// TODO: It is possible to support ZExt by zeroing the undef values during
25322// the shuffle phase or after the shuffle.
25323static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25324 SelectionDAG &DAG) {
25325 MVT RegVT = Op.getSimpleValueType();
25326 assert(RegVT.isVector() && "We only custom lower vector loads.");
25327 assert(RegVT.isInteger() &&
25328 "We only custom lower integer vector loads.");
25329
25330 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25331 SDLoc dl(Ld);
25332
25333 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25334 if (RegVT.getVectorElementType() == MVT::i1) {
25335 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25336 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25337 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25338 "Expected AVX512F without AVX512DQI");
25339
25340 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25341 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25342 Ld->getMemOperand()->getFlags());
25343
25344 // Replace chain users with the new chain.
25345 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25346
25347 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25348 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25349 DAG.getBitcast(MVT::v16i1, Val),
25350 DAG.getIntPtrConstant(0, dl));
25351 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25352 }
25353
25354 return SDValue();
25355}
25356
25357/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25358/// each of which has no other use apart from the AND / OR.
25359static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25360 Opc = Op.getOpcode();
25361 if (Opc != ISD::OR && Opc != ISD::AND)
25362 return false;
25363 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25364 Op.getOperand(0).hasOneUse() &&
25365 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25366 Op.getOperand(1).hasOneUse());
25367}
25368
25369SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25370 SDValue Chain = Op.getOperand(0);
25371 SDValue Cond = Op.getOperand(1);
25372 SDValue Dest = Op.getOperand(2);
25373 SDLoc dl(Op);
25374
25375 if (Cond.getOpcode() == ISD::SETCC &&
25376 Cond.getOperand(0).getValueType() != MVT::f128) {
25377 SDValue LHS = Cond.getOperand(0);
25378 SDValue RHS = Cond.getOperand(1);
25379 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25380
25381 // Special case for
25382 // setcc([su]{add,sub,mul}o == 0)
25383 // setcc([su]{add,sub,mul}o != 1)
25384 if (ISD::isOverflowIntrOpRes(LHS) &&
25385 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25386 (isNullConstant(RHS) || isOneConstant(RHS))) {
25387 SDValue Value, Overflow;
25388 X86::CondCode X86Cond;
25389 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25390
25391 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25392 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25393
25394 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25395 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25396 Overflow);
25397 }
25398
25399 if (LHS.getSimpleValueType().isInteger()) {
25400 SDValue CCVal;
25401 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25402 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25403 EFLAGS);
25404 }
25405
25406 if (CC == ISD::SETOEQ) {
25407 // For FCMP_OEQ, we can emit
25408 // two branches instead of an explicit AND instruction with a
25409 // separate test. However, we only do this if this block doesn't
25410 // have a fall-through edge, because this requires an explicit
25411 // jmp when the condition is false.
25412 if (Op.getNode()->hasOneUse()) {
25413 SDNode *User = *Op.getNode()->use_begin();
25414 // Look for an unconditional branch following this conditional branch.
25415 // We need this because we need to reverse the successors in order
25416 // to implement FCMP_OEQ.
25417 if (User->getOpcode() == ISD::BR) {
25418 SDValue FalseBB = User->getOperand(1);
25419 SDNode *NewBR =
25420 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25421 assert(NewBR == User);
25422 (void)NewBR;
25423 Dest = FalseBB;
25424
25425 SDValue Cmp =
25426 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25427 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25428 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25429 CCVal, Cmp);
25430 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25431 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25432 Cmp);
25433 }
25434 }
25435 } else if (CC == ISD::SETUNE) {
25436 // For FCMP_UNE, we can emit
25437 // two branches instead of an explicit OR instruction with a
25438 // separate test.
25439 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25440 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25441 Chain =
25442 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
25443 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25444 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25445 Cmp);
25446 } else {
25447 X86::CondCode X86Cond =
25448 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25449 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25450 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25451 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25452 Cmp);
25453 }
25454 }
25455
25456 if (ISD::isOverflowIntrOpRes(Cond)) {
25457 SDValue Value, Overflow;
25458 X86::CondCode X86Cond;
25459 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25460
25461 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25462 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25463 Overflow);
25464 }
25465
25466 // Look past the truncate if the high bits are known zero.
25467 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25468 Cond = Cond.getOperand(0);
25469
25470 EVT CondVT = Cond.getValueType();
25471
25472 // Add an AND with 1 if we don't already have one.
25473 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25474 Cond =
25475 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25476
25477 SDValue LHS = Cond;
25478 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25479
25480 SDValue CCVal;
25481 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25482 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25483 EFLAGS);
25484}
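// Editorial note, not part of the analyzed source: a sketch of what the
// SETOEQ path above emits. An ordered-equal branch needs both ZF = 1 and
// PF = 0 after the X86ISD::FCMP (UCOMIS*/COMIS*) compare, so the successors
// are swapped and two branches to the false block are produced instead of an
// explicit AND of two SETCCs:
//
//     (compare LHS, RHS)        ; X86ISD::FCMP
//     jne  FalseBB              ; COND_NE: values differ
//     jp   FalseBB              ; COND_P: unordered (NaN operand)
//     ...unconditional ISD::BR to the original true destination...
//
// The SETUNE path emits the same jne/jp pair aimed at the true destination,
// which is why it needs no successor swap.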
25485
25486// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25487// Calls to _alloca are needed to probe the stack when allocating more than 4k
25488// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25489 // that the guard pages used by the OS virtual memory manager are allocated in
25490 // the correct sequence.
25491SDValue
25492X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25493 SelectionDAG &DAG) const {
25494 MachineFunction &MF = DAG.getMachineFunction();
25495 bool SplitStack = MF.shouldSplitStack();
25496 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25497 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25498 SplitStack || EmitStackProbeCall;
25499 SDLoc dl(Op);
25500
25501 // Get the inputs.
25502 SDNode *Node = Op.getNode();
25503 SDValue Chain = Op.getOperand(0);
25504 SDValue Size = Op.getOperand(1);
25505 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25506 EVT VT = Node->getValueType(0);
25507
25508 // Chain the dynamic stack allocation so that it doesn't modify the stack
25509 // pointer when other instructions are using the stack.
25510 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25511
25512 bool Is64Bit = Subtarget.is64Bit();
25513 MVT SPTy = getPointerTy(DAG.getDataLayout());
25514
25515 SDValue Result;
25516 if (!Lower) {
25517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25518 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25519 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25520 " not tell us which reg is the stack pointer!");
25521
25522 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25523 const Align StackAlign = TFI.getStackAlign();
25524 if (hasInlineStackProbe(MF)) {
25525 MachineRegisterInfo &MRI = MF.getRegInfo();
25526
25527 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25528 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25529 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25530 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
25531 DAG.getRegister(Vreg, SPTy));
25532 } else {
25533 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25534 Chain = SP.getValue(1);
25535 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25536 }
25537 if (Alignment && *Alignment > StackAlign)
25538 Result =
25539 DAG.getNode(ISD::AND, dl, VT, Result,
25540 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25541 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25542 } else if (SplitStack) {
25543 MachineRegisterInfo &MRI = MF.getRegInfo();
25544
25545 if (Is64Bit) {
25546 // The 64-bit implementation of segmented stacks needs to clobber both r10
25547 // and r11. This makes it impossible to use it along with nested parameters.
25548 const Function &F = MF.getFunction();
25549 for (const auto &A : F.args()) {
25550 if (A.hasNestAttr())
25551 report_fatal_error("Cannot use segmented stacks with functions that "
25552 "have nested arguments.");
25553 }
25554 }
25555
25556 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25557 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25558 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25559 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
25560 DAG.getRegister(Vreg, SPTy));
25561 } else {
25562 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25563 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25564 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25565
25566 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25567 Register SPReg = RegInfo->getStackRegister();
25568 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25569 Chain = SP.getValue(1);
25570
25571 if (Alignment) {
25572 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25573 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25574 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25575 }
25576
25577 Result = SP;
25578 }
25579
25580 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
25581 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
25582
25583 SDValue Ops[2] = {Result, Chain};
25584 return DAG.getMergeValues(Ops, dl);
25585}
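[Editorial sketch, not part of the analyzed source] A minimal illustration of the kind of allocation that reaches the lowering above: a runtime-sized alloca whose size may exceed one 4K page, so Windows-style targets must route it through the _alloca / stack-probe path. The function name below is invented for illustration.

#include <alloca.h>
#include <cstring>

void useDynamicStack(unsigned n) {
  // Runtime-sized stack allocation -> ISD::DYNAMIC_STACKALLOC. Because n is
  // not known at compile time it may exceed 4096 bytes, so the probing
  // lowering must touch each newly committed stack page in order.
  char *buf = static_cast<char *>(alloca(n));
  std::memset(buf, 0, n);
}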
25586
25587SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25588 MachineFunction &MF = DAG.getMachineFunction();
25589 auto PtrVT = getPointerTy(MF.getDataLayout());
25590 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25591
25592 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25593 SDLoc DL(Op);
25594
25595 if (!Subtarget.is64Bit() ||
25596 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25597 // vastart just stores the address of the VarArgsFrameIndex slot into the
25598 // memory location argument.
25599 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25600 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25601 MachinePointerInfo(SV));
25602 }
25603
25604 // __va_list_tag:
25605 // gp_offset (0 - 6 * 8)
25606 // fp_offset (48 - 48 + 8 * 16)
25607 // overflow_arg_area (points to parameters coming in memory).
25608 // reg_save_area
25609 SmallVector<SDValue, 8> MemOps;
25610 SDValue FIN = Op.getOperand(1);
25611 // Store gp_offset
25612 SDValue Store = DAG.getStore(
25613 Op.getOperand(0), DL,
25614 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25615 MachinePointerInfo(SV));
25616 MemOps.push_back(Store);
25617
25618 // Store fp_offset
25619 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
25620 Store = DAG.getStore(
25621 Op.getOperand(0), DL,
25622 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25623 MachinePointerInfo(SV, 4));
25624 MemOps.push_back(Store);
25625
25626 // Store ptr to overflow_arg_area
25627 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25628 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25629 Store =
25630 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25631 MemOps.push_back(Store);
25632
25633 // Store ptr to reg_save_area.
25634 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25635 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25636 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25637 Store = DAG.getStore(
25638 Op.getOperand(0), DL, RSFIN, FIN,
25639 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25640 MemOps.push_back(Store);
25641 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25642}
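[Editorial sketch, not part of the analyzed source] The four stores above populate the System V x86-64 va_list record whose layout the comment sketches; written out as a plain struct (the type name is invented, the fields and byte offsets follow the stores and MachinePointerInfo offsets above):

#include <cstdint>

struct VaListTagSketch {
  uint32_t gp_offset;          // byte offset 0
  uint32_t fp_offset;          // byte offset 4
  void    *overflow_arg_area;  // byte offset 8 on LP64 (stack-passed args)
  void    *reg_save_area;      // byte offset 16 on LP64 (spilled register args)
};

// 4 + 4 + 8 + 8 = 24 bytes on LP64, which matches the 24-byte memcpy that
// LowerVACOPY performs further below (16 bytes on x32, where pointers are 4).
static_assert(sizeof(VaListTagSketch) == 24, "assumes an LP64 host");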
25643
25644SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25645 assert(Subtarget.is64Bit() &&
25646 "LowerVAARG only handles 64-bit va_arg!");
25647 assert(Op.getNumOperands() == 4);
25648
25649 MachineFunction &MF = DAG.getMachineFunction();
25650 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25651 // The Win64 ABI uses char* instead of a structure.
25652 return DAG.expandVAArg(Op.getNode());
25653
25654 SDValue Chain = Op.getOperand(0);
25655 SDValue SrcPtr = Op.getOperand(1);
25656 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25657 unsigned Align = Op.getConstantOperandVal(3);
25658 SDLoc dl(Op);
25659
25660 EVT ArgVT = Op.getNode()->getValueType(0);
25661 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25662 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25663 uint8_t ArgMode;
25664
25665 // Decide which area this value should be read from.
25666 // TODO: Implement the AMD64 ABI in its entirety. This simple
25667 // selection mechanism works only for the basic types.
25668 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25669 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25670 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25671 } else {
25672 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25673 "Unhandled argument type in LowerVAARG");
25674 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25675 }
25676
25677 if (ArgMode == 2) {
25678 // Make sure using fp_offset makes sense.
25679 assert(!Subtarget.useSoftFloat() &&
25680 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25681 Subtarget.hasSSE1());
25682 }
25683
25684 // Insert VAARG node into the DAG
25685 // VAARG returns two values: Variable Argument Address, Chain
25686 SDValue InstOps[] = {Chain, SrcPtr,
25687 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25688 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25689 DAG.getTargetConstant(Align, dl, MVT::i32)};
25690 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25691 SDValue VAARG = DAG.getMemIntrinsicNode(
25692 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25693 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25694 /*Alignment=*/None,
25695 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25696 Chain = VAARG.getValue(1);
25697
25698 // Load the next argument and return it
25699 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25700}
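[Editorial sketch, not part of the analyzed source] The ArgMode selection in LowerVAARG above boils down to a small classification; restated on plain values (the helper name is invented, sizes are DataLayout allocation sizes in bytes):

// ArgMode 2: floating-point arguments of at most 16 bytes are read through
// fp_offset / the XMM save area. ArgMode 1: integer arguments of at most
// 32 bytes are read through gp_offset / the GPR save area. Anything else is
// rejected by the asserts in LowerVAARG (f80 in particular is unimplemented).
unsigned classifyVAArgMode(bool IsFloatingPoint, unsigned ArgSizeBytes) {
  if (IsFloatingPoint && ArgSizeBytes <= 16)
    return 2;
  return 1;
}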
25701
25702static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25703 SelectionDAG &DAG) {
25704 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25705 // where a va_list is still an i8*.
25706 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25707 if (Subtarget.isCallingConvWin64(
25708 DAG.getMachineFunction().getFunction().getCallingConv()))
25709 // Probably a Win64 va_copy.
25710 return DAG.expandVACopy(Op.getNode());
25711
25712 SDValue Chain = Op.getOperand(0);
25713 SDValue DstPtr = Op.getOperand(1);
25714 SDValue SrcPtr = Op.getOperand(2);
25715 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25716 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25717 SDLoc DL(Op);
25718
25719 return DAG.getMemcpy(
25720 Chain, DL, DstPtr, SrcPtr,
25721 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25722 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25723 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25724}
25725
25726// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25727static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25728 switch (Opc) {
25729 case ISD::SHL:
25730 case X86ISD::VSHL:
25731 case X86ISD::VSHLI:
25732 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25733 case ISD::SRL:
25734 case X86ISD::VSRL:
25735 case X86ISD::VSRLI:
25736 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25737 case ISD::SRA:
25738 case X86ISD::VSRA:
25739 case X86ISD::VSRAI:
25740 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25741 }
25742 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 25742)
;
25743}
25744
25745/// Handle vector element shifts where the shift amount is a constant.
25746/// Takes immediate version of shift as input.
25747static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25748 SDValue SrcOp, uint64_t ShiftAmt,
25749 SelectionDAG &DAG) {
25750 MVT ElementType = VT.getVectorElementType();
25751
25752 // Bitcast the source vector to the output type, this is mainly necessary for
25753 // vXi8/vXi64 shifts.
25754 if (VT != SrcOp.getSimpleValueType())
25755 SrcOp = DAG.getBitcast(VT, SrcOp);
25756
25757 // Fold this packed shift into its first operand if ShiftAmt is 0.
25758 if (ShiftAmt == 0)
25759 return SrcOp;
25760
25761 // Check for ShiftAmt >= element width
25762 if (ShiftAmt >= ElementType.getSizeInBits()) {
25763 if (Opc == X86ISD::VSRAI)
25764 ShiftAmt = ElementType.getSizeInBits() - 1;
25765 else
25766 return DAG.getConstant(0, dl, VT);
25767 }
25768
25769 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25770 && "Unknown target vector shift-by-constant node");
25771
25772 // Fold this packed vector shift into a build vector if SrcOp is a
25773 // vector of Constants or UNDEFs.
25774 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25775 SmallVector<SDValue, 8> Elts;
25776 unsigned NumElts = SrcOp->getNumOperands();
25777
25778 switch (Opc) {
25779 default: llvm_unreachable("Unknown opcode!");
25780 case X86ISD::VSHLI:
25781 for (unsigned i = 0; i != NumElts; ++i) {
25782 SDValue CurrentOp = SrcOp->getOperand(i);
25783 if (CurrentOp->isUndef()) {
25784 // Must produce 0s in the correct bits.
25785 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25786 continue;
25787 }
25788 auto *ND = cast<ConstantSDNode>(CurrentOp);
25789 const APInt &C = ND->getAPIntValue();
25790 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
25791 }
25792 break;
25793 case X86ISD::VSRLI:
25794 for (unsigned i = 0; i != NumElts; ++i) {
25795 SDValue CurrentOp = SrcOp->getOperand(i);
25796 if (CurrentOp->isUndef()) {
25797 // Must produce 0s in the correct bits.
25798 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25799 continue;
25800 }
25801 auto *ND = cast<ConstantSDNode>(CurrentOp);
25802 const APInt &C = ND->getAPIntValue();
25803 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
25804 }
25805 break;
25806 case X86ISD::VSRAI:
25807 for (unsigned i = 0; i != NumElts; ++i) {
25808 SDValue CurrentOp = SrcOp->getOperand(i);
25809 if (CurrentOp->isUndef()) {
25810 // All shifted in bits must be the same so use 0.
25811 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25812 continue;
25813 }
25814 auto *ND = cast<ConstantSDNode>(CurrentOp);
25815 const APInt &C = ND->getAPIntValue();
25816 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
25817 }
25818 break;
25819 }
25820
25821 return DAG.getBuildVector(VT, dl, Elts);
25822 }
25823
25824 return DAG.getNode(Opc, dl, VT, SrcOp,
25825 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25826}
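[Editorial sketch, not part of the analyzed source] The build-vector fold above is a per-lane shift of the constant elements, with undef lanes becoming 0; the VSHLI arm on plain integers (helper name invented):

#include <cstdint>
#include <vector>

// E.g. lanes {1, 2, 3, 4} shifted left by 2 become {4, 8, 12, 16}. ShiftAmt is
// already known to be smaller than the element width at the point of the fold.
std::vector<uint32_t> foldVShliConstants(const std::vector<uint32_t> &Lanes,
                                         unsigned ShiftAmt) {
  std::vector<uint32_t> Out;
  Out.reserve(Lanes.size());
  for (uint32_t Lane : Lanes)
    Out.push_back(Lane << ShiftAmt);
  return Out;
}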
25827
25828/// Handle vector element shifts by a splat shift amount
25829static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25830 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25831 const X86Subtarget &Subtarget,
25832 SelectionDAG &DAG) {
25833 MVT AmtVT = ShAmt.getSimpleValueType();
25834 assert(AmtVT.isVector() && "Vector shift type mismatch");
25835 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25836 "Illegal vector splat index");
25837
25838 // Move the splat element to the bottom element.
25839 if (ShAmtIdx != 0) {
25840 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25841 Mask[0] = ShAmtIdx;
25842 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25843 }
25844
25845 // Peek through any zext node if we can get back to a 128-bit source.
25846 if (AmtVT.getScalarSizeInBits() == 64 &&
25847 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25848 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25849 ShAmt.getOperand(0).getValueType().isSimple() &&
25850 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25851 ShAmt = ShAmt.getOperand(0);
25852 AmtVT = ShAmt.getSimpleValueType();
25853 }
25854
25855 // See if we can mask off the upper elements using the existing source node.
25856 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25857 // do this for vXi64 types.
25858 bool IsMasked = false;
25859 if (AmtVT.getScalarSizeInBits() < 64) {
25860 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25861 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25862 // If the shift amount has come from a scalar, then zero-extend the scalar
25863 // before moving to the vector.
25864 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25865 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25866 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25867 AmtVT = MVT::v4i32;
25868 IsMasked = true;
25869 } else if (ShAmt.getOpcode() == ISD::AND) {
25870 // See if the shift amount is already masked (e.g. for rotation modulo),
25871 // then we can zero-extend it by setting all the other mask elements to
25872 // zero.
25873 SmallVector<SDValue> MaskElts(
25874 AmtVT.getVectorNumElements(),
25875 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25876 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25877 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25878 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25879 {ShAmt.getOperand(1), Mask}))) {
25880 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25881 IsMasked = true;
25882 }
25883 }
25884 }
25885
25886 // Extract if the shift amount vector is larger than 128-bits.
25887 if (AmtVT.getSizeInBits() > 128) {
25888 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25889 AmtVT = ShAmt.getSimpleValueType();
25890 }
25891
25892 // Zero-extend bottom element to v2i64 vector type, either by extension or
25893 // shuffle masking.
25894 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25895 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25896 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25897 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25898 } else if (Subtarget.hasSSE41()) {
25899 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25900 MVT::v2i64, ShAmt);
25901 } else {
25902 SDValue ByteShift = DAG.getTargetConstant(
25903 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25904 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25905 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25906 ByteShift);
25907 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25908 ByteShift);
25909 }
25910 }
25911
25912 // Change opcode to non-immediate version.
25913 Opc = getTargetVShiftUniformOpcode(Opc, true);
25914
25915 // The return type has to be a 128-bit type with the same element
25916 // type as the input type.
25917 MVT EltVT = VT.getVectorElementType();
25918 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25919
25920 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25921 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25922}
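// Editorial note, not part of the analyzed source: the zero-extension above
// matters because the non-immediate shifts (e.g. PSLLD) consume the entire
// low 64 bits of the amount operand, and any count >= the element width
// yields an all-zero result. For a v4i32 amount vector <5, 7, x, x>, using it
// unmasked would make the hardware see the 64-bit count 0x0000000700000005,
// which is >= 32, so every lane would be shifted out; zeroing lanes 1..3
// first leaves the intended count of 5.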
25923
25924/// Return Mask with the necessary casting or extending
25925/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25926static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25927 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25928 const SDLoc &dl) {
25929
25930 if (isAllOnesConstant(Mask))
25931 return DAG.getConstant(1, dl, MaskVT);
25932 if (X86::isZeroNode(Mask))
25933 return DAG.getConstant(0, dl, MaskVT);
25934
25935 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25936
25937 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25938 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25939 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25940 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25941 SDValue Lo, Hi;
25942 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25943 DAG.getConstant(0, dl, MVT::i32));
25944 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25945 DAG.getConstant(1, dl, MVT::i32));
25946
25947 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25948 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25949
25950 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25951 } else {
25952 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25953 Mask.getSimpleValueType().getSizeInBits());
25954 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25955 // are extracted by EXTRACT_SUBVECTOR.
25956 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25957 DAG.getBitcast(BitcastVT, Mask),
25958 DAG.getIntPtrConstant(0, dl));
25959 }
25960}
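[Editorial sketch, not part of the analyzed source] On 32-bit hosts the v64i1 branch above cannot bitcast an i64 mask directly, so it splits the scalar first; the same split on plain integers (helper name invented):

#include <cstdint>

// The low half becomes the first v32i1 (EXTRACT_ELEMENT 0), the high half the
// second (EXTRACT_ELEMENT 1); CONCAT_VECTORS then rebuilds the v64i1 mask.
void splitMaskForV64i1(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Mask);
  Hi = static_cast<uint32_t>(Mask >> 32);
}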
25961
25962/// Return (and \p Op, \p Mask) for compare instructions or
25963/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25964/// necessary casting or extending for \p Mask when lowering masking intrinsics
25965static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25966 SDValue PreservedSrc,
25967 const X86Subtarget &Subtarget,
25968 SelectionDAG &DAG) {
25969 MVT VT = Op.getSimpleValueType();
25970 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25971 unsigned OpcodeSelect = ISD::VSELECT;
25972 SDLoc dl(Op);
25973
25974 if (isAllOnesConstant(Mask))
25975 return Op;
25976
25977 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25978
25979 if (PreservedSrc.isUndef())
25980 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25981 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25982}
25983
25984/// Creates an SDNode for a predicated scalar operation.
25985/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25986/// The mask is coming as MVT::i8 and it should be transformed
25987/// to MVT::v1i1 while lowering masking intrinsics.
25988/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25989/// "X86select" instead of "vselect". We just can't create the "vselect" node
25990/// for a scalar instruction.
25991static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25992 SDValue PreservedSrc,
25993 const X86Subtarget &Subtarget,
25994 SelectionDAG &DAG) {
25995
25996 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25997 if (MaskConst->getZExtValue() & 0x1)
25998 return Op;
25999
26000 MVT VT = Op.getSimpleValueType();
26001 SDLoc dl(Op);
26002
26003 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26004 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26005 DAG.getBitcast(MVT::v8i1, Mask),
26006 DAG.getIntPtrConstant(0, dl));
26007 if (Op.getOpcode() == X86ISD::FSETCCM ||
26008 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26009 Op.getOpcode() == X86ISD::VFPCLASSS)
26010 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26011
26012 if (PreservedSrc.isUndef())
26013 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26014 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26015}
26016
26017static int getSEHRegistrationNodeSize(const Function *Fn) {
26018 if (!Fn->hasPersonalityFn())
26019 report_fatal_error(
26020 "querying registration node size for function without personality");
26021 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26022 // WinEHStatePass for the full struct definition.
26023 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26024 case EHPersonality::MSVC_X86SEH: return 24;
26025 case EHPersonality::MSVC_CXX: return 16;
26026 default: break;
26027 }
26028 report_fatal_error(
26029 "can only recover FP for 32-bit MSVC EH personality functions");
26030}
26031
26032/// When the MSVC runtime transfers control to us, either to an outlined
26033/// function or when returning to a parent frame after catching an exception, we
26034/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26035/// Here's the math:
26036/// RegNodeBase = EntryEBP - RegNodeSize
26037/// ParentFP = RegNodeBase - ParentFrameOffset
26038/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26039/// subtracting the offset (negative on x86) takes us back to the parent FP.
26040static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26041 SDValue EntryEBP) {
26042 MachineFunction &MF = DAG.getMachineFunction();
26043 SDLoc dl;
26044
26045 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26046 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26047
26048 // It's possible that the parent function no longer has a personality function
26049 // if the exceptional code was optimized away, in which case we just return
26050 // the incoming EBP.
26051 if (!Fn->hasPersonalityFn())
26052 return EntryEBP;
26053
26054 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26055 // registration, or the .set_setframe offset.
26056 MCSymbol *OffsetSym =
26057 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
26058 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26059 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26060 SDValue ParentFrameOffset =
26061 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26062
26063 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26064 // prologue to RBP in the parent function.
26065 const X86Subtarget &Subtarget =
26066 static_cast<const X86Subtarget &>(DAG.getSubtarget());
26067 if (Subtarget.is64Bit())
26068 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26069
26070 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26071 // RegNodeBase = EntryEBP - RegNodeSize
26072 // ParentFP = RegNodeBase - ParentFrameOffset
26073 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26074 DAG.getConstant(RegNodeSize, dl, PtrVT));
26075 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26076}
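[Editorial sketch, not part of the analyzed source] The 32-bit recovery math from the comment above, restated on plain integers (helper name invented; RegNodeSize is 24 for MSVC SEH and 16 for MSVC C++ EH, per getSEHRegistrationNodeSize):

#include <cstdint>

// ParentFP = (EntryEBP - RegNodeSize) - ParentFrameOffset; the offset is
// typically negative on x86, so the second subtraction moves back up toward
// the parent's frame pointer.
uint32_t recoverParentFrameSketch(uint32_t EntryEBP, uint32_t RegNodeSize,
                                  int32_t ParentFrameOffset) {
  uint32_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - static_cast<uint32_t>(ParentFrameOffset);
}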
26077
26078SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26079 SelectionDAG &DAG) const {
26080 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26081 auto isRoundModeCurDirection = [](SDValue Rnd) {
26082 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26083 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26084
26085 return false;
26086 };
26087 auto isRoundModeSAE = [](SDValue Rnd) {
26088 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26089 unsigned RC = C->getZExtValue();
26090 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26091 // Clear the NO_EXC bit and check remaining bits.
26092 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26093 // As a convenience we allow no other bits or explicitly
26094 // current direction.
26095 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26096 }
26097 }
26098
26099 return false;
26100 };
26101 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26102 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26103 RC = C->getZExtValue();
26104 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26105 // Clear the NO_EXC bit and check remaining bits.
26106 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26107 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26108 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26109 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26110 RC == X86::STATIC_ROUNDING::TO_ZERO;
26111 }
26112 }
26113
26114 return false;
26115 };
26116
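// Editorial note, not part of the analyzed source: how the three helpers above
// classify a rounding-mode operand, assuming the usual Intel encodings
// (_MM_FROUND_CUR_DIRECTION = 0x04, _MM_FROUND_NO_EXC = 0x08,
//  _MM_FROUND_TO_ZERO = 0x03):
//   isRoundModeCurDirection(0x04) -> true  (use the dynamic MXCSR rounding)
//   isRoundModeSAE(0x08)          -> true  (suppress exceptions only)
//   isRoundModeSAEToX(0x0B, RC)   -> true, RC = 0x03 (round toward zero + SAE)
//   isRoundModeSAEToX(0x03, RC)   -> false (explicit rounding without NO_EXC)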
26117 SDLoc dl(Op);
26118 unsigned IntNo = Op.getConstantOperandVal(0);
26119 MVT VT = Op.getSimpleValueType();
26120 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26121
26122 // Propagate flags from original node to transformed node(s).
26123 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26124
26125 if (IntrData) {
26126 switch(IntrData->Type) {
26127 case INTR_TYPE_1OP: {
26128 // We specify 2 possible opcodes for intrinsics with rounding modes.
26129 // First, we check if the intrinsic may have non-default rounding mode,
26130 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26131 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26132 if (IntrWithRoundingModeOpcode != 0) {
26133 SDValue Rnd = Op.getOperand(2);
26134 unsigned RC = 0;
26135 if (isRoundModeSAEToX(Rnd, RC))
26136 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26137 Op.getOperand(1),
26138 DAG.getTargetConstant(RC, dl, MVT::i32));
26139 if (!isRoundModeCurDirection(Rnd))
26140 return SDValue();
26141 }
26142 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26143 Op.getOperand(1));
26144 }
26145 case INTR_TYPE_1OP_SAE: {
26146 SDValue Sae = Op.getOperand(2);
26147
26148 unsigned Opc;
26149 if (isRoundModeCurDirection(Sae))
26150 Opc = IntrData->Opc0;
26151 else if (isRoundModeSAE(Sae))
26152 Opc = IntrData->Opc1;
26153 else
26154 return SDValue();
26155
26156 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26157 }
26158 case INTR_TYPE_2OP: {
26159 SDValue Src2 = Op.getOperand(2);
26160
26161 // We specify 2 possible opcodes for intrinsics with rounding modes.
26162 // First, we check if the intrinsic may have non-default rounding mode,
26163 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26164 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26165 if (IntrWithRoundingModeOpcode != 0) {
26166 SDValue Rnd = Op.getOperand(3);
26167 unsigned RC = 0;
26168 if (isRoundModeSAEToX(Rnd, RC))
26169 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26170 Op.getOperand(1), Src2,
26171 DAG.getTargetConstant(RC, dl, MVT::i32));
26172 if (!isRoundModeCurDirection(Rnd))
26173 return SDValue();
26174 }
26175
26176 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26177 Op.getOperand(1), Src2);
26178 }
26179 case INTR_TYPE_2OP_SAE: {
26180 SDValue Sae = Op.getOperand(3);
26181
26182 unsigned Opc;
26183 if (isRoundModeCurDirection(Sae))
26184 Opc = IntrData->Opc0;
26185 else if (isRoundModeSAE(Sae))
26186 Opc = IntrData->Opc1;
26187 else
26188 return SDValue();
26189
26190 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26191 Op.getOperand(2));
26192 }
26193 case INTR_TYPE_3OP:
26194 case INTR_TYPE_3OP_IMM8: {
26195 SDValue Src1 = Op.getOperand(1);
26196 SDValue Src2 = Op.getOperand(2);
26197 SDValue Src3 = Op.getOperand(3);
26198
26199 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26200 Src3.getValueType() != MVT::i8) {
26201 Src3 = DAG.getTargetConstant(
26202 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
26203 }
26204
26205 // We specify 2 possible opcodes for intrinsics with rounding modes.
26206 // First, we check if the intrinsic may have non-default rounding mode,
26207 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26208 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26209 if (IntrWithRoundingModeOpcode != 0) {
26210 SDValue Rnd = Op.getOperand(4);
26211 unsigned RC = 0;
26212 if (isRoundModeSAEToX(Rnd, RC))
26213 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26214 Src1, Src2, Src3,
26215 DAG.getTargetConstant(RC, dl, MVT::i32));
26216 if (!isRoundModeCurDirection(Rnd))
26217 return SDValue();
26218 }
26219
26220 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26221 {Src1, Src2, Src3});
26222 }
26223 case INTR_TYPE_4OP_IMM8: {
26224 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26225 SDValue Src4 = Op.getOperand(4);
26226 if (Src4.getValueType() != MVT::i8) {
26227 Src4 = DAG.getTargetConstant(
26228 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
26229 }
26230
26231 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26232 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26233 Src4);
26234 }
26235 case INTR_TYPE_1OP_MASK: {
26236 SDValue Src = Op.getOperand(1);
26237 SDValue PassThru = Op.getOperand(2);
26238 SDValue Mask = Op.getOperand(3);
26239 // We add rounding mode to the Node when
26240 // - RC Opcode is specified and
26241 // - RC is not "current direction".
26242 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26243 if (IntrWithRoundingModeOpcode != 0) {
26244 SDValue Rnd = Op.getOperand(4);
26245 unsigned RC = 0;
26246 if (isRoundModeSAEToX(Rnd, RC))
26247 return getVectorMaskingNode(
26248 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26249 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26250 Mask, PassThru, Subtarget, DAG);
26251 if (!isRoundModeCurDirection(Rnd))
26252 return SDValue();
26253 }
26254 return getVectorMaskingNode(
26255 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26256 Subtarget, DAG);
26257 }
26258 case INTR_TYPE_1OP_MASK_SAE: {
26259 SDValue Src = Op.getOperand(1);
26260 SDValue PassThru = Op.getOperand(2);
26261 SDValue Mask = Op.getOperand(3);
26262 SDValue Rnd = Op.getOperand(4);
26263
26264 unsigned Opc;
26265 if (isRoundModeCurDirection(Rnd))
26266 Opc = IntrData->Opc0;
26267 else if (isRoundModeSAE(Rnd))
26268 Opc = IntrData->Opc1;
26269 else
26270 return SDValue();
26271
26272 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26273 Subtarget, DAG);
26274 }
26275 case INTR_TYPE_SCALAR_MASK: {
26276 SDValue Src1 = Op.getOperand(1);
26277 SDValue Src2 = Op.getOperand(2);
26278 SDValue passThru = Op.getOperand(3);
26279 SDValue Mask = Op.getOperand(4);
26280 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26281 // There are 2 kinds of intrinsics in this group:
26282 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26283 // (2) With rounding mode and sae - 7 operands.
26284 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26285 if (Op.getNumOperands() == (5U + HasRounding)) {
26286 if (HasRounding) {
26287 SDValue Rnd = Op.getOperand(5);
26288 unsigned RC = 0;
26289 if (isRoundModeSAEToX(Rnd, RC))
26290 return getScalarMaskingNode(
26291 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26292 DAG.getTargetConstant(RC, dl, MVT::i32)),
26293 Mask, passThru, Subtarget, DAG);
26294 if (!isRoundModeCurDirection(Rnd))
26295 return SDValue();
26296 }
26297 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26298 Src2),
26299 Mask, passThru, Subtarget, DAG);
26300 }
26301
26302 assert(Op.getNumOperands() == (6U + HasRounding) &&
26303 "Unexpected intrinsic form");
26304 SDValue RoundingMode = Op.getOperand(5);
26305 unsigned Opc = IntrData->Opc0;
26306 if (HasRounding) {
26307 SDValue Sae = Op.getOperand(6);
26308 if (isRoundModeSAE(Sae))
26309 Opc = IntrWithRoundingModeOpcode;
26310 else if (!isRoundModeCurDirection(Sae))
26311 return SDValue();
26312 }
26313 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26314 Src2, RoundingMode),
26315 Mask, passThru, Subtarget, DAG);
26316 }
26317 case INTR_TYPE_SCALAR_MASK_RND: {
26318 SDValue Src1 = Op.getOperand(1);
26319 SDValue Src2 = Op.getOperand(2);
26320 SDValue passThru = Op.getOperand(3);
26321 SDValue Mask = Op.getOperand(4);
26322 SDValue Rnd = Op.getOperand(5);
26323
26324 SDValue NewOp;
26325 unsigned RC = 0;
26326 if (isRoundModeCurDirection(Rnd))
26327 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26328 else if (isRoundModeSAEToX(Rnd, RC))
26329 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26330 DAG.getTargetConstant(RC, dl, MVT::i32));
26331 else
26332 return SDValue();
26333
26334 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26335 }
26336 case INTR_TYPE_SCALAR_MASK_SAE: {
26337 SDValue Src1 = Op.getOperand(1);
26338 SDValue Src2 = Op.getOperand(2);
26339 SDValue passThru = Op.getOperand(3);
26340 SDValue Mask = Op.getOperand(4);
26341 SDValue Sae = Op.getOperand(5);
26342 unsigned Opc;
26343 if (isRoundModeCurDirection(Sae))
26344 Opc = IntrData->Opc0;
26345 else if (isRoundModeSAE(Sae))
26346 Opc = IntrData->Opc1;
26347 else
26348 return SDValue();
26349
26350 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26351 Mask, passThru, Subtarget, DAG);
26352 }
26353 case INTR_TYPE_2OP_MASK: {
26354 SDValue Src1 = Op.getOperand(1);
26355 SDValue Src2 = Op.getOperand(2);
26356 SDValue PassThru = Op.getOperand(3);
26357 SDValue Mask = Op.getOperand(4);
26358 SDValue NewOp;
26359 if (IntrData->Opc1 != 0) {
26360 SDValue Rnd = Op.getOperand(5);
26361 unsigned RC = 0;
26362 if (isRoundModeSAEToX(Rnd, RC))
26363 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26364 DAG.getTargetConstant(RC, dl, MVT::i32));
26365 else if (!isRoundModeCurDirection(Rnd))
26366 return SDValue();
26367 }
26368 if (!NewOp)
26369 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26370 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26371 }
26372 case INTR_TYPE_2OP_MASK_SAE: {
26373 SDValue Src1 = Op.getOperand(1);
26374 SDValue Src2 = Op.getOperand(2);
26375 SDValue PassThru = Op.getOperand(3);
26376 SDValue Mask = Op.getOperand(4);
26377
26378 unsigned Opc = IntrData->Opc0;
26379 if (IntrData->Opc1 != 0) {
26380 SDValue Sae = Op.getOperand(5);
26381 if (isRoundModeSAE(Sae))
26382 Opc = IntrData->Opc1;
26383 else if (!isRoundModeCurDirection(Sae))
26384 return SDValue();
26385 }
26386
26387 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26388 Mask, PassThru, Subtarget, DAG);
26389 }
26390 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26391 SDValue Src1 = Op.getOperand(1);
26392 SDValue Src2 = Op.getOperand(2);
26393 SDValue Src3 = Op.getOperand(3);
26394 SDValue PassThru = Op.getOperand(4);
26395 SDValue Mask = Op.getOperand(5);
26396 SDValue Sae = Op.getOperand(6);
26397 unsigned Opc;
26398 if (isRoundModeCurDirection(Sae))
26399 Opc = IntrData->Opc0;
26400 else if (isRoundModeSAE(Sae))
26401 Opc = IntrData->Opc1;
26402 else
26403 return SDValue();
26404
26405 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26406 Mask, PassThru, Subtarget, DAG);
26407 }
26408 case INTR_TYPE_3OP_MASK_SAE: {
26409 SDValue Src1 = Op.getOperand(1);
26410 SDValue Src2 = Op.getOperand(2);
26411 SDValue Src3 = Op.getOperand(3);
26412 SDValue PassThru = Op.getOperand(4);
26413 SDValue Mask = Op.getOperand(5);
26414
26415 unsigned Opc = IntrData->Opc0;
26416 if (IntrData->Opc1 != 0) {
26417 SDValue Sae = Op.getOperand(6);
26418 if (isRoundModeSAE(Sae))
26419 Opc = IntrData->Opc1;
26420 else if (!isRoundModeCurDirection(Sae))
26421 return SDValue();
26422 }
26423 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26424 Mask, PassThru, Subtarget, DAG);
26425 }
26426 case BLENDV: {
26427 SDValue Src1 = Op.getOperand(1);
26428 SDValue Src2 = Op.getOperand(2);
26429 SDValue Src3 = Op.getOperand(3);
26430
26431 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26432 Src3 = DAG.getBitcast(MaskVT, Src3);
26433
26434 // Reverse the operands to match VSELECT order.
26435 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26436 }
26437 case VPERM_2OP : {
26438 SDValue Src1 = Op.getOperand(1);
26439 SDValue Src2 = Op.getOperand(2);
26440
26441 // Swap Src1 and Src2 in the node creation
26442 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
26443 }
26444 case CFMA_OP_MASKZ:
26445 case CFMA_OP_MASK: {
26446 SDValue Src1 = Op.getOperand(1);
26447 SDValue Src2 = Op.getOperand(2);
26448 SDValue Src3 = Op.getOperand(3);
26449 SDValue Mask = Op.getOperand(4);
26450 MVT VT = Op.getSimpleValueType();
26451
26452 SDValue PassThru = Src3;
26453 if (IntrData->Type == CFMA_OP_MASKZ)
26454 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26455
26456 // We add rounding mode to the Node when
26457 // - RC Opcode is specified and
26458 // - RC is not "current direction".
26459 SDValue NewOp;
26460 if (IntrData->Opc1 != 0) {
26461 SDValue Rnd = Op.getOperand(5);
26462 unsigned RC = 0;
26463 if (isRoundModeSAEToX(Rnd, RC))
26464 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26465 DAG.getTargetConstant(RC, dl, MVT::i32));
26466 else if (!isRoundModeCurDirection(Rnd))
26467 return SDValue();
26468 }
26469 if (!NewOp)
26470 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26471 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26472 }
26473 case IFMA_OP:
26474 // NOTE: We need to swizzle the operands to pass the multiply operands
26475 // first.
26476 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26477 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26478 case FPCLASSS: {
26479 SDValue Src1 = Op.getOperand(1);
26480 SDValue Imm = Op.getOperand(2);
26481 SDValue Mask = Op.getOperand(3);
26482 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26483 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26484 Subtarget, DAG);
26485 // Need to fill with zeros to ensure the bitcast will produce zeroes
26486 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26487 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26488 DAG.getConstant(0, dl, MVT::v8i1),
26489 FPclassMask, DAG.getIntPtrConstant(0, dl));
26490 return DAG.getBitcast(MVT::i8, Ins);
26491 }
26492
26493 case CMP_MASK_CC: {
26494 MVT MaskVT = Op.getSimpleValueType();
26495 SDValue CC = Op.getOperand(3);
26496 SDValue Mask = Op.getOperand(4);
26497 // We specify 2 possible opcodes for intrinsics with rounding modes.
26498 // First, we check if the intrinsic may have non-default rounding mode,
26499 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26500 if (IntrData->Opc1 != 0) {
26501 SDValue Sae = Op.getOperand(5);
26502 if (isRoundModeSAE(Sae))
26503 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26504 Op.getOperand(2), CC, Mask, Sae);
26505 if (!isRoundModeCurDirection(Sae))
26506 return SDValue();
26507 }
26508 // Default rounding mode.
26509 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26510 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26511 }
26512 case CMP_MASK_SCALAR_CC: {
26513 SDValue Src1 = Op.getOperand(1);
26514 SDValue Src2 = Op.getOperand(2);
26515 SDValue CC = Op.getOperand(3);
26516 SDValue Mask = Op.getOperand(4);
26517
26518 SDValue Cmp;
26519 if (IntrData->Opc1 != 0) {
26520 SDValue Sae = Op.getOperand(5);
26521 if (isRoundModeSAE(Sae))
26522 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26523 else if (!isRoundModeCurDirection(Sae))
26524 return SDValue();
26525 }
26526 // Default rounding mode.
26527 if (!Cmp.getNode())
26528 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26529
26530 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26531 Subtarget, DAG);
26532 // Need to fill with zeros to ensure the bitcast will produce zeroes
26533 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26534 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26535 DAG.getConstant(0, dl, MVT::v8i1),
26536 CmpMask, DAG.getIntPtrConstant(0, dl));
26537 return DAG.getBitcast(MVT::i8, Ins);
26538 }
26539 case COMI: { // Comparison intrinsics
26540 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26541 SDValue LHS = Op.getOperand(1);
26542 SDValue RHS = Op.getOperand(2);
26543 // Some conditions require the operands to be swapped.
26544 if (CC == ISD::SETLT || CC == ISD::SETLE)
26545 std::swap(LHS, RHS);
26546
26547 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
26548 SDValue SetCC;
26549 switch (CC) {
26550 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
26551 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26552 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26553 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26554 break;
26555 }
26556 case ISD::SETNE: { // (ZF = 1 or PF = 1)
26557 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26558 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26559 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26560 break;
26561 }
26562 case ISD::SETGT: // (CF = 0 and ZF = 0)
26563 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26564 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26565 break;
26566 }
26567 case ISD::SETGE: // CF = 0
26568 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26569 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26570 break;
26571 default:
26572 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26572)
;
26573 }
26574 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26575 }
26576 case COMI_RM: { // Comparison intrinsics with Sae
26577 SDValue LHS = Op.getOperand(1);
26578 SDValue RHS = Op.getOperand(2);
26579 unsigned CondVal = Op.getConstantOperandVal(3);
26580 SDValue Sae = Op.getOperand(4);
26581
26582 SDValue FCmp;
26583 if (isRoundModeCurDirection(Sae))
26584 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26585 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26586 else if (isRoundModeSAE(Sae))
26587 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26588 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26589 else
26590 return SDValue();
26591 // Need to fill with zeros to ensure the bitcast will produce zeroes
26592 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26593 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26594 DAG.getConstant(0, dl, MVT::v16i1),
26595 FCmp, DAG.getIntPtrConstant(0, dl));
26596 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26597 DAG.getBitcast(MVT::i16, Ins));
26598 }
26599 case VSHIFT: {
26600 SDValue SrcOp = Op.getOperand(1);
26601 SDValue ShAmt = Op.getOperand(2);
26602 assert(ShAmt.getValueType() == MVT::i32 &&
26603 "Unexpected VSHIFT amount type");
26604
26605 // Catch shift-by-constant.
26606 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26607 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26608 Op.getSimpleValueType(), SrcOp,
26609 CShAmt->getZExtValue(), DAG);
26610
26611 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26612 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26613 SrcOp, ShAmt, 0, Subtarget, DAG);
26614 }
26615 case COMPRESS_EXPAND_IN_REG: {
26616 SDValue Mask = Op.getOperand(3);
26617 SDValue DataToCompress = Op.getOperand(1);
26618 SDValue PassThru = Op.getOperand(2);
26619 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26620 return Op.getOperand(1);
26621
26622 // Avoid false dependency.
26623 if (PassThru.isUndef())
26624 PassThru = DAG.getConstant(0, dl, VT);
26625
26626 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26627 Mask);
26628 }
26629 case FIXUPIMM:
26630 case FIXUPIMM_MASKZ: {
26631 SDValue Src1 = Op.getOperand(1);
26632 SDValue Src2 = Op.getOperand(2);
26633 SDValue Src3 = Op.getOperand(3);
26634 SDValue Imm = Op.getOperand(4);
26635 SDValue Mask = Op.getOperand(5);
26636 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26637 ? Src1
26638 : getZeroVector(VT, Subtarget, DAG, dl);
26639
26640 unsigned Opc = IntrData->Opc0;
26641 if (IntrData->Opc1 != 0) {
26642 SDValue Sae = Op.getOperand(6);
26643 if (isRoundModeSAE(Sae))
26644 Opc = IntrData->Opc1;
26645 else if (!isRoundModeCurDirection(Sae))
26646 return SDValue();
26647 }
26648
26649 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26650
26651 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26652 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26653
26654 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26655 }
26656 case ROUNDP: {
26657 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26658 // Clear the upper bits of the rounding immediate so that the legacy
26659 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26660 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
26661 SDValue RoundingMode =
26662 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26663 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26664 Op.getOperand(1), RoundingMode);
26665 }
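// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp):
// VRNDSCALE's imm8 keeps the rounding control in bits [3:0] and a scale field
// in bits [7:4]; the legacy ROUNDP*/ROUNDS* immediates only define the low
// four bits, so masking with 0xf keeps the scale field at zero, as done above.
// The helper and constant names below are hypothetical, for this sketch only.
#include <cstdint>
constexpr uint8_t kRoundCtlMask = 0xf;              // imm8[3:0]: rounding control
constexpr uint8_t toVRndScaleImm(uint8_t LegacyImm) {
  return LegacyImm & kRoundCtlMask;                 // imm8[7:4] (scale) forced to 0
}
static_assert(toVRndScaleImm(0x3A) == 0x0A, "upper bits cleared");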
26666 case ROUNDS: {
26667 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26668 // Clear the upper bits of the rounding immediate so that the legacy
26669 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26670 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
26671 SDValue RoundingMode =
26672 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26673 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26674 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26675 }
26676 case BEXTRI: {
26677 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26678
26679 uint64_t Imm = Op.getConstantOperandVal(2);
26680 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26681 Op.getValueType());
26682 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26683 Op.getOperand(1), Control);
26684 }
26685 // ADC/ADCX/SBB
26686 case ADX: {
26687 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26688 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26689
26690 SDValue Res;
26691 // If the carry in is zero, then we should just use ADD/SUB instead of
26692 // ADC/SBB.
26693 if (isNullConstant(Op.getOperand(1))) {
26694 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26695 Op.getOperand(3));
26696 } else {
26697 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26698 DAG.getConstant(-1, dl, MVT::i8));
26699 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26700 Op.getOperand(3), GenCF.getValue(1));
26701 }
26702 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26703 SDValue Results[] = { SetCC, Res };
26704 return DAG.getMergeValues(Results, dl);
26705 }
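// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): the
// source-level pattern this ADX lowering handles. A carry-in known to be zero
// takes the plain ADD/SUB path (IntrData->Opc1); otherwise CF is materialized
// by adding -1 to the non-zero carry-in and ADC/SBB (IntrData->Opc0) is used.
// Assumes only <immintrin.h>.
#include <immintrin.h>
static unsigned char addWithCarry(unsigned char CarryIn, unsigned A, unsigned B,
                                  unsigned *Out) {
  // CarryIn == 0 at compile time lowers to ADD; otherwise to ADC.
  return _addcarry_u32(CarryIn, A, B, Out); // returns the carry-out (CF)
}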
26706 case CVTPD2PS_MASK:
26707 case CVTPD2DQ_MASK:
26708 case CVTQQ2PS_MASK:
26709 case TRUNCATE_TO_REG: {
26710 SDValue Src = Op.getOperand(1);
26711 SDValue PassThru = Op.getOperand(2);
26712 SDValue Mask = Op.getOperand(3);
26713
26714 if (isAllOnesConstant(Mask))
26715 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26716
26717 MVT SrcVT = Src.getSimpleValueType();
26718 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26719 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26720 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26721 {Src, PassThru, Mask});
26722 }
26723 case CVTPS2PH_MASK: {
26724 SDValue Src = Op.getOperand(1);
26725 SDValue Rnd = Op.getOperand(2);
26726 SDValue PassThru = Op.getOperand(3);
26727 SDValue Mask = Op.getOperand(4);
26728
26729 if (isAllOnesConstant(Mask))
26730 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
26731
26732 MVT SrcVT = Src.getSimpleValueType();
26733 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26734 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26735 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
26736 PassThru, Mask);
26737
26738 }
26739 case CVTNEPS2BF16_MASK: {
26740 SDValue Src = Op.getOperand(1);
26741 SDValue PassThru = Op.getOperand(2);
26742 SDValue Mask = Op.getOperand(3);
26743
26744 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26745 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26746
26747 // Break false dependency.
26748 if (PassThru.isUndef())
26749 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26750
26751 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26752 Mask);
26753 }
26754 default:
26755 break;
26756 }
26757 }
26758
26759 switch (IntNo) {
26760 default: return SDValue(); // Don't custom lower most intrinsics.
26761
26762 // ptest and testp intrinsics. The intrinsics these come from are designed to
26763 // return an integer value, not just an instruction, so lower it to the ptest
26764 // or testp pattern and a setcc for the result.
26765 case Intrinsic::x86_avx512_ktestc_b:
26766 case Intrinsic::x86_avx512_ktestc_w:
26767 case Intrinsic::x86_avx512_ktestc_d:
26768 case Intrinsic::x86_avx512_ktestc_q:
26769 case Intrinsic::x86_avx512_ktestz_b:
26770 case Intrinsic::x86_avx512_ktestz_w:
26771 case Intrinsic::x86_avx512_ktestz_d:
26772 case Intrinsic::x86_avx512_ktestz_q:
26773 case Intrinsic::x86_sse41_ptestz:
26774 case Intrinsic::x86_sse41_ptestc:
26775 case Intrinsic::x86_sse41_ptestnzc:
26776 case Intrinsic::x86_avx_ptestz_256:
26777 case Intrinsic::x86_avx_ptestc_256:
26778 case Intrinsic::x86_avx_ptestnzc_256:
26779 case Intrinsic::x86_avx_vtestz_ps:
26780 case Intrinsic::x86_avx_vtestc_ps:
26781 case Intrinsic::x86_avx_vtestnzc_ps:
26782 case Intrinsic::x86_avx_vtestz_pd:
26783 case Intrinsic::x86_avx_vtestc_pd:
26784 case Intrinsic::x86_avx_vtestnzc_pd:
26785 case Intrinsic::x86_avx_vtestz_ps_256:
26786 case Intrinsic::x86_avx_vtestc_ps_256:
26787 case Intrinsic::x86_avx_vtestnzc_ps_256:
26788 case Intrinsic::x86_avx_vtestz_pd_256:
26789 case Intrinsic::x86_avx_vtestc_pd_256:
26790 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26791 unsigned TestOpc = X86ISD::PTEST;
26792 X86::CondCode X86CC;
26793 switch (IntNo) {
26794 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26795 case Intrinsic::x86_avx512_ktestc_b:
26796 case Intrinsic::x86_avx512_ktestc_w:
26797 case Intrinsic::x86_avx512_ktestc_d:
26798 case Intrinsic::x86_avx512_ktestc_q:
26799 // CF = 1
26800 TestOpc = X86ISD::KTEST;
26801 X86CC = X86::COND_B;
26802 break;
26803 case Intrinsic::x86_avx512_ktestz_b:
26804 case Intrinsic::x86_avx512_ktestz_w:
26805 case Intrinsic::x86_avx512_ktestz_d:
26806 case Intrinsic::x86_avx512_ktestz_q:
26807 TestOpc = X86ISD::KTEST;
26808 X86CC = X86::COND_E;
26809 break;
26810 case Intrinsic::x86_avx_vtestz_ps:
26811 case Intrinsic::x86_avx_vtestz_pd:
26812 case Intrinsic::x86_avx_vtestz_ps_256:
26813 case Intrinsic::x86_avx_vtestz_pd_256:
26814 TestOpc = X86ISD::TESTP;
26815 LLVM_FALLTHROUGH;
26816 case Intrinsic::x86_sse41_ptestz:
26817 case Intrinsic::x86_avx_ptestz_256:
26818 // ZF = 1
26819 X86CC = X86::COND_E;
26820 break;
26821 case Intrinsic::x86_avx_vtestc_ps:
26822 case Intrinsic::x86_avx_vtestc_pd:
26823 case Intrinsic::x86_avx_vtestc_ps_256:
26824 case Intrinsic::x86_avx_vtestc_pd_256:
26825 TestOpc = X86ISD::TESTP;
26826 LLVM_FALLTHROUGH;
26827 case Intrinsic::x86_sse41_ptestc:
26828 case Intrinsic::x86_avx_ptestc_256:
26829 // CF = 1
26830 X86CC = X86::COND_B;
26831 break;
26832 case Intrinsic::x86_avx_vtestnzc_ps:
26833 case Intrinsic::x86_avx_vtestnzc_pd:
26834 case Intrinsic::x86_avx_vtestnzc_ps_256:
26835 case Intrinsic::x86_avx_vtestnzc_pd_256:
26836 TestOpc = X86ISD::TESTP;
26837 LLVM_FALLTHROUGH;
26838 case Intrinsic::x86_sse41_ptestnzc:
26839 case Intrinsic::x86_avx_ptestnzc_256:
26840 // ZF and CF = 0
26841 X86CC = X86::COND_A;
26842 break;
26843 }
26844
26845 SDValue LHS = Op.getOperand(1);
26846 SDValue RHS = Op.getOperand(2);
26847 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26848 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26849 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26850 }
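// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): what
// the ptest/testp lowering above produces at the source level. The intrinsic
// returns an int, but PTEST only sets EFLAGS, so the DAG appends a SETcc
// (COND_E for the testz flavor) plus a zero-extension to i32. Assumes
// <immintrin.h> and SSE4.1.
#include <immintrin.h>
static int isZeroUnderMask(__m128i V, __m128i M) {
  return _mm_testz_si128(V, M); // PTEST ; SETE ; MOVZX, per the lowering above
}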
26851
26852 case Intrinsic::x86_sse42_pcmpistria128:
26853 case Intrinsic::x86_sse42_pcmpestria128:
26854 case Intrinsic::x86_sse42_pcmpistric128:
26855 case Intrinsic::x86_sse42_pcmpestric128:
26856 case Intrinsic::x86_sse42_pcmpistrio128:
26857 case Intrinsic::x86_sse42_pcmpestrio128:
26858 case Intrinsic::x86_sse42_pcmpistris128:
26859 case Intrinsic::x86_sse42_pcmpestris128:
26860 case Intrinsic::x86_sse42_pcmpistriz128:
26861 case Intrinsic::x86_sse42_pcmpestriz128: {
26862 unsigned Opcode;
26863 X86::CondCode X86CC;
26864 switch (IntNo) {
26865 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26866 case Intrinsic::x86_sse42_pcmpistria128:
26867 Opcode = X86ISD::PCMPISTR;
26868 X86CC = X86::COND_A;
26869 break;
26870 case Intrinsic::x86_sse42_pcmpestria128:
26871 Opcode = X86ISD::PCMPESTR;
26872 X86CC = X86::COND_A;
26873 break;
26874 case Intrinsic::x86_sse42_pcmpistric128:
26875 Opcode = X86ISD::PCMPISTR;
26876 X86CC = X86::COND_B;
26877 break;
26878 case Intrinsic::x86_sse42_pcmpestric128:
26879 Opcode = X86ISD::PCMPESTR;
26880 X86CC = X86::COND_B;
26881 break;
26882 case Intrinsic::x86_sse42_pcmpistrio128:
26883 Opcode = X86ISD::PCMPISTR;
26884 X86CC = X86::COND_O;
26885 break;
26886 case Intrinsic::x86_sse42_pcmpestrio128:
26887 Opcode = X86ISD::PCMPESTR;
26888 X86CC = X86::COND_O;
26889 break;
26890 case Intrinsic::x86_sse42_pcmpistris128:
26891 Opcode = X86ISD::PCMPISTR;
26892 X86CC = X86::COND_S;
26893 break;
26894 case Intrinsic::x86_sse42_pcmpestris128:
26895 Opcode = X86ISD::PCMPESTR;
26896 X86CC = X86::COND_S;
26897 break;
26898 case Intrinsic::x86_sse42_pcmpistriz128:
26899 Opcode = X86ISD::PCMPISTR;
26900 X86CC = X86::COND_E;
26901 break;
26902 case Intrinsic::x86_sse42_pcmpestriz128:
26903 Opcode = X86ISD::PCMPESTR;
26904 X86CC = X86::COND_E;
26905 break;
26906 }
26907 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26908 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26909 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26910 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26911 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26912 }
26913
26914 case Intrinsic::x86_sse42_pcmpistri128:
26915 case Intrinsic::x86_sse42_pcmpestri128: {
26916 unsigned Opcode;
26917 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26918 Opcode = X86ISD::PCMPISTR;
26919 else
26920 Opcode = X86ISD::PCMPESTR;
26921
26922 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26923 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26924 return DAG.getNode(Opcode, dl, VTs, NewOps);
26925 }
26926
26927 case Intrinsic::x86_sse42_pcmpistrm128:
26928 case Intrinsic::x86_sse42_pcmpestrm128: {
26929 unsigned Opcode;
26930 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26931 Opcode = X86ISD::PCMPISTR;
26932 else
26933 Opcode = X86ISD::PCMPESTR;
26934
26935 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26936 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26937 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26938 }
26939
26940 case Intrinsic::eh_sjlj_lsda: {
26941 MachineFunction &MF = DAG.getMachineFunction();
26942 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26943 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26944 auto &Context = MF.getMMI().getContext();
26945 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26946 Twine(MF.getFunctionNumber()));
26947 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26948 DAG.getMCSymbol(S, PtrVT));
26949 }
26950
26951 case Intrinsic::x86_seh_lsda: {
26952 // Compute the symbol for the LSDA. We know it'll get emitted later.
26953 MachineFunction &MF = DAG.getMachineFunction();
26954 SDValue Op1 = Op.getOperand(1);
26955 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26956 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26957 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26958
26959 // Generate a simple absolute symbol reference. This intrinsic is only
26960 // supported on 32-bit Windows, which isn't PIC.
26961 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26962 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26963 }
26964
26965 case Intrinsic::eh_recoverfp: {
26966 SDValue FnOp = Op.getOperand(1);
26967 SDValue IncomingFPOp = Op.getOperand(2);
26968 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26969 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26970 if (!Fn)
26971 report_fatal_error(
26972 "llvm.eh.recoverfp must take a function as the first argument");
26973 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26974 }
26975
26976 case Intrinsic::localaddress: {
26977 // Returns one of the stack, base, or frame pointer registers, depending on
26978 // which is used to reference local variables.
26979 MachineFunction &MF = DAG.getMachineFunction();
26980 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26981 unsigned Reg;
26982 if (RegInfo->hasBasePointer(MF))
26983 Reg = RegInfo->getBaseRegister();
26984 else { // Handles the SP or FP case.
26985 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26986 if (CantUseFP)
26987 Reg = RegInfo->getPtrSizedStackRegister(MF);
26988 else
26989 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26990 }
26991 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26992 }
26993 case Intrinsic::swift_async_context_addr: {
26994 auto &MF = DAG.getMachineFunction();
26995 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26996 if (Subtarget.is64Bit()) {
26997 MF.getFrameInfo().setFrameAddressIsTaken(true);
26998 X86FI->setHasSwiftAsyncContext(true);
26999 return SDValue(
27000 DAG.getMachineNode(
27001 X86::SUB64ri8, dl, MVT::i64,
27002 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
27003 DAG.getTargetConstant(8, dl, MVT::i32)),
27004 0);
27005 } else {
27006 // 32-bit, so there is no special extended frame; create or reuse an existing
27007 // stack slot.
27008 if (!X86FI->getSwiftAsyncContextFrameIdx())
27009 X86FI->setSwiftAsyncContextFrameIdx(
27010 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
27011 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
27012 }
27013 }
27014 case Intrinsic::x86_avx512_vp2intersect_q_512:
27015 case Intrinsic::x86_avx512_vp2intersect_q_256:
27016 case Intrinsic::x86_avx512_vp2intersect_q_128:
27017 case Intrinsic::x86_avx512_vp2intersect_d_512:
27018 case Intrinsic::x86_avx512_vp2intersect_d_256:
27019 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27020 MVT MaskVT = Op.getSimpleValueType();
27021
27022 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27023 SDLoc DL(Op);
27024
27025 SDValue Operation =
27026 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27027 Op->getOperand(1), Op->getOperand(2));
27028
27029 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
27030 MaskVT, Operation);
27031 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
27032 MaskVT, Operation);
27033 return DAG.getMergeValues({Result0, Result1}, DL);
27034 }
27035 case Intrinsic::x86_mmx_pslli_w:
27036 case Intrinsic::x86_mmx_pslli_d:
27037 case Intrinsic::x86_mmx_pslli_q:
27038 case Intrinsic::x86_mmx_psrli_w:
27039 case Intrinsic::x86_mmx_psrli_d:
27040 case Intrinsic::x86_mmx_psrli_q:
27041 case Intrinsic::x86_mmx_psrai_w:
27042 case Intrinsic::x86_mmx_psrai_d: {
27043 SDLoc DL(Op);
27044 SDValue ShAmt = Op.getOperand(2);
27045 // If the argument is a constant, convert it to a target constant.
27046 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27047 // Clamp out-of-bounds shift amounts since they will otherwise be masked
27048 // to 8 bits, which may make them no longer out of bounds.
27049 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27050 if (ShiftAmount == 0)
27051 return Op.getOperand(1);
27052
27053 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27054 Op.getOperand(0), Op.getOperand(1),
27055 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27056 }
27057
27058 unsigned NewIntrinsic;
27059 switch (IntNo) {
27060 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27061 case Intrinsic::x86_mmx_pslli_w:
27062 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27063 break;
27064 case Intrinsic::x86_mmx_pslli_d:
27065 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27066 break;
27067 case Intrinsic::x86_mmx_pslli_q:
27068 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27069 break;
27070 case Intrinsic::x86_mmx_psrli_w:
27071 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27072 break;
27073 case Intrinsic::x86_mmx_psrli_d:
27074 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27075 break;
27076 case Intrinsic::x86_mmx_psrli_q:
27077 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27078 break;
27079 case Intrinsic::x86_mmx_psrai_w:
27080 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27081 break;
27082 case Intrinsic::x86_mmx_psrai_d:
27083 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27084 break;
27085 }
27086
27087 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
27088 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27089 // MMX register.
27090 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27091 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27092 DAG.getTargetConstant(NewIntrinsic, DL,
27093 getPointerTy(DAG.getDataLayout())),
27094 Op.getOperand(1), ShAmt);
27095 }
27096 case Intrinsic::thread_pointer: {
27097 if (Subtarget.isTargetELF()) {
27098 SDLoc dl(Op);
27099 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27100 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27101 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
27102 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27103 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27104 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27105 }
27106 report_fatal_error(
27107 "Target OS doesn't support __builtin_thread_pointer() yet.");
27108 }
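// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): the
// builtin this case lowers on ELF targets. The thread pointer lives at %fs:0
// (64-bit) or %gs:0 (32-bit), so the code above emits a zero-offset load from
// the matching segment address space. Assumes the GCC/Clang builtin.
static void *currentThreadPointer() {
  return __builtin_thread_pointer(); // becomes the %fs:0 / %gs:0 load above
}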
27109 }
27110}
27111
27112static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27113 SDValue Src, SDValue Mask, SDValue Base,
27114 SDValue Index, SDValue ScaleOp, SDValue Chain,
27115 const X86Subtarget &Subtarget) {
27116 SDLoc dl(Op);
27117 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27118 // Scale must be constant.
27119 if (!C)
27120 return SDValue();
27121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27122 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27123 TLI.getPointerTy(DAG.getDataLayout()));
27124 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27125 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27126 // If source is undef or we know it won't be used, use a zero vector
27127 // to break register dependency.
27128 // TODO: use undef instead and let BreakFalseDeps deal with it?
27129 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27130 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27131
27132 // Cast mask to an integer type.
27133 Mask = DAG.getBitcast(MaskVT, Mask);
27134
27135 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27136
27137 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27138 SDValue Res =
27139 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27140 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27141 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27142}
27143
27144static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27145 SDValue Src, SDValue Mask, SDValue Base,
27146 SDValue Index, SDValue ScaleOp, SDValue Chain,
27147 const X86Subtarget &Subtarget) {
27148 MVT VT = Op.getSimpleValueType();
27149 SDLoc dl(Op);
27150 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27151 // Scale must be constant.
27152 if (!C)
27153 return SDValue();
27154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27155 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27156 TLI.getPointerTy(DAG.getDataLayout()));
27157 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27158 VT.getVectorNumElements());
27159 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27160
27161 // We support two versions of the gather intrinsics. One with scalar mask and
27162 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27163 if (Mask.getValueType() != MaskVT)
27164 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27165
27166 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27167 // If source is undef or we know it won't be used, use a zero vector
27168 // to break register dependency.
27169 // TODO: use undef instead and let BreakFalseDeps deal with it?
27170 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27171 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27172
27173 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27174
27175 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27176 SDValue Res =
27177 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27178 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27179 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27180}
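// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): a
// source-level AVX-512 gather of the kind that reaches getGatherNode. The
// masked form carries a pass-through vector and a __mmask16; a scalar mask is
// converted to a vXi1 node by getMaskNode before X86ISD::MGATHER is built.
// Assumes <immintrin.h> and AVX-512F; the scale must be a compile-time
// constant, matching the ConstantSDNode requirement on ScaleOp above.
#include <immintrin.h>
static __m512 gatherFloats(__m512 Src, __mmask16 K, __m512i Idx,
                           const float *Base) {
  return _mm512_mask_i32gather_ps(Src, K, Idx, Base, /*Scale=*/4);
}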
27181
27182static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27183 SDValue Src, SDValue Mask, SDValue Base,
27184 SDValue Index, SDValue ScaleOp, SDValue Chain,
27185 const X86Subtarget &Subtarget) {
27186 SDLoc dl(Op);
27187 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27188 // Scale must be constant.
27189 if (!C)
27190 return SDValue();
27191 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27192 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27193 TLI.getPointerTy(DAG.getDataLayout()));
27194 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27195 Src.getSimpleValueType().getVectorNumElements());
27196 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27197
27198 // We support two versions of the scatter intrinsics. One with scalar mask and
27199 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27200 if (Mask.getValueType() != MaskVT)
27201 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27202
27203 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27204
27205 SDVTList VTs = DAG.getVTList(MVT::Other);
27206 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27207 SDValue Res =
27208 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27209 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27210 return Res;
27211}
27212
27213static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27214 SDValue Mask, SDValue Base, SDValue Index,
27215 SDValue ScaleOp, SDValue Chain,
27216 const X86Subtarget &Subtarget) {
27217 SDLoc dl(Op);
27218 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27219 // Scale must be constant.
27220 if (!C)
27221 return SDValue();
27222 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27223 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27224 TLI.getPointerTy(DAG.getDataLayout()));
27225 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27226 SDValue Segment = DAG.getRegister(0, MVT::i32);
27227 MVT MaskVT =
27228 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27229 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27230 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27231 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27232 return SDValue(Res, 0);
27233}
27234
27235/// Handles the lowering of builtin intrinsics with chain that return their
27236/// value into registers EDX:EAX.
27237/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27238/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27239/// TargetOpcode.
27240/// Returns a Glue value which can be used to add extra copy-from-reg if the
27241/// expanded intrinsics implicitly defines extra registers (i.e. not just
27242/// EDX:EAX).
27243static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27244 SelectionDAG &DAG,
27245 unsigned TargetOpcode,
27246 unsigned SrcReg,
27247 const X86Subtarget &Subtarget,
27248 SmallVectorImpl<SDValue> &Results) {
27249 SDValue Chain = N->getOperand(0);
27250 SDValue Glue;
27251
27252 if (SrcReg) {
27253 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27254 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27255 Glue = Chain.getValue(1);
27256 }
27257
27258 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27259 SDValue N1Ops[] = {Chain, Glue};
27260 SDNode *N1 = DAG.getMachineNode(
27261 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27262 Chain = SDValue(N1, 0);
27263
27264 // Reads the content of XCR and returns it in registers EDX:EAX.
27265 SDValue LO, HI;
27266 if (Subtarget.is64Bit()) {
27267 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27268 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27269 LO.getValue(2));
27270 } else {
27271 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27272 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27273 LO.getValue(2));
27274 }
27275 Chain = HI.getValue(1);
27276 Glue = HI.getValue(2);
27277
27278 if (Subtarget.is64Bit()) {
27279 // Merge the two 32-bit values into a 64-bit one.
27280 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27281 DAG.getConstant(32, DL, MVT::i8));
27282 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27283 Results.push_back(Chain);
27284 return Glue;
27285 }
27286
27287 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27288 SDValue Ops[] = { LO, HI };
27289 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27290 Results.push_back(Pair);
27291 Results.push_back(Chain);
27292 return Glue;
27293}
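// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): the
// value assembled above once the high half (EDX/RDX) and low half (EAX/RAX)
// have been copied out of the expanded instruction.
#include <cstdint>
static uint64_t mergeEdxEax(uint32_t Hi, uint32_t Lo) {
  // Mirrors the SHL-by-32 + OR sequence on 64-bit targets and the BUILD_PAIR
  // on 32-bit targets emitted by expandIntrinsicWChainHelper.
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}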
27294
27295/// Handles the lowering of builtin intrinsics that read the time stamp counter
27296/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27297/// READCYCLECOUNTER nodes.
27298static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27299 SelectionDAG &DAG,
27300 const X86Subtarget &Subtarget,
27301 SmallVectorImpl<SDValue> &Results) {
27302 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27303 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27304 // and the EAX register is loaded with the low-order 32 bits.
27305 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27306 /* NoRegister */0, Subtarget,
27307 Results);
27308 if (Opcode != X86::RDTSCP)
27309 return;
27310
27311 SDValue Chain = Results[1];
27312 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
27313 // the ECX register. Add 'ecx' explicitly to the chain.
27314 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27315 Results[1] = ecx;
27316 Results.push_back(ecx.getValue(1));
27317}
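// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): the
// source-level counterparts of this helper. __rdtsc() exercises the plain
// RDTSC path; __rdtscp() additionally returns IA32_TSC_AUX through its pointer
// argument, which matches the extra ECX copy appended above. Assumes
// <x86intrin.h>.
#include <x86intrin.h>
static unsigned long long readTscWithAux(unsigned int *Aux) {
  return __rdtscp(Aux); // EDX:EAX = TSC, ECX = IA32_TSC_AUX stored to *Aux
}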
27318
27319static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27320 SelectionDAG &DAG) {
27321 SmallVector<SDValue, 3> Results;
27322 SDLoc DL(Op);
27323 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27324 Results);
27325 return DAG.getMergeValues(Results, DL);
27326}
27327
27328static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27329 MachineFunction &MF = DAG.getMachineFunction();
27330 SDValue Chain = Op.getOperand(0);
27331 SDValue RegNode = Op.getOperand(2);
27332 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27333 if (!EHInfo)
27334 report_fatal_error("EH registrations only live in functions using WinEH");
27335
27336 // Cast the operand to an alloca, and remember the frame index.
27337 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27338 if (!FINode)
27339 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27340 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27341
27342 // Return the chain operand without making any DAG nodes.
27343 return Chain;
27344}
27345
27346static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27347 MachineFunction &MF = DAG.getMachineFunction();
27348 SDValue Chain = Op.getOperand(0);
27349 SDValue EHGuard = Op.getOperand(2);
27350 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27351 if (!EHInfo)
27352 report_fatal_error("EHGuard only live in functions using WinEH");
27353
27354 // Cast the operand to an alloca, and remember the frame index.
27355 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27356 if (!FINode)
27357 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27358 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27359
27360 // Return the chain operand without making any DAG nodes.
27361 return Chain;
27362}
27363
27364/// Emit Truncating Store with signed or unsigned saturation.
27365static SDValue
27366EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
27367 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27368 SelectionDAG &DAG) {
27369 SDVTList VTs = DAG.getVTList(MVT::Other);
27370 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27371 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27372 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27373 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27374}
27375
27376/// Emit Masked Truncating Store with signed or unsigned saturation.
27377static SDValue
27378EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
27379 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27380 MachineMemOperand *MMO, SelectionDAG &DAG) {
27381 SDVTList VTs = DAG.getVTList(MVT::Other);
27382 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27383 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27384 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27385}
27386
27387static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27388 SelectionDAG &DAG) {
27389 unsigned IntNo = Op.getConstantOperandVal(1);
27390 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27391 if (!IntrData) {
27392 switch (IntNo) {
27393 case llvm::Intrinsic::x86_seh_ehregnode:
27394 return MarkEHRegistrationNode(Op, DAG);
27395 case llvm::Intrinsic::x86_seh_ehguard:
27396 return MarkEHGuard(Op, DAG);
27397 case llvm::Intrinsic::x86_rdpkru: {
27398 SDLoc dl(Op);
27399 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27400 // Create a RDPKRU node and pass 0 to the ECX parameter.
27401 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27402 DAG.getConstant(0, dl, MVT::i32));
27403 }
27404 case llvm::Intrinsic::x86_wrpkru: {
27405 SDLoc dl(Op);
27406 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27407 // to the EDX and ECX parameters.
27408 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27409 Op.getOperand(0), Op.getOperand(2),
27410 DAG.getConstant(0, dl, MVT::i32),
27411 DAG.getConstant(0, dl, MVT::i32));
27412 }
27413 case llvm::Intrinsic::asan_check_memaccess: {
27414 // Mark this as adjustsStack because it will be lowered to a call.
27415 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27416 // Don't do anything here, we will expand these intrinsics out later.
27417 return Op;
27418 }
27419 case llvm::Intrinsic::x86_flags_read_u32:
27420 case llvm::Intrinsic::x86_flags_read_u64:
27421 case llvm::Intrinsic::x86_flags_write_u32:
27422 case llvm::Intrinsic::x86_flags_write_u64: {
27423 // We need a frame pointer because this will get lowered to a PUSH/POP
27424 // sequence.
27425 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27426 MFI.setHasCopyImplyingStackAdjustment(true);
27427 // Don't do anything here, we will expand these intrinsics out later
27428 // during FinalizeISel in EmitInstrWithCustomInserter.
27429 return Op;
27430 }
27431 case Intrinsic::x86_lwpins32:
27432 case Intrinsic::x86_lwpins64:
27433 case Intrinsic::x86_umwait:
27434 case Intrinsic::x86_tpause: {
27435 SDLoc dl(Op);
27436 SDValue Chain = Op->getOperand(0);
27437 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27438 unsigned Opcode;
27439
27440 switch (IntNo) {
27441 default: llvm_unreachable("Impossible intrinsic");
27442 case Intrinsic::x86_umwait:
27443 Opcode = X86ISD::UMWAIT;
27444 break;
27445 case Intrinsic::x86_tpause:
27446 Opcode = X86ISD::TPAUSE;
27447 break;
27448 case Intrinsic::x86_lwpins32:
27449 case Intrinsic::x86_lwpins64:
27450 Opcode = X86ISD::LWPINS;
27451 break;
27452 }
27453
27454 SDValue Operation =
27455 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27456 Op->getOperand(3), Op->getOperand(4));
27457 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27458 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27459 Operation.getValue(1));
27460 }
27461 case Intrinsic::x86_enqcmd:
27462 case Intrinsic::x86_enqcmds: {
27463 SDLoc dl(Op);
27464 SDValue Chain = Op.getOperand(0);
27465 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27466 unsigned Opcode;
27467 switch (IntNo) {
27468 default: llvm_unreachable("Impossible intrinsic!");
27469 case Intrinsic::x86_enqcmd:
27470 Opcode = X86ISD::ENQCMD;
27471 break;
27472 case Intrinsic::x86_enqcmds:
27473 Opcode = X86ISD::ENQCMDS;
27474 break;
27475 }
27476 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27477 Op.getOperand(3));
27478 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27479 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27480 Operation.getValue(1));
27481 }
27482 case Intrinsic::x86_aesenc128kl:
27483 case Intrinsic::x86_aesdec128kl:
27484 case Intrinsic::x86_aesenc256kl:
27485 case Intrinsic::x86_aesdec256kl: {
27486 SDLoc DL(Op);
27487 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27488 SDValue Chain = Op.getOperand(0);
27489 unsigned Opcode;
27490
27491 switch (IntNo) {
27492 default: llvm_unreachable("Impossible intrinsic");
27493 case Intrinsic::x86_aesenc128kl:
27494 Opcode = X86ISD::AESENC128KL;
27495 break;
27496 case Intrinsic::x86_aesdec128kl:
27497 Opcode = X86ISD::AESDEC128KL;
27498 break;
27499 case Intrinsic::x86_aesenc256kl:
27500 Opcode = X86ISD::AESENC256KL;
27501 break;
27502 case Intrinsic::x86_aesdec256kl:
27503 Opcode = X86ISD::AESDEC256KL;
27504 break;
27505 }
27506
27507 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27508 MachineMemOperand *MMO = MemIntr->getMemOperand();
27509 EVT MemVT = MemIntr->getMemoryVT();
27510 SDValue Operation = DAG.getMemIntrinsicNode(
27511 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27512 MMO);
27513 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27514
27515 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27516 {ZF, Operation.getValue(0), Operation.getValue(2)});
27517 }
27518 case Intrinsic::x86_aesencwide128kl:
27519 case Intrinsic::x86_aesdecwide128kl:
27520 case Intrinsic::x86_aesencwide256kl:
27521 case Intrinsic::x86_aesdecwide256kl: {
27522 SDLoc DL(Op);
27523 SDVTList VTs = DAG.getVTList(
27524 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27525 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27526 SDValue Chain = Op.getOperand(0);
27527 unsigned Opcode;
27528
27529 switch (IntNo) {
27530 default: llvm_unreachable("Impossible intrinsic");
27531 case Intrinsic::x86_aesencwide128kl:
27532 Opcode = X86ISD::AESENCWIDE128KL;
27533 break;
27534 case Intrinsic::x86_aesdecwide128kl:
27535 Opcode = X86ISD::AESDECWIDE128KL;
27536 break;
27537 case Intrinsic::x86_aesencwide256kl:
27538 Opcode = X86ISD::AESENCWIDE256KL;
27539 break;
27540 case Intrinsic::x86_aesdecwide256kl:
27541 Opcode = X86ISD::AESDECWIDE256KL;
27542 break;
27543 }
27544
27545 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27546 MachineMemOperand *MMO = MemIntr->getMemOperand();
27547 EVT MemVT = MemIntr->getMemoryVT();
27548 SDValue Operation = DAG.getMemIntrinsicNode(
27549 Opcode, DL, VTs,
27550 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27551 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27552 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27553 MemVT, MMO);
27554 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27555
27556 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27557 {ZF, Operation.getValue(1), Operation.getValue(2),
27558 Operation.getValue(3), Operation.getValue(4),
27559 Operation.getValue(5), Operation.getValue(6),
27560 Operation.getValue(7), Operation.getValue(8),
27561 Operation.getValue(9)});
27562 }
27563 case Intrinsic::x86_testui: {
27564 SDLoc dl(Op);
27565 SDValue Chain = Op.getOperand(0);
27566 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27567 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27568 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27569 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27570 Operation.getValue(1));
27571 }
27572 case Intrinsic::x86_atomic_bts:
27573 case Intrinsic::x86_atomic_btc:
27574 case Intrinsic::x86_atomic_btr: {
27575 SDLoc DL(Op);
27576 MVT VT = Op.getSimpleValueType();
27577 SDValue Chain = Op.getOperand(0);
27578 SDValue Op1 = Op.getOperand(2);
27579 SDValue Op2 = Op.getOperand(3);
27580 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27581 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27582 : X86ISD::LBTR;
27583 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27584 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27585 SDValue Res =
27586 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27587 {Chain, Op1, Op2, Size}, VT, MMO);
27588 Chain = Res.getValue(1);
27589 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27590 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
27591 if (Imm)
27592 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27593 DAG.getShiftAmountConstant(Imm, VT, DL));
27594 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27595 }
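// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): the
// scalar value the LBTS/LBTC/LBTR lowering above returns. CF holds the
// original bit; it is zero-extended and shifted back to its position, so the
// caller effectively sees OldValue & (1 << Imm).
#include <cstdint>
static uint64_t lockBitTestResult(uint64_t OldValue, unsigned Imm) {
  uint64_t Bit = (OldValue >> Imm) & 1; // CF after the locked bit-test
  return Bit << Imm;                    // the SHL emitted above when Imm != 0
}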
27596 }
27597 return SDValue();
27598 }
27599
27600 SDLoc dl(Op);
27601 switch(IntrData->Type) {
27602 default: llvm_unreachable("Unknown Intrinsic Type");
27603 case RDSEED:
27604 case RDRAND: {
27605 // Emit the node with the right value type.
27606 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27607 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27608
27609 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27610 // Otherwise return the value from Rand, which is always 0, cast to i32.
27611 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27612 DAG.getConstant(1, dl, Op->getValueType(1)),
27613 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27614 SDValue(Result.getNode(), 1)};
27615 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27616
27617 // Return { result, isValid, chain }.
27618 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27619 SDValue(Result.getNode(), 2));
27620 }
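// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): the
// source-level shape of the RDRAND path. The step intrinsic returns CF
// (1 = a valid random value was delivered), which is the X86::COND_B
// condition selected on above. Assumes <immintrin.h> and the RDRND feature.
#include <immintrin.h>
static int nextRandom(unsigned int *Value) {
  // A zero return means CF was 0 (no entropy available); retrying is the
  // caller's responsibility.
  return _rdrand32_step(Value);
}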
27621 case GATHER_AVX2: {
27622 SDValue Chain = Op.getOperand(0);
27623 SDValue Src = Op.getOperand(2);
27624 SDValue Base = Op.getOperand(3);
27625 SDValue Index = Op.getOperand(4);
27626 SDValue Mask = Op.getOperand(5);
27627 SDValue Scale = Op.getOperand(6);
27628 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27629 Scale, Chain, Subtarget);
27630 }
27631 case GATHER: {
27632 //gather(v1, mask, index, base, scale);
27633 SDValue Chain = Op.getOperand(0);
27634 SDValue Src = Op.getOperand(2);
27635 SDValue Base = Op.getOperand(3);
27636 SDValue Index = Op.getOperand(4);
27637 SDValue Mask = Op.getOperand(5);
27638 SDValue Scale = Op.getOperand(6);
27639 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27640 Chain, Subtarget);
27641 }
27642 case SCATTER: {
27643 //scatter(base, mask, index, v1, scale);
27644 SDValue Chain = Op.getOperand(0);
27645 SDValue Base = Op.getOperand(2);
27646 SDValue Mask = Op.getOperand(3);
27647 SDValue Index = Op.getOperand(4);
27648 SDValue Src = Op.getOperand(5);
27649 SDValue Scale = Op.getOperand(6);
27650 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27651 Scale, Chain, Subtarget);
27652 }
27653 case PREFETCH: {
27654 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27655 assert((HintVal == 2 || HintVal == 3) &&
27656        "Wrong prefetch hint in intrinsic: should be 2 or 3");
27657 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27658 SDValue Chain = Op.getOperand(0);
27659 SDValue Mask = Op.getOperand(2);
27660 SDValue Index = Op.getOperand(3);
27661 SDValue Base = Op.getOperand(4);
27662 SDValue Scale = Op.getOperand(5);
27663 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27664 Subtarget);
27665 }
27666 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27667 case RDTSC: {
27668 SmallVector<SDValue, 2> Results;
27669 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27670 Results);
27671 return DAG.getMergeValues(Results, dl);
27672 }
27673 // Read Performance Monitoring Counters.
27674 case RDPMC:
27675 // Get Extended Control Register.
27676 case XGETBV: {
27677 SmallVector<SDValue, 2> Results;
27678
27679 // RDPMC uses ECX to select the index of the performance counter to read.
27680 // XGETBV uses ECX to select the index of the XCR register to return.
27681 // The result is stored into registers EDX:EAX.
27682 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27683 Subtarget, Results);
27684 return DAG.getMergeValues(Results, dl);
27685 }
27686 // XTEST intrinsics.
27687 case XTEST: {
27688 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27689 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27690
27691 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27692 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27693 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27694 Ret, SDValue(InTrans.getNode(), 1));
27695 }
27696 case TRUNCATE_TO_MEM_VI8:
27697 case TRUNCATE_TO_MEM_VI16:
27698 case TRUNCATE_TO_MEM_VI32: {
27699 SDValue Mask = Op.getOperand(4);
27700 SDValue DataToTruncate = Op.getOperand(3);
27701 SDValue Addr = Op.getOperand(2);
27702 SDValue Chain = Op.getOperand(0);
27703
27704 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27705 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27706
27707 EVT MemVT = MemIntr->getMemoryVT();
27708
27709 uint16_t TruncationOp = IntrData->Opc0;
27710 switch (TruncationOp) {
27711 case X86ISD::VTRUNC: {
27712 if (isAllOnesConstant(Mask)) // return just a truncate store
27713 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27714 MemIntr->getMemOperand());
27715
27716 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27717 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27718 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27719
27720 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27721 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27722 true /* truncating */);
27723 }
27724 case X86ISD::VTRUNCUS:
27725 case X86ISD::VTRUNCS: {
27726 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27727 if (isAllOnesConstant(Mask))
27728 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27729 MemIntr->getMemOperand(), DAG);
27730
27731 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27732 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27733
27734 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27735 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27736 }
27737 default:
27738 llvm_unreachable("Unsupported truncstore intrinsic");
27739 }
27740 }
27741 }
27742}
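// Illustrative sketch (editorial aside, not part of X86ISelLowering.cpp): a
// source-level masked truncating store of the kind handled by the
// TRUNCATE_TO_MEM_VI8 path above; an all-ones mask becomes a plain truncating
// store, anything else a masked one. Assumes <immintrin.h> and AVX-512F.
#include <immintrin.h>
static void storeTruncatedBytes(void *Dst, __mmask16 K, __m512i V) {
  _mm512_mask_cvtepi32_storeu_epi8(Dst, K, V); // vpmovdb to memory, masked
}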
27743
27744SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27745 SelectionDAG &DAG) const {
27746 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27747 MFI.setReturnAddressIsTaken(true);
27748
27749 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27750 return SDValue();
27751
27752 unsigned Depth = Op.getConstantOperandVal(0);
27753 SDLoc dl(Op);
27754 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27755
27756 if (Depth > 0) {
27757 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27758 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27759 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27760 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27761 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27762 MachinePointerInfo());
27763 }
27764
27765 // Just load the return address.
27766 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27767 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27768 MachinePointerInfo());
27769}
27770
27771SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27772 SelectionDAG &DAG) const {
27773 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27774 return getReturnAddressFrameIndex(DAG);
27775}
27776
27777SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27778 MachineFunction &MF = DAG.getMachineFunction();
27779 MachineFrameInfo &MFI = MF.getFrameInfo();
27780 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27781 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27782 EVT VT = Op.getValueType();
27783
27784 MFI.setFrameAddressIsTaken(true);
27785
27786 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27787 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27788 // is not possible to crawl up the stack without looking at the unwind codes
27789 // simultaneously.
27790 int FrameAddrIndex = FuncInfo->getFAIndex();
27791 if (!FrameAddrIndex) {
27792 // Set up a frame object for the return address.
27793 unsigned SlotSize = RegInfo->getSlotSize();
27794 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27795 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27796 FuncInfo->setFAIndex(FrameAddrIndex);
27797 }
27798 return DAG.getFrameIndex(FrameAddrIndex, VT);
27799 }
27800
27801 unsigned FrameReg =
27802 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27803 SDLoc dl(Op); // FIXME probably not meaningful
27804 unsigned Depth = Op.getConstantOperandVal(0);
27805 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27806         (FrameReg == X86::EBP && VT == MVT::i32)) &&
27807        "Invalid Frame Register!");
27808 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27809 while (Depth--)
27810 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27811 MachinePointerInfo());
27812 return FrameAddr;
27813}
27814
27815// FIXME? Maybe this could be a TableGen attribute on some registers and
27816// this table could be generated automatically from RegInfo.
27817Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27818 const MachineFunction &MF) const {
27819 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27820
27821 Register Reg = StringSwitch<unsigned>(RegName)
27822 .Case("esp", X86::ESP)
27823 .Case("rsp", X86::RSP)
27824 .Case("ebp", X86::EBP)
27825 .Case("rbp", X86::RBP)
27826 .Default(0);
27827
27828 if (Reg == X86::EBP || Reg == X86::RBP) {
27829 if (!TFI.hasFP(MF))
27830 report_fatal_error("register " + StringRef(RegName) +
27831 " is allocatable: function has no frame pointer");
27832#ifndef NDEBUG
27833 else {
27834 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27835 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27836 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27837        "Invalid Frame Register!");
27838 }
27839#endif
27840 }
27841
27842 if (Reg)
27843 return Reg;
27844
27845 report_fatal_error("Invalid register name global variable");
27846}
27847
27848SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27849 SelectionDAG &DAG) const {
27850 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27851 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27852}
27853
27854Register X86TargetLowering::getExceptionPointerRegister(
27855 const Constant *PersonalityFn) const {
27856 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27857 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27858
27859 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27860}
27861
27862Register X86TargetLowering::getExceptionSelectorRegister(
27863 const Constant *PersonalityFn) const {
27864 // Funclet personalities don't use selectors (the runtime does the selection).
27865 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27866 return X86::NoRegister;
27867 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27868}
27869
27870bool X86TargetLowering::needsFixedCatchObjects() const {
27871 return Subtarget.isTargetWin64();
27872}
27873
27874SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27875 SDValue Chain = Op.getOperand(0);
27876 SDValue Offset = Op.getOperand(1);
27877 SDValue Handler = Op.getOperand(2);
27878 SDLoc dl (Op);
27879
27880 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27881 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27882 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27883 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27884         (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27885        "Invalid Frame Register!");
27886 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27887 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27888
27889 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27890 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27891 dl));
27892 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27893 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27894 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27895
27896 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27897 DAG.getRegister(StoreAddrReg, PtrVT));
27898}
27899
27900SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27901 SelectionDAG &DAG) const {
27902 SDLoc DL(Op);
27903 // If the subtarget is not 64-bit, we may need the global base reg
27904 // after isel pseudo expansion, i.e., after the CGBR pass has run.
27905 // Therefore, ask for the GlobalBaseReg now, so that the pass
27906 // inserts the code for us in case we need it.
27907 // Otherwise, we will end up in a situation where we will
27908 // reference a virtual register that is not defined!
27909 if (!Subtarget.is64Bit()) {
27910 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27911 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27912 }
27913 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27914 DAG.getVTList(MVT::i32, MVT::Other),
27915 Op.getOperand(0), Op.getOperand(1));
27916}
27917
27918SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27919 SelectionDAG &DAG) const {
27920 SDLoc DL(Op);
27921 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27922 Op.getOperand(0), Op.getOperand(1));
27923}
27924
27925SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27926 SelectionDAG &DAG) const {
27927 SDLoc DL(Op);
27928 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27929 Op.getOperand(0));
27930}
27931
27932static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27933 return Op.getOperand(0);
27934}
27935
27936SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27937 SelectionDAG &DAG) const {
27938 SDValue Root = Op.getOperand(0);
27939 SDValue Trmp = Op.getOperand(1); // trampoline
27940 SDValue FPtr = Op.getOperand(2); // nested function
27941 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27942 SDLoc dl (Op);
27943
27944 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27945 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27946
27947 if (Subtarget.is64Bit()) {
27948 SDValue OutChains[6];
27949
27950 // Large code-model.
27951 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27952 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27953
27954 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27955 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27956
27957 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27958
27959 // Load the pointer to the nested function into R11.
27960 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27961 SDValue Addr = Trmp;
27962 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27963 Addr, MachinePointerInfo(TrmpAddr));
27964
27965 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27966 DAG.getConstant(2, dl, MVT::i64));
27967 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27968 MachinePointerInfo(TrmpAddr, 2), Align(2));
27969
27970 // Load the 'nest' parameter value into R10.
27971 // R10 is specified in X86CallingConv.td
27972 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27973 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27974 DAG.getConstant(10, dl, MVT::i64));
27975 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27976 Addr, MachinePointerInfo(TrmpAddr, 10));
27977
27978 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27979 DAG.getConstant(12, dl, MVT::i64));
27980 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27981 MachinePointerInfo(TrmpAddr, 12), Align(2));
27982
27983 // Jump to the nested function.
27984 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27985 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27986 DAG.getConstant(20, dl, MVT::i64));
27987 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27988 Addr, MachinePointerInfo(TrmpAddr, 20));
27989
27990 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27991 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27992 DAG.getConstant(22, dl, MVT::i64));
27993 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27994 Addr, MachinePointerInfo(TrmpAddr, 22));
27995
27996 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27997 } else {
27998 const Function *Func =
27999 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28000 CallingConv::ID CC = Func->getCallingConv();
28001 unsigned NestReg;
28002
28003 switch (CC) {
28004 default:
28005 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28005)
;
28006 case CallingConv::C:
28007 case CallingConv::X86_StdCall: {
28008 // Pass 'nest' parameter in ECX.
28009 // Must be kept in sync with X86CallingConv.td
28010 NestReg = X86::ECX;
28011
28012 // Check that ECX wasn't needed by an 'inreg' parameter.
28013 FunctionType *FTy = Func->getFunctionType();
28014 const AttributeList &Attrs = Func->getAttributes();
28015
28016 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28017 unsigned InRegCount = 0;
28018 unsigned Idx = 0;
28019
28020 for (FunctionType::param_iterator I = FTy->param_begin(),
28021 E = FTy->param_end(); I != E; ++I, ++Idx)
28022 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28023 const DataLayout &DL = DAG.getDataLayout();
28024 // FIXME: should only count parameters that are lowered to integers.
28025 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28026 }
28027
28028 if (InRegCount > 2) {
28029 report_fatal_error("Nest register in use - reduce number of inreg"
28030 " parameters!");
28031 }
28032 }
28033 break;
28034 }
28035 case CallingConv::X86_FastCall:
28036 case CallingConv::X86_ThisCall:
28037 case CallingConv::Fast:
28038 case CallingConv::Tail:
28039 case CallingConv::SwiftTail:
28040 // Pass 'nest' parameter in EAX.
28041 // Must be kept in sync with X86CallingConv.td
28042 NestReg = X86::EAX;
28043 break;
28044 }
28045
28046 SDValue OutChains[4];
28047 SDValue Addr, Disp;
28048
28049 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28050 DAG.getConstant(10, dl, MVT::i32));
28051 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28052
28053 // This is storing the opcode for MOV32ri.
28054 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28055 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28056 OutChains[0] =
28057 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28058 Trmp, MachinePointerInfo(TrmpAddr));
28059
28060 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28061 DAG.getConstant(1, dl, MVT::i32));
28062 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28063 MachinePointerInfo(TrmpAddr, 1), Align(1));
28064
28065 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28066 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28067 DAG.getConstant(5, dl, MVT::i32));
28068 OutChains[2] =
28069 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28070 MachinePointerInfo(TrmpAddr, 5), Align(1));
28071
28072 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28073 DAG.getConstant(6, dl, MVT::i32));
28074 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28075 MachinePointerInfo(TrmpAddr, 6), Align(1));
28076
28077 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28078 }
28079}
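
For the 64-bit branch above, the six stores lay down a 23-byte trampoline image: a movabsq of the nested function pointer into R11, a movabsq of the 'nest' value into R10, and an indirect jmpq through R11. A minimal host-side sketch of the same byte layout, assuming a little-endian host (the helper and its name are hypothetical illustrations of the encoding, not the lowering itself):

#include <cstdint>
#include <cstring>

// Sketch: produce the same 23-byte x86-64 trampoline that the six DAG stores
// above emit. Offsets mirror the constants in the lowering (0, 2, 10, 12, 20, 22).
static void writeTrampoline64(uint8_t *Trmp, uint64_t FPtr, uint64_t Nest) {
  const uint16_t MovR11 = 0xBB49; // stored as i16 -> bytes 49 BB: movabsq imm64, %r11
  const uint16_t MovR10 = 0xBA49; // bytes 49 BA: movabsq imm64, %r10
  const uint16_t JmpPfx = 0xFF49; // bytes 49 FF: REX.WB + jmp r/m64
  const uint8_t  ModRM  = 0xE3;   // mod=11, reg=/4 (jmp), rm=r11

  std::memcpy(Trmp + 0,  &MovR11, 2); // movabsq $FPtr, %r11
  std::memcpy(Trmp + 2,  &FPtr,   8);
  std::memcpy(Trmp + 10, &MovR10, 2); // movabsq $Nest, %r10
  std::memcpy(Trmp + 12, &Nest,   8);
  std::memcpy(Trmp + 20, &JmpPfx, 2); // jmpq *%r11
  std::memcpy(Trmp + 22, &ModRM,  1);
}
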
28080
28081SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
28082 SelectionDAG &DAG) const {
28083 /*
28084 The rounding mode is in bits 11:10 of FPSR, and has the following
28085 settings:
28086 00 Round to nearest
28087 01 Round to -inf
28088 10 Round to +inf
28089 11 Round to 0
28090
28091 FLT_ROUNDS, on the other hand, expects the following:
28092 -1 Undefined
28093 0 Round to 0
28094 1 Round to nearest
28095 2 Round to +inf
28096 3 Round to -inf
28097
28098 To perform the conversion, we use a packed lookup table of the four 2-bit
28100 values that we can index by FPSR[11:10]
28100 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28101
28102 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28103 */
28104
28105 MachineFunction &MF = DAG.getMachineFunction();
28106 MVT VT = Op.getSimpleValueType();
28107 SDLoc DL(Op);
28108
28109 // Save FP Control Word to stack slot
28110 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28111 SDValue StackSlot =
28112 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28113
28114 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28115
28116 SDValue Chain = Op.getOperand(0);
28117 SDValue Ops[] = {Chain, StackSlot};
28118 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28119 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28120 Align(2), MachineMemOperand::MOStore);
28121
28122 // Load FP Control Word from stack slot
28123 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28124 Chain = CWD.getValue(1);
28125
28126 // Mask and turn the control bits into a shift for the lookup table.
28127 SDValue Shift =
28128 DAG.getNode(ISD::SRL, DL, MVT::i16,
28129 DAG.getNode(ISD::AND, DL, MVT::i16,
28130 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28131 DAG.getConstant(9, DL, MVT::i8));
28132 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28133
28134 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28135 SDValue RetVal =
28136 DAG.getNode(ISD::AND, DL, MVT::i32,
28137 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28138 DAG.getConstant(3, DL, MVT::i32));
28139
28140 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28141
28142 return DAG.getMergeValues({RetVal, Chain}, DL);
28143}
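
The packed-lookup trick documented in the comment block is easy to check in scalar C++; a minimal sketch, with a hypothetical helper standing in for the FNSTCW store and reload:

#include <cassert>

// Map the x87 control-word rounding field (bits 11:10) to the FLT_ROUNDS
// encoding via the packed 2-bit table 0x2d: (0x2d >> ((CW & 0xc00) >> 9)) & 3.
static int fltRoundsFromControlWord(unsigned CW) {
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}

static void checkFltRoundsTable() {
  assert(fltRoundsFromControlWord(0x000) == 1); // 00: round to nearest -> 1
  assert(fltRoundsFromControlWord(0x400) == 3); // 01: round to -inf    -> 3
  assert(fltRoundsFromControlWord(0x800) == 2); // 10: round to +inf    -> 2
  assert(fltRoundsFromControlWord(0xc00) == 0); // 11: round to 0       -> 0
}

The shift by 9 rather than 10 doubles the field value so it indexes 2-bit groups of the table, which is why no multiply is needed.
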
28144
28145SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28146 SelectionDAG &DAG) const {
28147 MachineFunction &MF = DAG.getMachineFunction();
28148 SDLoc DL(Op);
28149 SDValue Chain = Op.getNode()->getOperand(0);
28150
28151 // FP control word may be set only from data in memory. So we need to allocate
28152 // stack space to save/load FP control word.
28153 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28154 SDValue StackSlot =
28155 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28156 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28157 MachineMemOperand *MMO =
28158 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28159
28160 // Store FP control word into memory.
28161 SDValue Ops[] = {Chain, StackSlot};
28162 Chain = DAG.getMemIntrinsicNode(
28163 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28164
28165 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28166 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28167 Chain = CWD.getValue(1);
28168 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28169 DAG.getConstant(0xf3ff, DL, MVT::i16));
28170
28171 // Calculate new rounding mode.
28172 SDValue NewRM = Op.getNode()->getOperand(1);
28173 SDValue RMBits;
28174 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28175 uint64_t RM = CVal->getZExtValue();
28176 int FieldVal;
28177 switch (static_cast<RoundingMode>(RM)) {
28178 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28179 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28180 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28181 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28182 default:
28183 llvm_unreachable("rounding mode is not supported by X86 hardware")::llvm::llvm_unreachable_internal("rounding mode is not supported by X86 hardware"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28183)
;
28184 }
28185 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28186 } else {
28187 // Need to convert argument into bits of control word:
28188 // 0 Round to 0 -> 11
28189 // 1 Round to nearest -> 00
28190 // 2 Round to +inf -> 10
28191 // 3 Round to -inf -> 01
28192 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
28193 // To make the conversion, put all these values into a value 0xc9 and shift
28194 // it left depending on the rounding mode:
28195 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28196 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28197 // ...
28198 // (0xc9 << (2 * NewRM + 4)) & 0xc00
28199 SDValue ShiftValue =
28200 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28201 DAG.getNode(ISD::ADD, DL, MVT::i32,
28202 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28203 DAG.getConstant(1, DL, MVT::i8)),
28204 DAG.getConstant(4, DL, MVT::i32)));
28205 SDValue Shifted =
28206 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28207 ShiftValue);
28208 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28209 DAG.getConstant(0xc00, DL, MVT::i16));
28210 }
28211
28212 // Update rounding mode bits and store the new FP Control Word into stack.
28213 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28214 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
28215
28216 // Load FP control word from the slot.
28217 SDValue OpsLD[] = {Chain, StackSlot};
28218 MachineMemOperand *MMOL =
28219 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28220 Chain = DAG.getMemIntrinsicNode(
28221 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28222
28223 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28224 // same way but in bits 14:13.
28225 if (Subtarget.hasSSE1()) {
28226 // Store MXCSR into memory.
28227 Chain = DAG.getNode(
28228 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28229 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28230 StackSlot);
28231
28232 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28233 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28234 Chain = CWD.getValue(1);
28235 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28236 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28237
28238 // Shift X87 RM bits from 11:10 to 14:13.
28239 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28240 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28241 DAG.getConstant(3, DL, MVT::i8));
28242
28243 // Update rounding mode bits and store the new FP Control Word into stack.
28244 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28245 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
28246
28247 // Load MXCSR from the slot.
28248 Chain = DAG.getNode(
28249 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28250 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28251 StackSlot);
28252 }
28253
28254 return Chain;
28255}
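
The non-constant path encodes the inverse mapping, from a FLT_ROUNDS value back to the x87 RM field, with the shifted constant 0xc9. A scalar sketch of (0xc9 << (2 * NewRM + 4)) & 0xc00 and the field values it should produce (hypothetical helper):

#include <cassert>

// Convert a FLT_ROUNDS-style mode (0..3) into the x87 control-word RM field
// (bits 11:10), mirroring (0xc9 << (2 * NewRM + 4)) & 0xc00 from the code above.
static unsigned rmFieldFromFltRounds(unsigned NewRM) {
  return (0xc9u << (2 * NewRM + 4)) & 0xc00u;
}

static void checkRMField() {
  assert(rmFieldFromFltRounds(0) == 0xc00); // round to 0    -> 11
  assert(rmFieldFromFltRounds(1) == 0x000); // to nearest    -> 00
  assert(rmFieldFromFltRounds(2) == 0x800); // toward +inf   -> 10
  assert(rmFieldFromFltRounds(3) == 0x400); // toward -inf   -> 01
}
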
28256
28257/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28258//
28259// i8/i16 vector implemented using dword LZCNT vector instruction
28260// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28261// split the vector, perform the operation on its Lo and Hi parts and
28262// concatenate the results.
28263static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28264 const X86Subtarget &Subtarget) {
28265  assert(Op.getOpcode() == ISD::CTLZ);
28266 SDLoc dl(Op);
28267 MVT VT = Op.getSimpleValueType();
28268 MVT EltVT = VT.getVectorElementType();
28269 unsigned NumElems = VT.getVectorNumElements();
28270
28271 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28272        "Unsupported element type");
28273
28274 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28275 if (NumElems > 16 ||
28276 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28277 return splitVectorIntUnary(Op, DAG);
28278
28279 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28280 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28281        "Unsupported value type for operation");
28282
28283 // Use native supported vector instruction vplzcntd.
28284 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28285 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28286 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28287 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28288
28289 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28290}
28291
28292// Lower CTLZ using a PSHUFB lookup table implementation.
28293static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28294 const X86Subtarget &Subtarget,
28295 SelectionDAG &DAG) {
28296 MVT VT = Op.getSimpleValueType();
28297 int NumElts = VT.getVectorNumElements();
28298 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28299 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28300
28301 // Per-nibble leading zero PSHUFB lookup table.
28302 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28303 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28304 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28305 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28306
28307 SmallVector<SDValue, 64> LUTVec;
28308 for (int i = 0; i < NumBytes; ++i)
28309 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28310 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28311
28312 // Begin by bitcasting the input to a byte vector, then split those bytes
28313 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28314 // If the hi input nibble is zero then we add both results together, otherwise
28315 // we just take the hi result (by masking the lo result to zero before the
28316 // add).
28317 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28318 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28319
28320 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28321 SDValue Lo = Op0;
28322 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28323 SDValue HiZ;
28324 if (CurrVT.is512BitVector()) {
28325 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28326 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28327 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28328 } else {
28329 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28330 }
28331
28332 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28333 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28334 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28335 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28336
28337 // Merge the result from vXi8 back to VT, working on the lo/hi halves
28338 // of the current vector width in the same way we did for the nibbles.
28339 // If the upper half of the input element is zero then add the halves'
28340 // leading zero counts together, otherwise just use the upper half's.
28341 // Double the width of the result until we are at target width.
28342 while (CurrVT != VT) {
28343 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28344 int CurrNumElts = CurrVT.getVectorNumElements();
28345 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28346 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28347 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28348
28349 // Check if the upper half of the input element is zero.
28350 if (CurrVT.is512BitVector()) {
28351 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28352 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28353 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28354 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28355 } else {
28356 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28357 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28358 }
28359 HiZ = DAG.getBitcast(NextVT, HiZ);
28360
28361 // Move the upper/lower halves to the lower bits as we'll be extending to
28362 // NextVT. Mask the lower result to zero if HiZ is true and add the results
28363 // together.
28364 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28365 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28366 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28367 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28368 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28369 CurrVT = NextVT;
28370 }
28371
28372 return Res;
28373}
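
Per byte, the PSHUFB table boils down to: take LUT[hi nibble], and when the hi nibble is zero (so LUT[hi] == 4) also add LUT[lo nibble]. A one-lane scalar model of that step (hypothetical function, not the DAG code):

#include <cassert>

// Scalar model of one byte lane of the PSHUFB-based CTLZ above.
static int ctlz8(unsigned char X) {
  static const int LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                              0, 0, 0, 0, 0, 0, 0, 0};
  int Lo = X & 0xf, Hi = X >> 4;
  // If the hi nibble is zero, LUT[Hi] == 4 and the lo nibble also contributes;
  // otherwise only the hi nibble does (this is the HiZ mask-and-add above).
  return Hi == 0 ? LUT[Hi] + LUT[Lo] : LUT[Hi];
}

static void checkCtlz8() {
  assert(ctlz8(0x00) == 8);
  assert(ctlz8(0x01) == 7);
  assert(ctlz8(0x10) == 3);
  assert(ctlz8(0x80) == 0);
}
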
28374
28375static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28376 const X86Subtarget &Subtarget,
28377 SelectionDAG &DAG) {
28378 MVT VT = Op.getSimpleValueType();
28379
28380 if (Subtarget.hasCDI() &&
28381 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28382 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28383 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28384
28385 // Decompose 256-bit ops into smaller 128-bit ops.
28386 if (VT.is256BitVector() && !Subtarget.hasInt256())
28387 return splitVectorIntUnary(Op, DAG);
28388
28389 // Decompose 512-bit ops into smaller 256-bit ops.
28390 if (VT.is512BitVector() && !Subtarget.hasBWI())
28391 return splitVectorIntUnary(Op, DAG);
28392
28393  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28394 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28395}
28396
28397static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28398 SelectionDAG &DAG) {
28399 MVT VT = Op.getSimpleValueType();
28400 MVT OpVT = VT;
28401 unsigned NumBits = VT.getSizeInBits();
28402 SDLoc dl(Op);
28403 unsigned Opc = Op.getOpcode();
28404
28405 if (VT.isVector())
28406 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28407
28408 Op = Op.getOperand(0);
28409 if (VT == MVT::i8) {
28410 // Zero extend to i32 since there is not an i8 bsr.
28411 OpVT = MVT::i32;
28412 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28413 }
28414
28415 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28416 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28417 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28418
28419 if (Opc == ISD::CTLZ) {
28420 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28421 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28422 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28423 Op.getValue(1)};
28424 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28425 }
28426
28427 // Finally xor with NumBits-1.
28428 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28429 DAG.getConstant(NumBits - 1, dl, OpVT));
28430
28431 if (VT == MVT::i8)
28432 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28433 return Op;
28434}
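
The scalar lowering leans on the identity ctlz(x) == (NumBits - 1) ^ bsr(x) for non-zero x, and the CMOV feeds 2*NumBits - 1 through the same XOR so a zero input comes out as NumBits. A minimal sketch of that identity, with a loop standing in for the BSR instruction:

#include <cassert>

// Model of the BSR-based CTLZ lowering for a 32-bit value.
static unsigned ctlz32(unsigned X) {
  const unsigned NumBits = 32;
  if (X == 0)                                        // the CMOV-on-ZF path
    return (NumBits + NumBits - 1) ^ (NumBits - 1);  // 63 ^ 31 == 32
  unsigned BSR = 0;                                  // index of the highest set bit
  for (unsigned I = 0; I < NumBits; ++I)
    if (X & (1u << I))
      BSR = I;
  return (NumBits - 1) ^ BSR;                        // the final XOR above
}

static void checkCtlz32() {
  assert(ctlz32(0) == 32);
  assert(ctlz32(1) == 31);
  assert(ctlz32(0x80000000u) == 0);
}
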
28435
28436static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28437 SelectionDAG &DAG) {
28438 MVT VT = Op.getSimpleValueType();
28439 unsigned NumBits = VT.getScalarSizeInBits();
28440 SDValue N0 = Op.getOperand(0);
28441 SDLoc dl(Op);
28442
28443 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28444        "Only scalar CTTZ requires custom lowering");
28445
28446 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28447 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28448 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28449
28450 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28451 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28452 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28453 Op.getValue(1)};
28454 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28455}
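
CTTZ follows the same shape with BSF and a CMOV on ZF; a scalar model for reference, with a loop in place of BSF:

#include <cassert>

// Model of the BSF-based CTTZ lowering: BSF yields the index of the lowest set
// bit, and the CMOV substitutes NumBits when the input is zero (ZF set).
static unsigned cttz32(unsigned X) {
  const unsigned NumBits = 32;
  if (X == 0)
    return NumBits; // CMOV path
  unsigned BSF = 0;
  while (!(X & (1u << BSF)))
    ++BSF;
  return BSF;
}

static void checkCttz32() {
  assert(cttz32(0) == 32);
  assert(cttz32(1) == 0);
  assert(cttz32(8) == 3);
}
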
28456
28457static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28458 const X86Subtarget &Subtarget) {
28459 MVT VT = Op.getSimpleValueType();
28460 if (VT == MVT::i16 || VT == MVT::i32)
28461 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
28462
28463 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28464 return splitVectorIntBinary(Op, DAG);
28465
28466 assert(Op.getSimpleValueType().is256BitVector() &&
28467        Op.getSimpleValueType().isInteger() &&
28468        "Only handle AVX 256-bit vector integer operation");
28469 return splitVectorIntBinary(Op, DAG);
28470}
28471
28472static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28473 const X86Subtarget &Subtarget) {
28474 MVT VT = Op.getSimpleValueType();
28475 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28476 unsigned Opcode = Op.getOpcode();
28477 SDLoc DL(Op);
28478
28479 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28480 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28481    assert(Op.getSimpleValueType().isInteger() &&
28482           "Only handle AVX vector integer operation");
28483 return splitVectorIntBinary(Op, DAG);
28484 }
28485
28486 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28488 EVT SetCCResultType =
28489 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28490
28491 unsigned BitWidth = VT.getScalarSizeInBits();
28492 if (Opcode == ISD::USUBSAT) {
28493 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28494 // Handle a special-case with a bit-hack instead of cmp+select:
28495 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28496 // If the target can use VPTERNLOG, DAGToDAG will match this as
28497 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28498 // "broadcast" constant load.
28499 ConstantSDNode *C = isConstOrConstSplat(Y, true);
28500 if (C && C->getAPIntValue().isSignMask()) {
28501 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28502 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28503 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28504 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28505 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28506 }
28507 }
28508 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28509 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28510 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28511 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28512 // TODO: Move this to DAGCombiner?
28513 if (SetCCResultType == VT &&
28514 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28515 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28516 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28517 }
28518 }
28519
28520 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28521 (!VT.isVector() || VT == MVT::v2i64)) {
28522 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28523 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28524 SDValue Zero = DAG.getConstant(0, DL, VT);
28525 SDValue Result =
28526 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28527 DAG.getVTList(VT, SetCCResultType), X, Y);
28528 SDValue SumDiff = Result.getValue(0);
28529 SDValue Overflow = Result.getValue(1);
28530 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28531 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28532 SDValue SumNeg =
28533 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28534 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28535 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28536 }
28537
28538 // Use default expansion.
28539 return SDValue();
28540}
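
The special case 'usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)' can be verified exhaustively in scalar form; an 8-bit model with SMIN == 0x80 (hypothetical helpers):

#include <cassert>
#include <cstdint>

// 8-bit model of the usubsat-by-signmask bit-hack used above:
// usubsat(X, 0x80) == (X ^ 0x80) & (X s>> 7).
static uint8_t usubsatSignMask8(uint8_t X) {
  uint8_t Sra = (uint8_t)((int8_t)X >> 7); // 0xFF if the sign bit is set, else 0
  return (uint8_t)((X ^ 0x80u) & Sra);
}

static void checkUsubsat8() {
  for (unsigned X = 0; X < 256; ++X) {
    uint8_t Ref = X >= 0x80 ? (uint8_t)(X - 0x80) : 0; // plain usubsat X, 0x80
    assert(usubsatSignMask8((uint8_t)X) == Ref);
  }
}
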
28541
28542static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28543 SelectionDAG &DAG) {
28544 MVT VT = Op.getSimpleValueType();
28545 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28546 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28547 // 8-bit integer abs to NEG and CMOV.
28548 SDLoc DL(Op);
28549 SDValue N0 = Op.getOperand(0);
28550 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28551 DAG.getConstant(0, DL, VT), N0);
28552 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28553 SDValue(Neg.getNode(), 1)};
28554 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28555 }
28556
28557 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28558 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28559 SDLoc DL(Op);
28560 SDValue Src = Op.getOperand(0);
28561 SDValue Sub =
28562 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
28563 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
28564 }
28565
28566 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28567    assert(VT.isInteger() &&
28568           "Only handle AVX 256-bit vector integer operation");
28569 return splitVectorIntUnary(Op, DAG);
28570 }
28571
28572 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28573 return splitVectorIntUnary(Op, DAG);
28574
28575 // Default to expand.
28576 return SDValue();
28577}
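
The vXi64 path relies on BLENDV selecting its second value operand wherever the mask element's sign bit is set, so passing X itself as the mask picks 0 - X exactly in the negative lanes. A one-lane scalar sketch of that selection rule (illustration only):

#include <cassert>
#include <cstdint>

// One-lane model of ABS(vXi64 X) --> BLENDV(X, 0-X, X).
static int64_t absViaBlendv(int64_t X) {
  int64_t Sub = (int64_t)(0ull - (uint64_t)X); // wrapping 0 - X, like the vector SUB
  bool SignBit = X < 0;                        // the per-lane bit BLENDV tests
  return SignBit ? Sub : X;
}

static void checkAbsViaBlendv() {
  assert(absViaBlendv(5) == 5);
  assert(absViaBlendv(-7) == 7);
  assert(absViaBlendv(0) == 0);
}
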
28578
28579static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28580 SelectionDAG &DAG) {
28581 MVT VT = Op.getSimpleValueType();
28582
28583 // For AVX1 cases, split to use legal ops (everything but v4i64).
28584 if (VT.is256BitVector() && !Subtarget.hasInt256())
28585 return splitVectorIntBinary(Op, DAG);
28586
28587 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28588 return splitVectorIntBinary(Op, DAG);
28589
28590 // Default to expand.
28591 return SDValue();
28592}
28593
28594static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
28595 MVT VT = Op.getSimpleValueType();
28596
28597 // For AVX1 cases, split to use legal ops (everything but v4i64).
28598 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
28599 return splitVectorIntBinary(Op, DAG);
28600
28601 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28602 return splitVectorIntBinary(Op, DAG);
28603
28604 // Default to expand.
28605 return SDValue();
28606}
28607
28608static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28609 SelectionDAG &DAG) {
28610 SDLoc dl(Op);
28611 MVT VT = Op.getSimpleValueType();
28612
28613 // Decompose 256-bit ops into 128-bit ops.
28614 if (VT.is256BitVector() && !Subtarget.hasInt256())
28615 return splitVectorIntBinary(Op, DAG);
28616
28617 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28618 return splitVectorIntBinary(Op, DAG);
28619
28620 SDValue A = Op.getOperand(0);
28621 SDValue B = Op.getOperand(1);
28622
28623 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28624 // vector pairs, multiply and truncate.
28625 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28626 unsigned NumElts = VT.getVectorNumElements();
28627
28628 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28629 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28630 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28631 return DAG.getNode(
28632 ISD::TRUNCATE, dl, VT,
28633 DAG.getNode(ISD::MUL, dl, ExVT,
28634 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28635 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28636 }
28637
28638 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28639
28640    // Extract the lo/hi parts and any-extend them to i16.
28641 // We're going to mask off the low byte of each result element of the
28642 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28643 // element.
28644 SDValue Undef = DAG.getUNDEF(VT);
28645 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28646 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28647
28648 SDValue BLo, BHi;
28649 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28650 // If the RHS is a constant, manually unpackl/unpackh.
28651 SmallVector<SDValue, 16> LoOps, HiOps;
28652 for (unsigned i = 0; i != NumElts; i += 16) {
28653 for (unsigned j = 0; j != 8; ++j) {
28654 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28655 MVT::i16));
28656 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28657 MVT::i16));
28658 }
28659 }
28660
28661 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28662 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28663 } else {
28664 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28665 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28666 }
28667
28668 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28669 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28670 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28671 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28672 }
28673
28674 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28675 if (VT == MVT::v4i32) {
28676    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28677           "Should not custom lower when pmulld is available!");
28678
28679 // Extract the odd parts.
28680 static const int UnpackMask[] = { 1, -1, 3, -1 };
28681 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28682 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28683
28684 // Multiply the even parts.
28685 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28686 DAG.getBitcast(MVT::v2i64, A),
28687 DAG.getBitcast(MVT::v2i64, B));
28688 // Now multiply odd parts.
28689 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28690 DAG.getBitcast(MVT::v2i64, Aodds),
28691 DAG.getBitcast(MVT::v2i64, Bodds));
28692
28693 Evens = DAG.getBitcast(VT, Evens);
28694 Odds = DAG.getBitcast(VT, Odds);
28695
28696 // Merge the two vectors back together with a shuffle. This expands into 2
28697 // shuffles.
28698 static const int ShufMask[] = { 0, 4, 2, 6 };
28699 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28700 }
28701
28702 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28703        "Only know how to lower V2I64/V4I64/V8I64 multiply");
28704  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28705
28706 // Ahi = psrlqi(a, 32);
28707 // Bhi = psrlqi(b, 32);
28708 //
28709 // AloBlo = pmuludq(a, b);
28710 // AloBhi = pmuludq(a, Bhi);
28711 // AhiBlo = pmuludq(Ahi, b);
28712 //
28713 // Hi = psllqi(AloBhi + AhiBlo, 32);
28714 // return AloBlo + Hi;
28715 KnownBits AKnown = DAG.computeKnownBits(A);
28716 KnownBits BKnown = DAG.computeKnownBits(B);
28717
28718 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28719 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28720 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28721
28722 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28723 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28724 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28725
28726 SDValue Zero = DAG.getConstant(0, dl, VT);
28727
28728 // Only multiply lo/hi halves that aren't known to be zero.
28729 SDValue AloBlo = Zero;
28730 if (!ALoIsZero && !BLoIsZero)
28731 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28732
28733 SDValue AloBhi = Zero;
28734 if (!ALoIsZero && !BHiIsZero) {
28735 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28736 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28737 }
28738
28739 SDValue AhiBlo = Zero;
28740 if (!AHiIsZero && !BLoIsZero) {
28741 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28742 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28743 }
28744
28745 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28746 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28747
28748 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28749}
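
The comment block spells out the usual 64x64 multiply decomposition into 32-bit halves; a scalar sketch that mirrors it with plain uint64_t arithmetic in place of the PMULUDQ and shift nodes (hypothetical helper):

#include <cassert>
#include <cstdint>

// Scalar model of the v2i64/v4i64/v8i64 multiply lowering:
//   lo64(A * B) = A.lo*B.lo + ((A.lo*B.hi + A.hi*B.lo) << 32)
static uint64_t mul64ViaPmuludq(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;           // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;           // pmuludq(a, Bhi)
  uint64_t AhiBlo = AHi * BLo;           // pmuludq(Ahi, b)
  uint64_t Hi = (AloBhi + AhiBlo) << 32; // psllqi(AloBhi + AhiBlo, 32)
  return AloBlo + Hi;
}

static void checkMul64() {
  uint64_t A = 0x123456789abcdef0ull, B = 0x0fedcba987654321ull;
  assert(mul64ViaPmuludq(A, B) == A * B);
}
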
28750
28751static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28752 MVT VT, bool IsSigned,
28753 const X86Subtarget &Subtarget,
28754 SelectionDAG &DAG,
28755 SDValue *Low = nullptr) {
28756 unsigned NumElts = VT.getVectorNumElements();
28757
28758 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28759 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28760 // lane results back together.
28761
28762 // We'll take different approaches for signed and unsigned.
28763 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28764 // and use pmullw to calculate the full 16-bit product.
28765 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28766 // shift them left into the upper byte of each word. This allows us to use
28767 // pmulhw to calculate the full 16-bit product. This trick means we don't
28768 // need to sign extend the bytes to use pmullw.
28769
28770 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28771 SDValue Zero = DAG.getConstant(0, dl, VT);
28772
28773 SDValue ALo, AHi;
28774 if (IsSigned) {
28775 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28776 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28777 } else {
28778 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28779 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28780 }
28781
28782 SDValue BLo, BHi;
28783 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28784 // If the RHS is a constant, manually unpackl/unpackh and extend.
28785 SmallVector<SDValue, 16> LoOps, HiOps;
28786 for (unsigned i = 0; i != NumElts; i += 16) {
28787 for (unsigned j = 0; j != 8; ++j) {
28788 SDValue LoOp = B.getOperand(i + j);
28789 SDValue HiOp = B.getOperand(i + j + 8);
28790
28791 if (IsSigned) {
28792 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28793 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28794 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28795 DAG.getConstant(8, dl, MVT::i16));
28796 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28797 DAG.getConstant(8, dl, MVT::i16));
28798 } else {
28799 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28800 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28801 }
28802
28803 LoOps.push_back(LoOp);
28804 HiOps.push_back(HiOp);
28805 }
28806 }
28807
28808 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28809 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28810 } else if (IsSigned) {
28811 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28812 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28813 } else {
28814 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28815 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28816 }
28817
28818 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28819 // pack back to vXi8.
28820 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28821 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28822 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28823
28824 if (Low)
28825 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28826
28827 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28828}
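
The signed path works because unpacking against zero leaves each byte in the upper half of its 16-bit word, so pmulhw of the two shifted words is exactly the 16-bit signed product of the original bytes. A one-lane scalar check (hypothetical helper; relies on ordinary two's-complement narrowing):

#include <cassert>
#include <cstdint>

// One-lane model of the signed path above: place each byte in the upper byte
// of a word, then take the high 16 bits of the 16x16 signed multiply (pmulhw).
static int16_t mulI8ViaPmulhw(int8_t A, int8_t B) {
  int16_t WA = (int16_t)((uint16_t)(uint8_t)A << 8); // A in bits 15:8
  int16_t WB = (int16_t)((uint16_t)(uint8_t)B << 8);
  int32_t Prod = (int32_t)WA * (int32_t)WB;          // (A*256) * (B*256)
  return (int16_t)(Prod >> 16);                      // == A*B, sign and all
}

static void checkMulI8() {
  for (int A = -128; A < 128; ++A)
    for (int B = -128; B < 128; ++B)
      assert(mulI8ViaPmulhw((int8_t)A, (int8_t)B) == (int16_t)(A * B));
}

This is why the signed path needs no explicit sign extension before the multiply.
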
28829
28830static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28831 SelectionDAG &DAG) {
28832 SDLoc dl(Op);
28833 MVT VT = Op.getSimpleValueType();
28834 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28835 unsigned NumElts = VT.getVectorNumElements();
28836 SDValue A = Op.getOperand(0);
28837 SDValue B = Op.getOperand(1);
28838
28839 // Decompose 256-bit ops into 128-bit ops.
28840 if (VT.is256BitVector() && !Subtarget.hasInt256())
28841 return splitVectorIntBinary(Op, DAG);
28842
28843 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28844 return splitVectorIntBinary(Op, DAG);
28845
28846 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28847    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28848           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28849           (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28850
28851 // PMULxD operations multiply each even value (starting at 0) of LHS with
28852    // the related value of RHS and produce a widened result.
28853 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28854 // => <2 x i64> <ae|cg>
28855 //
28856    // In other words, to have all the results, we need to perform two PMULxD:
28857 // 1. one with the even values.
28858 // 2. one with the odd values.
28859    // To achieve #2, we need to place the odd values at an even position.
28860 //
28861 // Place the odd value at an even position (basically, shift all values 1
28862 // step to the left):
28863 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28864 9, -1, 11, -1, 13, -1, 15, -1};
28865 // <a|b|c|d> => <b|undef|d|undef>
28866 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
28867 makeArrayRef(&Mask[0], NumElts));
28868 // <e|f|g|h> => <f|undef|h|undef>
28869 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
28870 makeArrayRef(&Mask[0], NumElts));
28871
28872 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28873 // ints.
28874 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28875 unsigned Opcode =
28876 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28877 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28878 // => <2 x i64> <ae|cg>
28879 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28880 DAG.getBitcast(MulVT, A),
28881 DAG.getBitcast(MulVT, B)));
28882 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28883 // => <2 x i64> <bf|dh>
28884 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28885 DAG.getBitcast(MulVT, Odd0),
28886 DAG.getBitcast(MulVT, Odd1)));
28887
28888 // Shuffle it back into the right order.
28889 SmallVector<int, 16> ShufMask(NumElts);
28890 for (int i = 0; i != (int)NumElts; ++i)
28891 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28892
28893 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28894
28895 // If we have a signed multiply but no PMULDQ fix up the result of an
28896 // unsigned multiply.
28897 if (IsSigned && !Subtarget.hasSSE41()) {
28898 SDValue Zero = DAG.getConstant(0, dl, VT);
28899 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28900 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28901 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28902 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28903
28904 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28905 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28906 }
28907
28908 return Res;
28909 }
28910
28911 // Only i8 vectors should need custom lowering after this.
28912 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28913         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28914        "Unsupported vector type");
28915
28916 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28917 // logical shift down the upper half and pack back to i8.
28918
28919 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28920 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28921
28922 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28923 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28924 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28925 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28926 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28927 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28928 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28929 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28930 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28931 }
28932
28933 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28934}
28935
28936// Custom lowering for SMULO/UMULO.
28937static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28938 SelectionDAG &DAG) {
28939 MVT VT = Op.getSimpleValueType();
28940
28941 // Scalars defer to LowerXALUO.
28942 if (!VT.isVector())
28943 return LowerXALUO(Op, DAG);
28944
28945 SDLoc dl(Op);
28946 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28947 SDValue A = Op.getOperand(0);
28948 SDValue B = Op.getOperand(1);
28949 EVT OvfVT = Op->getValueType(1);
28950
28951 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28952 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28953 // Extract the LHS Lo/Hi vectors
28954 SDValue LHSLo, LHSHi;
28955 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28956
28957 // Extract the RHS Lo/Hi vectors
28958 SDValue RHSLo, RHSHi;
28959 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28960
28961 EVT LoOvfVT, HiOvfVT;
28962 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28963 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28964 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28965
28966 // Issue the split operations.
28967 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28968 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28969
28970 // Join the separate data results and the overflow results.
28971 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28972 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28973 Hi.getValue(1));
28974
28975 return DAG.getMergeValues({Res, Ovf}, dl);
28976 }
28977
28978 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28979 EVT SetccVT =
28980 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28981
28982 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28983 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28984 unsigned NumElts = VT.getVectorNumElements();
28985 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28986 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28987 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28988 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28989 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28990
28991 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28992
28993 SDValue Ovf;
28994 if (IsSigned) {
28995 SDValue High, LowSign;
28996 if (OvfVT.getVectorElementType() == MVT::i1 &&
28997 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28998        // Rather than truncating, try to do the compare on vXi16 or vXi32.
28999 // Shift the high down filling with sign bits.
29000 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29001 // Fill all 16 bits with the sign bit from the low.
29002 LowSign =
29003 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29004 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29005 15, DAG);
29006 SetccVT = OvfVT;
29007 if (!Subtarget.hasBWI()) {
29008 // We can't do a vXi16 compare so sign extend to v16i32.
29009 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29010 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29011 }
29012 } else {
29013 // Otherwise do the compare at vXi8.
29014 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29015 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29016 LowSign =
29017 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29018 }
29019
29020 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29021 } else {
29022 SDValue High =
29023 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29024 if (OvfVT.getVectorElementType() == MVT::i1 &&
29025 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29026        // Rather than truncating, try to do the compare on vXi16 or vXi32.
29027 SetccVT = OvfVT;
29028 if (!Subtarget.hasBWI()) {
29029 // We can't do a vXi16 compare so sign extend to v16i32.
29030 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
29031 }
29032 } else {
29033 // Otherwise do the compare at vXi8.
29034 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29035 }
29036
29037 Ovf =
29038 DAG.getSetCC(dl, SetccVT, High,
29039 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
29040 }
29041
29042 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29043
29044 return DAG.getMergeValues({Low, Ovf}, dl);
29045 }
29046
29047 SDValue Low;
29048 SDValue High =
29049 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
29050
29051 SDValue Ovf;
29052 if (IsSigned) {
29053 // SMULO overflows if the high bits don't match the sign of the low.
29054 SDValue LowSign =
29055 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29056 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29057 } else {
29058 // UMULO overflows if the high bits are non-zero.
29059 Ovf =
29060 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
29061 }
29062
29063 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29064
29065 return DAG.getMergeValues({Low, Ovf}, dl);
29066}
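
Both overflow tests at the end compare the high half of the widened product against either zero or the sign-fill of the low half. A scalar 8-bit model of the two checks (hypothetical helpers):

#include <cassert>
#include <cstdint>

// Scalar model of the vXi8 SMULO/UMULO overflow checks above.
static bool smulo8(int8_t A, int8_t B, int8_t &Low) {
  int16_t Prod = (int16_t)A * (int16_t)B;
  Low = (int8_t)Prod;
  int8_t High = (int8_t)(Prod >> 8);
  int8_t LowSign = (int8_t)(Low >> 7); // 0 or -1: sign-fill of the low half
  return High != LowSign;              // overflow iff high != sign of low
}

static bool umulo8(uint8_t A, uint8_t B, uint8_t &Low) {
  uint16_t Prod = (uint16_t)((unsigned)A * B);
  Low = (uint8_t)Prod;
  return (uint8_t)(Prod >> 8) != 0;    // overflow iff the high half is non-zero
}

static void checkMulo8() {
  int8_t L;
  assert(!smulo8(10, 12, L) && L == 120);
  assert(smulo8(16, 8, L)); // 128 does not fit in int8_t
  uint8_t U;
  assert(!umulo8(16, 15, U) && U == 240);
  assert(umulo8(16, 16, U)); // 256 overflows
}
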
29067
29068SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
29069  assert(Subtarget.isTargetWin64() && "Unexpected target");
29070 EVT VT = Op.getValueType();
29071 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29072        "Unexpected return type for lowering");
29073
29074 RTLIB::Libcall LC;
29075 bool isSigned;
29076 switch (Op->getOpcode()) {
29077  default: llvm_unreachable("Unexpected request for libcall!");
29078 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
29079 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
29080 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
29081 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
29082 }
29083
29084 SDLoc dl(Op);
29085 SDValue InChain = DAG.getEntryNode();
29086
29087 TargetLowering::ArgListTy Args;
29088 TargetLowering::ArgListEntry Entry;
29089 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29090 EVT ArgVT = Op->getOperand(i).getValueType();
29091 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29092 "Unexpected argument type for lowering");
29093 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29094 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29095 MachinePointerInfo MPI =
29096 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29097 Entry.Node = StackPtr;
29098 InChain =
29099 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29100 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
29101 Entry.Ty = PointerType::get(ArgTy,0);
29102 Entry.IsSExt = false;
29103 Entry.IsZExt = false;
29104 Args.push_back(Entry);
29105 }
29106
29107 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
29108 getPointerTy(DAG.getDataLayout()));
29109
29110 TargetLowering::CallLoweringInfo CLI(DAG);
29111 CLI.setDebugLoc(dl)
29112 .setChain(InChain)
29113 .setLibCallee(
29114 getLibcallCallingConv(LC),
29115 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
29116 std::move(Args))
29117 .setInRegister()
29118 .setSExtResult(isSigned)
29119 .setZExtResult(!isSigned);
29120
29121 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29122 return DAG.getBitcast(VT, CallInfo.first);
29123}
29124
29125SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29126 SelectionDAG &DAG,
29127 SDValue &Chain) const {
29128 assert(Subtarget.isTargetWin64() && "Unexpected target");
29129 EVT VT = Op.getValueType();
29130 bool IsStrict = Op->isStrictFPOpcode();
29131
29132 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29133 EVT ArgVT = Arg.getValueType();
29134
29135 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29136 "Unexpected return type for lowering");
29137
29138 RTLIB::Libcall LC;
29139 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29140 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29141 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29142 else
29143 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29144 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29145
29146 SDLoc dl(Op);
29147 MakeLibCallOptions CallOptions;
29148 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29149
29150 SDValue Result;
29151 // The i128 result is returned as a v2i64 in xmm0; cast it back to the
29152 // expected VT (i128).
29153 std::tie(Result, Chain) =
29154 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29155 Result = DAG.getBitcast(VT, Result);
29156 return Result;
29157}
29158
29159SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29160 SelectionDAG &DAG) const {
29161 assert(Subtarget.isTargetWin64() && "Unexpected target");
29162 EVT VT = Op.getValueType();
29163 bool IsStrict = Op->isStrictFPOpcode();
29164
29165 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29166 EVT ArgVT = Arg.getValueType();
29167
29168 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29169 "Unexpected argument type for lowering");
29170
29171 RTLIB::Libcall LC;
29172 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29173 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29174 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29175 else
29176 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29177 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29178
29179 SDLoc dl(Op);
29180 MakeLibCallOptions CallOptions;
29181 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29182
29183 // Pass the i128 argument as an indirect argument on the stack.
29184 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29185 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29186 MachinePointerInfo MPI =
29187 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29188 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29189
29190 SDValue Result;
29191 std::tie(Result, Chain) =
29192 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29193 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29194}
29195
29196// Return true if the required (according to Opcode) shift-imm form is natively
29197// supported by the Subtarget
29198static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
29199 unsigned Opcode) {
29200 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29201 return false;
29202
29203 if (VT.getScalarSizeInBits() < 16)
29204 return false;
29205
29206 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29207 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29208 return true;
29209
29210 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29211 (VT.is256BitVector() && Subtarget.hasInt256());
29212
29213 bool AShift = LShift && (Subtarget.hasAVX512() ||
29214 (VT != MVT::v2i64 && VT != MVT::v4i64));
29215 return (Opcode == ISD::SRA) ? AShift : LShift;
29216}
29217
29218// The shift amount is a variable, but it is the same for all vector lanes.
29219// These instructions are defined together with shift-immediate.
29220static
29221bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
29222 unsigned Opcode) {
29223 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29224}
29225
29226// Return true if the required (according to Opcode) variable-shift form is
29227// natively supported by the Subtarget
29228static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
29229 unsigned Opcode) {
29230 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29231 return false;
29232
29233 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29234 return false;
29235
29236 // vXi16 supported only on AVX-512, BWI
29237 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29238 return false;
29239
29240 if (Subtarget.hasAVX512() &&
29241 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29242 return true;
29243
29244 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29245 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29246 return (Opcode == ISD::SRA) ? AShift : LShift;
29247}
29248
29249static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29250 const X86Subtarget &Subtarget) {
29251 MVT VT = Op.getSimpleValueType();
29252 SDLoc dl(Op);
29253 SDValue R = Op.getOperand(0);
29254 SDValue Amt = Op.getOperand(1);
29255 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29256
29257 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29258 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29259 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29260 SDValue Ex = DAG.getBitcast(ExVT, R);
29261
29262 // ashr(R, 63) === cmp_slt(R, 0)
29263 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29264 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29265 "Unsupported PCMPGT op");
29266 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29267 }
29268
29269 if (ShiftAmt >= 32) {
29270 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29271 SDValue Upper =
29272 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29273 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29274 ShiftAmt - 32, DAG);
29275 if (VT == MVT::v2i64)
29276 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29277 if (VT == MVT::v4i64)
29278 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29279 {9, 1, 11, 3, 13, 5, 15, 7});
29280 } else {
29281 // SRA upper i32, SRL whole i64 and select lower i32.
29282 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29283 ShiftAmt, DAG);
29284 SDValue Lower =
29285 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29286 Lower = DAG.getBitcast(ExVT, Lower);
29287 if (VT == MVT::v2i64)
29288 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29289 if (VT == MVT::v4i64)
29290 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29291 {8, 1, 10, 3, 12, 5, 14, 7});
29292 }
29293 return DAG.getBitcast(VT, Ex);
29294 };
29295
29296 // Optimize shl/srl/sra with constant shift amount.
29297 APInt APIntShiftAmt;
29298 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29299 return SDValue();
29300
29301 // If the shift amount is out of range, return undef.
29302 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
29303 return DAG.getUNDEF(VT);
29304
29305 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29306
29307 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
29308 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29309
29310 // i64 SRA needs to be performed as partial shifts.
29311 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29312 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29313 Op.getOpcode() == ISD::SRA)
29314 return ArithmeticShiftRight64(ShiftAmt);
29315
29316 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29317 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29318 unsigned NumElts = VT.getVectorNumElements();
29319 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29320
29321 // Simple i8 add case
29322 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29323 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29324 // must be 0). (add undef, undef) however can be any value. To make this
29325 // safe, we must freeze R to ensure that register allocation uses the same
29326 // register for an undefined value. This ensures that the result will
29327 // still be even and preserves the original semantics.
29328 R = DAG.getNode(ISD::FREEZE, dl, VT, R);
29329 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29330 }
29331
29332 // ashr(R, 7) === cmp_slt(R, 0)
29333 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29334 SDValue Zeros = DAG.getConstant(0, dl, VT);
29335 if (VT.is512BitVector()) {
29336 assert(VT == MVT::v64i8 && "Unexpected element type!");
29337 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29338 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29339 }
29340 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29341 }
29342
29343 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29344 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29345 return SDValue();
29346
29347 if (Op.getOpcode() == ISD::SHL) {
29348 // Make a large shift.
29349 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29350 ShiftAmt, DAG);
29351 SHL = DAG.getBitcast(VT, SHL);
29352 // Zero out the rightmost bits.
29353 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29354 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29355 }
29356 if (Op.getOpcode() == ISD::SRL) {
29357 // Make a large shift.
29358 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29359 ShiftAmt, DAG);
29360 SRL = DAG.getBitcast(VT, SRL);
29361 // Zero out the leftmost bits.
29362 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29363 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29364 }
29365 if (Op.getOpcode() == ISD::SRA) {
29366 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
29367 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29368
29369 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29370 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29371 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29372 return Res;
29373 }
29374 llvm_unreachable("Unknown shift opcode.");
29375 }
29376
29377 return SDValue();
29378}
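
The vXi8 SRA path above relies on the identity ashr(R, Amt) == sub(xor(lshr(R, Amt), Mask), Mask) with Mask = 128 >> Amt: the logical shift brings the sign bit down to bit (7 - Amt), and the xor/sub pair turns it back into a sign extension. A minimal scalar sketch for i8 (the helper name is hypothetical; two's-complement wrap and arithmetic >> on signed values are assumed):

#include <cassert>
#include <cstdint>

// Hypothetical scalar check of the vXi8 SRA identity used above:
//   ashr(x, s) == sub(xor(lshr(x, s), m), m)  with  m = 0x80 >> s
static int8_t ashr_via_lshr(uint8_t x, unsigned s) {
  uint8_t shifted = uint8_t(x >> s);               // logical shift right
  uint8_t mask = uint8_t(0x80u >> s);              // sign bit, shifted down
  return int8_t(uint8_t((shifted ^ mask) - mask)); // xor/sub re-extends the sign
}

int main() {
  for (int v = -128; v <= 127; ++v)
    for (unsigned s = 0; s < 8; ++s) {
      int8_t expect = int8_t(int8_t(v) >> s);      // reference arithmetic shift
      assert(ashr_via_lshr(uint8_t(int8_t(v)), s) == expect);
    }
  return 0;
}
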
29379
29380static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29381 const X86Subtarget &Subtarget) {
29382 MVT VT = Op.getSimpleValueType();
29383 SDLoc dl(Op);
29384 SDValue R = Op.getOperand(0);
29385 SDValue Amt = Op.getOperand(1);
29386 unsigned Opcode = Op.getOpcode();
29387 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29388
29389 int BaseShAmtIdx = -1;
29390 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29391 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29392 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29393 Subtarget, DAG);
29394
29395 // vXi8 shifts - shift as v8i16 + mask result.
29396 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29397 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29398 VT == MVT::v64i8) &&
29399 !Subtarget.hasXOP()) {
29400 unsigned NumElts = VT.getVectorNumElements();
29401 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29402 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29403 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29404 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29405
29406 // Create the mask using vXi16 shifts. For shift-rights we need to move
29407 // the upper byte down before splatting the vXi8 mask.
29408 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29409 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29410 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29411 if (Opcode != ISD::SHL)
29412 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29413 8, DAG);
29414 BitMask = DAG.getBitcast(VT, BitMask);
29415 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29416 SmallVector<int, 64>(NumElts, 0));
29417
29418 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29419 DAG.getBitcast(ExtVT, R), BaseShAmt,
29420 BaseShAmtIdx, Subtarget, DAG);
29421 Res = DAG.getBitcast(VT, Res);
29422 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29423
29424 if (Opcode == ISD::SRA) {
29425 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29426 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29427 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29428 SignMask =
29429 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29430 BaseShAmtIdx, Subtarget, DAG);
29431 SignMask = DAG.getBitcast(VT, SignMask);
29432 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29433 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29434 }
29435 return Res;
29436 }
29437 }
29438 }
29439
29440 return SDValue();
29441}
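
The vXi8 branch of LowerShiftByScalarVariable performs a single wide vXi16 shift and then masks away the bits that crossed a byte boundary, with the mask itself produced by shifting an all-ones vector by the same amount. A minimal scalar sketch of the same idea on two i8 lanes packed into one u16, shown for SRL only (helper name hypothetical):

#include <cassert>
#include <cstdint>

// Hypothetical scalar model of the "shift as v8i16 + mask" trick above: two
// i8 lanes packed into one u16 are shifted together, then an AND with a
// per-byte mask clears the bits that leaked across the byte boundary.
static uint16_t srl_pair_via_i16(uint8_t lo, uint8_t hi, unsigned s) {
  uint16_t pair = uint16_t(uint16_t(lo) | (uint16_t(hi) << 8)); // two byte lanes
  uint16_t shifted = uint16_t(pair >> s);                       // one wide shift
  uint8_t byteMask = uint8_t(0xFFu >> s);                       // bits that stay in-lane
  uint16_t mask = uint16_t(uint16_t(byteMask) | (uint16_t(byteMask) << 8));
  return uint16_t(shifted & mask);                              // per-lane logical shift
}

int main() {
  for (unsigned s = 0; s < 8; ++s)
    for (unsigned lo = 0; lo < 256; lo += 17)
      for (unsigned hi = 0; hi < 256; hi += 23) {
        uint16_t expect = uint16_t(uint16_t(uint8_t(lo) >> s) |
                                   (uint16_t(uint8_t(hi) >> s) << 8));
        assert(srl_pair_via_i16(uint8_t(lo), uint8_t(hi), s) == expect);
      }
  return 0;
}
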
29442
29443// Convert a shift/rotate left amount to a multiplication scale factor.
29444static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29445 const X86Subtarget &Subtarget,
29446 SelectionDAG &DAG) {
29447 MVT VT = Amt.getSimpleValueType();
29448 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29449 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29450 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29451 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29452 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29453 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29454 return SDValue();
29455
29456 MVT SVT = VT.getVectorElementType();
29457 unsigned SVTBits = SVT.getSizeInBits();
29458 unsigned NumElems = VT.getVectorNumElements();
29459
29460 APInt UndefElts;
29461 SmallVector<APInt> EltBits;
29462 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29463 APInt One(SVTBits, 1);
29464 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29465 for (unsigned I = 0; I != NumElems; ++I) {
29466 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29467 continue;
29468 uint64_t ShAmt = EltBits[I].getZExtValue();
29469 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29470 }
29471 return DAG.getBuildVector(VT, dl, Elts);
29472 }
29473
29474 // If the target doesn't support variable shifts, use either FP conversion
29475 // or integer multiplication to avoid shifting each element individually.
29476 if (VT == MVT::v4i32) {
29477 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29478 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29479 DAG.getConstant(0x3f800000U, dl, VT));
29480 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29481 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29482 }
29483
29484 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29485 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29486 SDValue Z = DAG.getConstant(0, dl, VT);
29487 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29488 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29489 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29490 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29491 if (Subtarget.hasSSE41())
29492 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29493 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29494 }
29495
29496 return SDValue();
29497}
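
The v4i32 branch of convertShiftLeftToScale builds 2^Amt without a variable shift: adding (Amt << 23) to 0x3f800000 writes Amt into the exponent field of an IEEE-754 binary32 (bias 127), so converting the bitcast float back to an integer yields the scale factor. A minimal scalar sketch of that step (helper name hypothetical; assumes IEEE-754 binary32 and amounts small enough for the integer conversion):

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical scalar model of the v4i32 path above: the shift amount becomes
// the float exponent, and FP_TO_SINT recovers 2^amt, so shl becomes a multiply.
static uint32_t pow2_via_float(uint32_t amt) { // valid for amt in [0, 30]
  uint32_t bits = (amt << 23) + 0x3f800000u;   // exponent = 127 + amt, mantissa 0
  float f;
  std::memcpy(&f, &bits, sizeof(f));           // the "bitcast"
  return uint32_t(int32_t(f));                 // the FP_TO_SINT step
}

int main() {
  for (uint32_t amt = 0; amt <= 30; ++amt)
    assert(pow2_via_float(amt) == (1u << amt));
  return 0;
}
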
29498
29499static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29500 SelectionDAG &DAG) {
29501 MVT VT = Op.getSimpleValueType();
29502 SDLoc dl(Op);
29503 SDValue R = Op.getOperand(0);
29504 SDValue Amt = Op.getOperand(1);
29505 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29506 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29507
29508 unsigned Opc = Op.getOpcode();
29509 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29510 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29511
29512 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29513 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29514
29515 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29516 return V;
29517
29518 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29519 return V;
29520
29521 if (supportedVectorVarShift(VT, Subtarget, Opc))
29522 return Op;
29523
29524 // i64 vector arithmetic shift can be emulated with the transform:
29525 // M = lshr(SIGN_MASK, Amt)
29526 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29527 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29528 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29529 Opc == ISD::SRA) {
29530 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29531 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29532 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29533 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29534 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29535 return R;
29536 }
29537
29538 // XOP has 128-bit variable logical/arithmetic shifts.
29539 // +ve/-ve Amt = shift left/right.
29540 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29541 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29542 if (Opc == ISD::SRL || Opc == ISD::SRA) {
29543 SDValue Zero = DAG.getConstant(0, dl, VT);
29544 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29545 }
29546 if (Opc == ISD::SHL || Opc == ISD::SRL)
29547 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29548 if (Opc == ISD::SRA)
29549 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29550 }
29551
29552 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29553 // shifts per-lane and then shuffle the partial results back together.
29554 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29555 // Splat the shift amounts so the scalar shifts above will catch it.
29556 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29557 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29558 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29559 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29560 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29561 }
29562
29563 // If possible, lower this shift as a sequence of two shifts by
29564 // constant plus a BLENDing shuffle instead of scalarizing it.
29565 // Example:
29566 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29567 //
29568 // Could be rewritten as:
29569 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29570 //
29571 // The advantage is that the two shifts from the example would be
29572 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29573 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29574 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29575 SDValue Amt1, Amt2;
29576 unsigned NumElts = VT.getVectorNumElements();
29577 SmallVector<int, 8> ShuffleMask;
29578 for (unsigned i = 0; i != NumElts; ++i) {
29579 SDValue A = Amt->getOperand(i);
29580 if (A.isUndef()) {
29581 ShuffleMask.push_back(SM_SentinelUndef);
29582 continue;
29583 }
29584 if (!Amt1 || Amt1 == A) {
29585 ShuffleMask.push_back(i);
29586 Amt1 = A;
29587 continue;
29588 }
29589 if (!Amt2 || Amt2 == A) {
29590 ShuffleMask.push_back(i + NumElts);
29591 Amt2 = A;
29592 continue;
29593 }
29594 break;
29595 }
29596
29597 // Only perform this blend if we can perform it without loading a mask.
29598 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29599 (VT != MVT::v16i16 ||
29600 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29601 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29602 canWidenShuffleElements(ShuffleMask))) {
29603 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29604 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29605 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29606 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29607 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29608 Cst1->getZExtValue(), DAG);
29609 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29610 Cst2->getZExtValue(), DAG);
29611 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29612 }
29613 }
29614 }
29615
29616 // If possible, lower this packed shift into a vector multiply instead of
29617 // expanding it into a sequence of scalar shifts.
29618 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29619 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29620 Subtarget.canExtendTo512BW())))
29621 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29622 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29623
29624 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29625 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29626 if (Opc == ISD::SRL && ConstantAmt &&
29627 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29628 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29629 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29630 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29631 SDValue Zero = DAG.getConstant(0, dl, VT);
29632 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29633 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29634 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29635 }
29636 }
29637
29638 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29639 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29640 // TODO: Special case handling for shift by 0/1, really we can afford either
29641 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29642 if (Opc == ISD::SRA && ConstantAmt &&
29643 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29644 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29645 !Subtarget.hasAVX512()) ||
29646 DAG.isKnownNeverZero(Amt))) {
29647 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29648 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29649 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29650 SDValue Amt0 =
29651 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29652 SDValue Amt1 =
29653 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29654 SDValue Sra1 =
29655 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29656 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29657 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29658 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29659 }
29660 }
29661
29662 // v4i32 Non Uniform Shifts.
29663 // If the shift amount is constant we can shift each lane using the SSE2
29664 // immediate shifts, else we need to zero-extend each lane to the lower i64
29665 // and shift using the SSE2 variable shifts.
29666 // The separate results can then be blended together.
29667 if (VT == MVT::v4i32) {
29668 SDValue Amt0, Amt1, Amt2, Amt3;
29669 if (ConstantAmt) {
29670 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29671 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29672 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29673 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29674 } else {
29675 // The SSE2 shifts use the lower i64 as the same shift amount for
29676 // all lanes and the upper i64 is ignored. On AVX we're better off
29677 // just zero-extending, but for SSE just duplicating the top 16-bits is
29678 // cheaper and has the same effect for out of range values.
29679 if (Subtarget.hasAVX()) {
29680 SDValue Z = DAG.getConstant(0, dl, VT);
29681 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29682 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29683 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29684 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29685 } else {
29686 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29687 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29688 {4, 5, 6, 7, -1, -1, -1, -1});
29689 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29690 {0, 1, 1, 1, -1, -1, -1, -1});
29691 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29692 {2, 3, 3, 3, -1, -1, -1, -1});
29693 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29694 {0, 1, 1, 1, -1, -1, -1, -1});
29695 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29696 {2, 3, 3, 3, -1, -1, -1, -1});
29697 }
29698 }
29699
29700 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29701 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29702 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29703 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29704 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29705
29706 // Merge the shifted lane results optimally with/without PBLENDW.
29707 // TODO - ideally shuffle combining would handle this.
29708 if (Subtarget.hasSSE41()) {
29709 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29710 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29711 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29712 }
29713 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29714 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29715 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29716 }
29717
29718 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29719 // types, but without AVX512 the extra overhead of getting from vXi8 to vXi32
29720 // makes the existing SSE solution better.
29721 // NOTE: We honor the preferred vector width before promoting to 512 bits.
29722 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29723 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29724 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29725 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29726 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29727 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29728 "Unexpected vector type");
29729 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29730 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29731 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29732 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29733 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29734 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29735 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29736 }
29737
29738 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29739 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29740 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29741 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29742 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29743 !Subtarget.hasXOP()) {
29744 int NumElts = VT.getVectorNumElements();
29745 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29746
29747 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29748 // isn't legal).
29749 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29750 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29751 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29752 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29753 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29754 "Constant build vector expected");
29755
29756 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29757 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
29758 : DAG.getZExtOrTrunc(R, dl, ExVT);
29759 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29760 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29761 return DAG.getZExtOrTrunc(R, dl, VT);
29762 }
29763
29764 SmallVector<SDValue, 16> LoAmt, HiAmt;
29765 for (int i = 0; i != NumElts; i += 16) {
29766 for (int j = 0; j != 8; ++j) {
29767 LoAmt.push_back(Amt.getOperand(i + j));
29768 HiAmt.push_back(Amt.getOperand(i + j + 8));
29769 }
29770 }
29771
29772 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29773 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29774 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29775
29776 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29777 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29778 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29779 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29780 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29781 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29782 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29783 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29784 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29785 }
29786
29787 if (VT == MVT::v16i8 ||
29788 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29789 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29790 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29791
29792 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29793 if (VT.is512BitVector()) {
29794 // On AVX512BW targets we make use of the fact that VSELECT lowers
29795 // to a masked blend which selects bytes based just on the sign bit
29796 // extracted to a mask.
29797 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29798 V0 = DAG.getBitcast(VT, V0);
29799 V1 = DAG.getBitcast(VT, V1);
29800 Sel = DAG.getBitcast(VT, Sel);
29801 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29802 ISD::SETGT);
29803 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29804 } else if (Subtarget.hasSSE41()) {
29805 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29806 // on the sign bit.
29807 V0 = DAG.getBitcast(VT, V0);
29808 V1 = DAG.getBitcast(VT, V1);
29809 Sel = DAG.getBitcast(VT, Sel);
29810 return DAG.getBitcast(SelVT,
29811 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29812 }
29813 // On pre-SSE41 targets we test for the sign bit by comparing to
29814 // zero - a negative value will set all bits of the lanes to true
29815 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29816 SDValue Z = DAG.getConstant(0, dl, SelVT);
29817 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29818 return DAG.getSelect(dl, SelVT, C, V0, V1);
29819 };
29820
29821 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29822 // We can safely do this using i16 shifts as we're only interested in
29823 // the 3 lower bits of each byte.
29824 Amt = DAG.getBitcast(ExtVT, Amt);
29825 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29826 Amt = DAG.getBitcast(VT, Amt);
29827
29828 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29829 // r = VSELECT(r, shift(r, 4), a);
29830 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29831 R = SignBitSelect(VT, Amt, M, R);
29832
29833 // a += a
29834 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29835
29836 // r = VSELECT(r, shift(r, 2), a);
29837 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29838 R = SignBitSelect(VT, Amt, M, R);
29839
29840 // a += a
29841 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29842
29843 // return VSELECT(r, shift(r, 1), a);
29844 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29845 R = SignBitSelect(VT, Amt, M, R);
29846 return R;
29847 }
29848
29849 if (Opc == ISD::SRA) {
29850 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29851 // so we can correctly sign extend. We don't care what happens to the
29852 // lower byte.
29853 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29854 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29855 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29856 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29857 ALo = DAG.getBitcast(ExtVT, ALo);
29858 AHi = DAG.getBitcast(ExtVT, AHi);
29859 RLo = DAG.getBitcast(ExtVT, RLo);
29860 RHi = DAG.getBitcast(ExtVT, RHi);
29861
29862 // r = VSELECT(r, shift(r, 4), a);
29863 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29864 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29865 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29866 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29867
29868 // a += a
29869 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29870 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29871
29872 // r = VSELECT(r, shift(r, 2), a);
29873 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29874 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29875 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29876 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29877
29878 // a += a
29879 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29880 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29881
29882 // r = VSELECT(r, shift(r, 1), a);
29883 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29884 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29885 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29886 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29887
29888 // Logical shift the result back to the lower byte, leaving a zero upper
29889 // byte meaning that we can safely pack with PACKUSWB.
29890 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29891 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29892 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29893 }
29894 }
29895
29896 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29897 MVT ExtVT = MVT::v8i32;
29898 SDValue Z = DAG.getConstant(0, dl, VT);
29899 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29900 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29901 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29902 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29903 ALo = DAG.getBitcast(ExtVT, ALo);
29904 AHi = DAG.getBitcast(ExtVT, AHi);
29905 RLo = DAG.getBitcast(ExtVT, RLo);
29906 RHi = DAG.getBitcast(ExtVT, RHi);
29907 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29908 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29909 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29910 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29911 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29912 }
29913
29914 if (VT == MVT::v8i16) {
29915 // If we have a constant shift amount, the non-SSE41 path is best as
29916 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29917 bool UseSSE41 = Subtarget.hasSSE41() &&
29918 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29919
29920 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29921 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29922 // the sign bit.
29923 if (UseSSE41) {
29924 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29925 V0 = DAG.getBitcast(ExtVT, V0);
29926 V1 = DAG.getBitcast(ExtVT, V1);
29927 Sel = DAG.getBitcast(ExtVT, Sel);
29928 return DAG.getBitcast(
29929 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29930 }
29931 // On pre-SSE41 targets we splat the sign bit - a negative value will
29932 // set all bits of the lanes to true and VSELECT uses that in
29933 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29934 SDValue C =
29935 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29936 return DAG.getSelect(dl, VT, C, V0, V1);
29937 };
29938
29939 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29940 if (UseSSE41) {
29941 // On SSE41 targets we need to replicate the shift mask in both
29942 // bytes for PBLENDVB.
29943 Amt = DAG.getNode(
29944 ISD::OR, dl, VT,
29945 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29946 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29947 } else {
29948 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29949 }
29950
29951 // r = VSELECT(r, shift(r, 8), a);
29952 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29953 R = SignBitSelect(Amt, M, R);
29954
29955 // a += a
29956 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29957
29958 // r = VSELECT(r, shift(r, 4), a);
29959 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29960 R = SignBitSelect(Amt, M, R);
29961
29962 // a += a
29963 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29964
29965 // r = VSELECT(r, shift(r, 2), a);
29966 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29967 R = SignBitSelect(Amt, M, R);
29968
29969 // a += a
29970 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29971
29972 // return VSELECT(r, shift(r, 1), a);
29973 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29974 R = SignBitSelect(Amt, M, R);
29975 return R;
29976 }
29977
29978 // Decompose 256-bit shifts into 128-bit shifts.
29979 if (VT.is256BitVector())
29980 return splitVectorIntBinary(Op, DAG);
29981
29982 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29983 return splitVectorIntBinary(Op, DAG);
29984
29985 return SDValue();
29986}
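
One of the folds in LowerShift replaces a constant vXi16 logical shift right with ISD::MULHU: for 0 < Amt < 16, mulhu(x, 1 << (16 - Amt)) equals lshr(x, Amt), and the Amt == 0 lanes are handled by the SETEQ-zero select because 2^16 does not fit in an i16 scale factor. A minimal scalar sketch of that identity (helper names hypothetical):

#include <cassert>
#include <cstdint>

// Hypothetical scalar check of the vXi16 fold above: a constant logical shift
// right becomes a multiply-high by 2^(16 - amt); amt == 0 needs the separate
// select because the scale would be 0 after truncation to i16.
static uint16_t mulhu16(uint16_t a, uint16_t b) {
  return uint16_t((uint32_t(a) * uint32_t(b)) >> 16); // high half of the product
}

static uint16_t srl_via_mulhu(uint16_t x, unsigned amt) {
  if (amt == 0)                                       // the SETEQ-zero select
    return x;
  uint16_t scale = uint16_t(1u << (16 - amt));        // 2^(16 - amt)
  return mulhu16(x, scale);
}

int main() {
  for (unsigned amt = 0; amt < 16; ++amt)
    for (uint32_t x = 0; x < 0x10000; x += 251)
      assert(srl_via_mulhu(uint16_t(x), amt) == uint16_t(x >> amt));
  return 0;
}
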
29987
29988static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29989 SelectionDAG &DAG) {
29990 MVT VT = Op.getSimpleValueType();
29991 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29992 "Unexpected funnel shift opcode!");
29993
29994 SDLoc DL(Op);
29995 SDValue Op0 = Op.getOperand(0);
29996 SDValue Op1 = Op.getOperand(1);
29997 SDValue Amt = Op.getOperand(2);
29998 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29999 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30000
30001 if (VT.isVector()) {
30002 APInt APIntShiftAmt;
30003 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30004
30005 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
30006 if (IsFSHR)
30007 std::swap(Op0, Op1);
30008
30009 if (IsCstSplat) {
30010 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30011 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
30012 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
30013 {Op0, Op1, Imm}, DAG, Subtarget);
30014 }
30015 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
30016 {Op0, Op1, Amt}, DAG, Subtarget);
30017 }
30018 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30019 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
30020 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
30021 "Unexpected funnel shift type!");
30022
30023 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
30024 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
30025 if (IsCstSplat)
30026 return SDValue();
30027
30028 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30029 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30030 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30031
30032 // Constant vXi16 funnel shifts can be efficiently handled by default.
30033 if (IsCst && EltSizeInBits == 16)
30034 return SDValue();
30035
30036 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30037 unsigned NumElts = VT.getVectorNumElements();
30038 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30039 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30040
30041 // Split 256-bit integers on XOP/pre-AVX2 targets.
30042 // Split 512-bit integers on non 512-bit BWI targets.
30043 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30044 !Subtarget.hasAVX2())) ||
30045 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30046 EltSizeInBits < 32)) {
30047 // Pre-mask the amount modulo using the wider vector.
30048 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30049 return splitVectorOp(Op, DAG);
30050 }
30051
30052 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30053 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30054 int ScalarAmtIdx = -1;
30055 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30056 // Uniform vXi16 funnel shifts can be efficiently handled by default.
30057 if (EltSizeInBits == 16)
30058 return SDValue();
30059
30060 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30061 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30062 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30063 ScalarAmtIdx, Subtarget, DAG);
30064 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30065 ScalarAmtIdx, Subtarget, DAG);
30066 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30067 }
30068 }
30069
30070 MVT WideSVT = MVT::getIntegerVT(
30071 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30072 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30073
30074 // If per-element shifts are legal, fallback to generic expansion.
30075 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30076 return SDValue();
30077
30078 // Attempt to fold as:
30079 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30080 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30081 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30082 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30083 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30084 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30085 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30086 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30087 EltSizeInBits, DAG);
30088 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30089 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30090 if (!IsFSHR)
30091 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30092 EltSizeInBits, DAG);
30093 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30094 }
30095
30096 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30097 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30098 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30099 SDValue Z = DAG.getConstant(0, DL, VT);
30100 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30101 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30102 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30103 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30104 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30105 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30106 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30107 }
30108
30109 // Fallback to generic expansion.
30110 return SDValue();
30111 }
30112 assert(
30113 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30114 "Unexpected funnel shift type!");
30115
30116 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30117 bool OptForSize = DAG.shouldOptForSize();
30118 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30119
30120 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30121 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30122 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30123 !isa<ConstantSDNode>(Amt)) {
30124 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30125 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30126 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30127 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30128 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30129 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30130 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30131 if (IsFSHR) {
30132 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30133 } else {
30134 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30135 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30136 }
30137 return DAG.getZExtOrTrunc(Res, DL, VT);
30138 }
30139
30140 if (VT == MVT::i8 || ExpandFunnel)
30141 return SDValue();
30142
30143 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30144 if (VT == MVT::i16) {
30145 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30146 DAG.getConstant(15, DL, Amt.getValueType()));
30147 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30148 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30149 }
30150
30151 return Op;
30152}
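
Several branches of LowerFunnelShift use the widening identities noted in the comments above: concatenate x above y in a double-width value, shift by the masked amount, and take the appropriate half. A minimal scalar sketch for 16-bit lanes with a 32-bit intermediate (helper names hypothetical):

#include <cassert>
#include <cstdint>

// Hypothetical scalar check of the widening identities noted above:
//   fshl(x,y,z) -> (((x << bw) | y) << (z & (bw-1))) >> bw
//   fshr(x,y,z) ->  ((x << bw) | y) >> (z & (bw-1))
static uint16_t fshl16_wide(uint16_t x, uint16_t y, unsigned z) {
  uint32_t wide = (uint32_t(x) << 16) | y;    // concatenate x:y
  return uint16_t((wide << (z & 15u)) >> 16); // high half after the left shift
}

static uint16_t fshr16_wide(uint16_t x, uint16_t y, unsigned z) {
  uint32_t wide = (uint32_t(x) << 16) | y;
  return uint16_t(wide >> (z & 15u));         // low half after the right shift
}

int main() {
  const uint16_t x = 0xBEEF, y = 0x1234;
  for (unsigned z = 0; z < 32; ++z) {
    unsigned s = z & 15u;
    uint16_t fshl = s ? uint16_t((x << s) | (y >> (16 - s))) : x; // reference
    uint16_t fshr = s ? uint16_t((y >> s) | (x << (16 - s))) : y; // reference
    assert(fshl16_wide(x, y, z) == fshl);
    assert(fshr16_wide(x, y, z) == fshr);
  }
  return 0;
}
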
30153
30154static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30155 SelectionDAG &DAG) {
30156 MVT VT = Op.getSimpleValueType();
30157 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30158
30159 SDLoc DL(Op);
30160 SDValue R = Op.getOperand(0);
30161 SDValue Amt = Op.getOperand(1);
30162 unsigned Opcode = Op.getOpcode();
30163 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30164 int NumElts = VT.getVectorNumElements();
30165 bool IsROTL = Opcode == ISD::ROTL;
30166
30167 // Check for constant splat rotation amount.
30168 APInt CstSplatValue;
30169 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30170
30171 // Check for splat rotate by zero.
30172 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30173 return R;
30174
30175 // AVX512 implicitly uses modulo rotation amounts.
30176 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
30177 // Attempt to rotate by immediate.
30178 if (IsCstSplat) {
30179 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30180 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30181 return DAG.getNode(RotOpc, DL, VT, R,
30182 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30183 }
30184
30185 // Else, fall-back on VPROLV/VPRORV.
30186 return Op;
30187 }
30188
30189 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30190 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30191 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30192 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30193 }
30194
30195 SDValue Z = DAG.getConstant(0, DL, VT);
30196
30197 if (!IsROTL) {
30198 // If the ISD::ROTR amount is constant, we're always better converting to
30199 // ISD::ROTL.
30200 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30201 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30202
30203 // XOP targets always prefer ISD::ROTL.
30204 if (Subtarget.hasXOP())
30205 return DAG.getNode(ISD::ROTL, DL, VT, R,
30206 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30207 }
30208
30209 // Split 256-bit integers on XOP/pre-AVX2 targets.
30210 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30211 return splitVectorIntBinary(Op, DAG);
30212
30213 // XOP has 128-bit vector variable + immediate rotates.
30214 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30215 // XOP implicitly uses modulo rotation amounts.
30216 if (Subtarget.hasXOP()) {
30217 assert(IsROTL && "Only ROTL expected");
30218 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30219
30220 // Attempt to rotate by immediate.
30221 if (IsCstSplat) {
30222 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30223 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30224 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30225 }
30226
30227 // Use general rotate by variable (per-element).
30228 return Op;
30229 }
30230
30231 // Rotate by a uniform constant - expand back to shifts.
30232 if (IsCstSplat)
30233 return SDValue();
30234
30235 // Split 512-bit integers on non 512-bit BWI targets.
30236 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30237 return splitVectorIntBinary(Op, DAG);
30238
30239 assert(
30240     (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30241      ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30242       Subtarget.hasAVX2()) ||
30243      ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30244     "Only vXi32/vXi16/vXi8 vector rotates supported");
30245
30246 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30247 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30248
30249 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30250 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30251
30252 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30253 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30254 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30255 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30256 int BaseRotAmtIdx = -1;
30257 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30258 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30259 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30260 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30261 }
30262 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30263 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30264 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30265 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30266 BaseRotAmtIdx, Subtarget, DAG);
30267 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30268 BaseRotAmtIdx, Subtarget, DAG);
30269 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30270 }
30271 }
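
A minimal scalar sketch of the unpack(x,x) rotate fold above, assuming an 8-bit lane (helper names illustrative): duplicating the lane into a double-width value turns the rotate into a single shift of that wider value.

#include <cstdint>

static uint8_t rotl8_via_widen(uint8_t X, unsigned Amt) {
  Amt &= 7;                                  // y & (bw - 1)
  uint16_t Pair = (uint16_t(X) << 8) | X;    // "unpack(x,x)" as one 16-bit lane
  return uint8_t((Pair << Amt) >> 8);        // keep the high half
}

static uint8_t rotr8_via_widen(uint8_t X, unsigned Amt) {
  Amt &= 7;
  uint16_t Pair = (uint16_t(X) << 8) | X;
  return uint8_t(Pair >> Amt);               // keep the low half
}
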
30272
30273 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30274 // the amount bit.
30275 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30276 if (EltSizeInBits == 8) {
30277 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30278 MVT WideVT =
30279 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30280 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30281
30282 // Attempt to fold as:
30283 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30284 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30285 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30286 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30287 // If we're rotating by constant, just use default promotion.
30288 if (IsConstAmt)
30289 return SDValue();
30290 // See if we can perform this by widening to vXi16 or vXi32.
30291 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30292 R = DAG.getNode(
30293 ISD::OR, DL, WideVT, R,
30294 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30295 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30296 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30297 if (IsROTL)
30298 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30299 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30300 }
30301
30302 // Attempt to fold as unpack(x,x) << zext(y):
30303 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30304 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30305 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30306 // See if we can perform this by unpacking to lo/hi vXi16.
30307 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30308 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30309 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30310 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30311 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30312 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30313 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30314 }
30315 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
30316
30317 // We don't need ModuloAmt here as we just peek at individual bits.
30318 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30319 if (Subtarget.hasSSE41()) {
30320 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30321 // on the sign bit.
30322 V0 = DAG.getBitcast(VT, V0);
30323 V1 = DAG.getBitcast(VT, V1);
30324 Sel = DAG.getBitcast(VT, Sel);
30325 return DAG.getBitcast(SelVT,
30326 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30327 }
30328 // On pre-SSE41 targets we test for the sign bit by comparing to
30329 // zero - a negative value will set all bits of the lanes to true
30330 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30331 SDValue Z = DAG.getConstant(0, DL, SelVT);
30332 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30333 return DAG.getSelect(DL, SelVT, C, V0, V1);
30334 };
30335
30336 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30337 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30338 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30339 IsROTL = true;
30340 }
30341
30342 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30343 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30344
30345 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30346 // We can safely do this using i16 shifts as we're only interested in
30347 // the 3 lower bits of each byte.
30348 Amt = DAG.getBitcast(ExtVT, Amt);
30349 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30350 Amt = DAG.getBitcast(VT, Amt);
30351
30352 // r = VSELECT(r, rot(r, 4), a);
30353 SDValue M;
30354 M = DAG.getNode(
30355 ISD::OR, DL, VT,
30356 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30357 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30358 R = SignBitSelect(VT, Amt, M, R);
30359
30360 // a += a
30361 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30362
30363 // r = VSELECT(r, rot(r, 2), a);
30364 M = DAG.getNode(
30365 ISD::OR, DL, VT,
30366 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30367 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30368 R = SignBitSelect(VT, Amt, M, R);
30369
30370 // a += a
30371 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30372
30373 // return VSELECT(r, rot(r, 1), a);
30374 M = DAG.getNode(
30375 ISD::OR, DL, VT,
30376 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30377 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30378 return SignBitSelect(VT, Amt, M, R);
30379 }
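
A scalar sketch of the rot4/rot2/rot1 staging above (helper name illustrative): each bit of the amount selects whether to apply a fixed rotate, which is what the sign-bit-driven selects do per byte in the vector code.

#include <cstdint>

static uint8_t rotl8_staged(uint8_t R, uint8_t Amt) {
  if (Amt & 4) R = uint8_t((R << 4) | (R >> 4));   // rot4 stage
  if (Amt & 2) R = uint8_t((R << 2) | (R >> 6));   // rot2 stage
  if (Amt & 1) R = uint8_t((R << 1) | (R >> 7));   // rot1 stage
  return R;
}
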
30380
30381 bool IsSplatAmt = DAG.isSplatValue(Amt);
30382 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30383 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30384 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30385
30386 // Fallback for splats + all supported variable shifts.
30387 // Fallback for non-constant AVX2 vXi16 as well.
30388 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30389 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30390 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30391 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30392 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30393 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30394 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30395 }
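
A scalar sketch of this two-shift fallback, assuming a 16-bit lane (helper name illustrative). The vector form relies on x86 shifts yielding zero when the count equals the element width; the wider uint32_t below models that for the Amt == 0 case.

#include <cstdint>

static uint16_t rotl16_two_shifts(uint16_t R, unsigned Amt) {
  Amt &= 15;                                 // ISD::ROT* uses modulo amounts
  uint32_t SHL = uint32_t(R) << Amt;         // x << y
  uint32_t SRL = uint32_t(R) >> (16 - Amt);  // x >> (bw - y); harmless when Amt == 0
  return uint16_t(SHL | SRL);
}
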
30396
30397 // Everything below assumes ISD::ROTL.
30398 if (!IsROTL) {
30399 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30400 IsROTL = true;
30401 }
30402
30403 // ISD::ROT* uses modulo rotate amounts.
30404 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30405
30406 assert(IsROTL && "Only ROTL supported");
30407
30408 // As with shifts, attempt to convert the rotation amount to a multiplication
30409 // factor, fallback to general expansion.
30410 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30411 if (!Scale)
30412 return SDValue();
30413
30414 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30415 if (EltSizeInBits == 16) {
30416 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30417 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30418 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30419 }
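
A scalar sketch of the multiply-based rotate used for 16-bit lanes above (helper name illustrative): multiplying by 2^Amt leaves the shifted-out bits in the high half of the 32-bit product, so OR-ing the two halves yields the rotate.

#include <cstdint>

static uint16_t rotl16_via_mul(uint16_t R, unsigned Amt) {
  Amt &= 15;
  uint32_t Product = uint32_t(R) * (uint32_t(1) << Amt);
  uint16_t Lo = uint16_t(Product);           // ISD::MUL   (low 16 bits)
  uint16_t Hi = uint16_t(Product >> 16);     // ISD::MULHU (high 16 bits)
  return Lo | Hi;
}
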
30420
30421 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30422 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30423 // that can then be OR'd with the lower 32-bits.
30424 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30425 static const int OddMask[] = {1, -1, 3, -1};
30426 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30427 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30428
30429 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30430 DAG.getBitcast(MVT::v2i64, R),
30431 DAG.getBitcast(MVT::v2i64, Scale));
30432 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30433 DAG.getBitcast(MVT::v2i64, R13),
30434 DAG.getBitcast(MVT::v2i64, Scale13));
30435 Res02 = DAG.getBitcast(VT, Res02);
30436 Res13 = DAG.getBitcast(VT, Res13);
30437
30438 return DAG.getNode(ISD::OR, DL, VT,
30439 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30440 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30441}
30442
30443/// Returns true if the operand type is exactly twice the native width, and
30444/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30445/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30446/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30447bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30448 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30449
30450 if (OpWidth == 64)
30451 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30452 if (OpWidth == 128)
30453 return Subtarget.canUseCMPXCHG16B();
30454
30455 return false;
30456}
30457
30458TargetLoweringBase::AtomicExpansionKind
30459X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30460 Type *MemType = SI->getValueOperand()->getType();
30461
30462 bool NoImplicitFloatOps =
30463 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30464 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30465 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30466 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30467 return AtomicExpansionKind::None;
30468
30469 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30470 : AtomicExpansionKind::None;
30471}
30472
30473// Note: this turns large loads into lock cmpxchg8b/16b.
30474// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30475TargetLowering::AtomicExpansionKind
30476X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30477 Type *MemType = LI->getType();
30478
30479 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30480 // can use movq to do the load. If we have X87 we can load into an 80-bit
30481 // X87 register and store it to a stack temporary.
30482 bool NoImplicitFloatOps =
30483 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30484 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30485 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30486 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30487 return AtomicExpansionKind::None;
30488
30489 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30490 : AtomicExpansionKind::None;
30491}
30492
30493TargetLowering::AtomicExpansionKind
30494X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30495 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30496 // prefix to a normal instruction for these operations.
30497 if (AI->use_empty())
30498 return AtomicExpansionKind::None;
30499
30500 // If the atomicrmw's result is used by a single bit AND, we may use
30501 // bts/btr/btc instruction for these operations.
30502 auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
30503 Instruction *I = AI->user_back();
30504 if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
30505 AI->getParent() != I->getParent())
30506 return AtomicExpansionKind::CmpXChg;
30507 // The following instruction must be an AND with a single bit.
30508 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
30509 unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
30510 if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
30511 return AtomicExpansionKind::CmpXChg;
30512
30513 if (AI->getOperation() == AtomicRMWInst::And)
30514 return ~C1->getValue() == C2->getValue()
30515 ? AtomicExpansionKind::BitTestIntrinsic
30516 : AtomicExpansionKind::CmpXChg;
30517
30518 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30519 : AtomicExpansionKind::CmpXChg;
30520}
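
A source-level illustration of the pattern this hook looks for (hypothetical helper, shown only as a sketch): an atomicrmw whose sole use is an AND against the same single constant bit. Whether it actually lowers to BTS/BTR/BTC depends on the checks above (constant operands, matching bits, width wider than 8).

#include <atomic>
#include <cstdint>

static bool test_and_set_bit3(std::atomic<uint32_t> &Flags) {
  constexpr uint32_t Mask = 1u << 3;                 // single power-of-two bit
  return (Flags.fetch_or(Mask) & Mask) != 0;         // candidate for the bts intrinsic
}
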
30521
30522void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30523 IRBuilder<> Builder(AI);
30524 Intrinsic::ID IID = Intrinsic::not_intrinsic;
30525 switch (AI->getOperation()) {
30526 default:
30527 llvm_unreachable("Unknown atomic operation");
30528 case AtomicRMWInst::Or:
30529 IID = Intrinsic::x86_atomic_bts;
30530 break;
30531 case AtomicRMWInst::Xor:
30532 IID = Intrinsic::x86_atomic_btc;
30533 break;
30534 case AtomicRMWInst::And:
30535 IID = Intrinsic::x86_atomic_btr;
30536 break;
30537 }
30538 Instruction *I = AI->user_back();
30539 LLVMContext &Ctx = AI->getContext();
30540 unsigned Imm =
30541 countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
30542 Function *BitTest =
30543 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30544 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30545 Type::getInt8PtrTy(Ctx));
30546 Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30547 I->replaceAllUsesWith(Result);
30548 I->eraseFromParent();
30549 AI->eraseFromParent();
30550}
30551
30552TargetLowering::AtomicExpansionKind
30553X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30554 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30555 Type *MemType = AI->getType();
30556
30557 // If the operand is too big, we must see if cmpxchg8/16b is available
30558 // and default to library calls otherwise.
30559 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30560 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30561 : AtomicExpansionKind::None;
30562 }
30563
30564 AtomicRMWInst::BinOp Op = AI->getOperation();
30565 switch (Op) {
30566 default:
30567 llvm_unreachable("Unknown atomic operation");
30568 case AtomicRMWInst::Xchg:
30569 case AtomicRMWInst::Add:
30570 case AtomicRMWInst::Sub:
30571 // It's better to use xadd, xsub or xchg for these in all cases.
30572 return AtomicExpansionKind::None;
30573 case AtomicRMWInst::Or:
30574 case AtomicRMWInst::And:
30575 case AtomicRMWInst::Xor:
30576 return shouldExpandLogicAtomicRMWInIR(AI);
30577 case AtomicRMWInst::Nand:
30578 case AtomicRMWInst::Max:
30579 case AtomicRMWInst::Min:
30580 case AtomicRMWInst::UMax:
30581 case AtomicRMWInst::UMin:
30582 case AtomicRMWInst::FAdd:
30583 case AtomicRMWInst::FSub:
30584 // These always require a non-trivial set of data operations on x86. We must
30585 // use a cmpxchg loop.
30586 return AtomicExpansionKind::CmpXChg;
30587 }
30588}
30589
30590LoadInst *
30591X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30592 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30593 Type *MemType = AI->getType();
30594 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30595 // there is no benefit in turning such RMWs into loads, and it is actually
30596 // harmful as it introduces an mfence.
30597 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30598 return nullptr;
30599
30600 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30601 // lowering available in lowerAtomicArith.
30602 // TODO: push more cases through this path.
30603 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30604 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30605 AI->use_empty())
30606 return nullptr;
30607
30608 IRBuilder<> Builder(AI);
30609 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30610 auto SSID = AI->getSyncScopeID();
30611 // We must restrict the ordering to avoid generating loads with Release or
30612 // ReleaseAcquire orderings.
30613 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30614
30615 // Before the load we need a fence. Here is an example lifted from
30616 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30617 // is required:
30618 // Thread 0:
30619 // x.store(1, relaxed);
30620 // r1 = y.fetch_add(0, release);
30621 // Thread 1:
30622 // y.fetch_add(42, acquire);
30623 // r2 = x.load(relaxed);
30624 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30625 // lowered to just a load without a fence. A mfence flushes the store buffer,
30626 // making the optimization clearly correct.
30627 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
30628 // otherwise, we might be able to be more aggressive on relaxed idempotent
30629 // rmw. In practice, they do not look useful, so we don't try to be
30630 // especially clever.
30631 if (SSID == SyncScope::SingleThread)
30632 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
30633 // the IR level, so we must wrap it in an intrinsic.
30634 return nullptr;
30635
30636 if (!Subtarget.hasMFence())
30637 // FIXME: it might make sense to use a locked operation here but on a
30638 // different cache-line to prevent cache-line bouncing. In practice it
30639 // is probably a small win, and x86 processors without mfence are rare
30640 // enough that we do not bother.
30641 return nullptr;
30642
30643 Function *MFence =
30644 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30645 Builder.CreateCall(MFence, {});
30646
30647 // Finally we can emit the atomic load.
30648 LoadInst *Loaded = Builder.CreateAlignedLoad(
30649 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30650 Loaded->setAtomic(Order, SSID);
30651 AI->replaceAllUsesWith(Loaded);
30652 AI->eraseFromParent();
30653 return Loaded;
30654}
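
A rough source-level sketch of what this rewrite amounts to (an approximation, not the exact IR transformation): an idempotent RMW such as fetch_add(0) is served by a fence followed by an atomic load with a suitably restricted ordering.

#include <atomic>
#include <cstdint>

static uint32_t read_like_idempotent_rmw(std::atomic<uint32_t> &X) {
  // Roughly what "X.fetch_add(0, std::memory_order_seq_cst)" becomes on x86
  // targets with MFENCE available:
  std::atomic_thread_fence(std::memory_order_seq_cst);  // mfence
  return X.load(std::memory_order_seq_cst);             // ordering restricted per above
}
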
30655
30656bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
30657 if (!SI.isUnordered())
30658 return false;
30659 return ExperimentalUnorderedISEL;
30660}
30661bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
30662 if (!LI.isUnordered())
30663 return false;
30664 return ExperimentalUnorderedISEL;
30665}
30666
30667
30668/// Emit a locked operation on a stack location which does not change any
30669/// memory location, but does involve a lock prefix. Location is chosen to be
30670/// a) very likely accessed only by a single thread to minimize cache traffic,
30671/// and b) definitely dereferenceable. Returns the new Chain result.
30672static SDValue emitLockedStackOp(SelectionDAG &DAG,
30673 const X86Subtarget &Subtarget, SDValue Chain,
30674 const SDLoc &DL) {
30675 // Implementation notes:
30676 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30677 // operations issued by the current processor. As such, the location
30678 // referenced is not relevant for the ordering properties of the instruction.
30679 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30680 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30681 // 2) Using an immediate operand appears to be the best encoding choice
30682 // here since it doesn't require an extra register.
30683 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30684 // is small enough it might just be measurement noise.)
30685 // 4) When choosing offsets, there are several contributing factors:
30686 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30687 // line aligned stack object to improve this case.)
30688 // b) To minimize our chances of introducing a false dependence, we prefer
30689 // to offset the stack usage from TOS slightly.
30690 // c) To minimize concerns about cross thread stack usage - in particular,
30691 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30692 // captures state in the TOS frame and accesses it from many threads -
30693 // we want to use an offset such that the offset is in a distinct cache
30694 // line from the TOS frame.
30695 //
30696 // For a general discussion of the tradeoffs and benchmark results, see:
30697 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
30698
30699 auto &MF = DAG.getMachineFunction();
30700 auto &TFL = *Subtarget.getFrameLowering();
30701 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30702
30703 if (Subtarget.is64Bit()) {
30704 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30705 SDValue Ops[] = {
30706 DAG.getRegister(X86::RSP, MVT::i64), // Base
30707 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30708 DAG.getRegister(0, MVT::i64), // Index
30709 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30710 DAG.getRegister(0, MVT::i16), // Segment.
30711 Zero,
30712 Chain};
30713 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30714 MVT::Other, Ops);
30715 return SDValue(Res, 1);
30716 }
30717
30718 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30719 SDValue Ops[] = {
30720 DAG.getRegister(X86::ESP, MVT::i32), // Base
30721 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30722 DAG.getRegister(0, MVT::i32), // Index
30723 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30724 DAG.getRegister(0, MVT::i16), // Segment.
30725 Zero,
30726 Chain
30727 };
30728 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30729 MVT::Other, Ops);
30730 return SDValue(Res, 1);
30731}
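
For reference, the idiom this helper emits is a LOCK-prefixed OR of an immediate zero against a slot near the stack pointer, used purely as a fence. A sketch using GCC-style inline asm, assuming a 64-bit target with a 128-byte red zone (so the -64 displacement applies):

static inline void locked_stack_fence() {
  __asm__ __volatile__("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
}
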
30732
30733static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30734 SelectionDAG &DAG) {
30735 SDLoc dl(Op);
30736 AtomicOrdering FenceOrdering =
30737 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30738 SyncScope::ID FenceSSID =
30739 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30740
30741 // The only fence that needs an instruction is a sequentially-consistent
30742 // cross-thread fence.
30743 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30744 FenceSSID == SyncScope::System) {
30745 if (Subtarget.hasMFence())
30746 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30747
30748 SDValue Chain = Op.getOperand(0);
30749 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30750 }
30751
30752 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30753 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30754}
30755
30756static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30757 SelectionDAG &DAG) {
30758 MVT T = Op.getSimpleValueType();
30759 SDLoc DL(Op);
30760 unsigned Reg = 0;
30761 unsigned size = 0;
30762 switch(T.SimpleTy) {
30763 default: llvm_unreachable("Invalid value type!");
30764 case MVT::i8: Reg = X86::AL; size = 1; break;
30765 case MVT::i16: Reg = X86::AX; size = 2; break;
30766 case MVT::i32: Reg = X86::EAX; size = 4; break;
30767 case MVT::i64:
30768 assert(Subtarget.is64Bit() && "Node not type legal!");
30769 Reg = X86::RAX; size = 8;
30770 break;
30771 }
30772 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30773 Op.getOperand(2), SDValue());
30774 SDValue Ops[] = { cpIn.getValue(0),
30775 Op.getOperand(1),
30776 Op.getOperand(3),
30777 DAG.getTargetConstant(size, DL, MVT::i8),
30778 cpIn.getValue(1) };
30779 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30780 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30781 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30782 Ops, T, MMO);
30783
30784 SDValue cpOut =
30785 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30786 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30787 MVT::i32, cpOut.getValue(2));
30788 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30789
30790 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30791 cpOut, Success, EFLAGS.getValue(1));
30792}
30793
30794// Create MOVMSKB, taking into account whether we need to split for AVX1.
30795static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30796 const X86Subtarget &Subtarget) {
30797 MVT InVT = V.getSimpleValueType();
30798
30799 if (InVT == MVT::v64i8) {
30800 SDValue Lo, Hi;
30801 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30802 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30803 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30804 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30805 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30806 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30807 DAG.getConstant(32, DL, MVT::i8));
30808 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30809 }
30810 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30811 SDValue Lo, Hi;
30812 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30813 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30814 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30815 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30816 DAG.getConstant(16, DL, MVT::i8));
30817 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30818 }
30819
30820 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30821}
30822
30823static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
30824 SelectionDAG &DAG) {
30825 SDValue Src = Op.getOperand(0);
30826 MVT SrcVT = Src.getSimpleValueType();
30827 MVT DstVT = Op.getSimpleValueType();
30828
30829 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
30830 // half to v32i1 and concatenating the result.
30831 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
30832 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
30833 assert(Subtarget.hasBWI() && "Expected BWI target");
30834 SDLoc dl(Op);
30835 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30836 DAG.getIntPtrConstant(0, dl));
30837 Lo = DAG.getBitcast(MVT::v32i1, Lo);
30838 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30839 DAG.getIntPtrConstant(1, dl));
30840 Hi = DAG.getBitcast(MVT::v32i1, Hi);
30841 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
30842 }
30843
30844 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
30845 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
30846 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
30847 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
30848 SDLoc DL(Op);
30849 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
30850 V = getPMOVMSKB(DL, V, DAG, Subtarget);
30851 return DAG.getZExtOrTrunc(V, DL, DstVT);
30852 }
30853
30854 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
30855         SrcVT == MVT::i64) && "Unexpected VT!");
30856
30857 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30858 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30859 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30860 // This conversion needs to be expanded.
30861 return SDValue();
30862
30863 SDLoc dl(Op);
30864 if (SrcVT.isVector()) {
30865 // Widen the input vector in the case of MVT::v2i32.
30866 // Example: from MVT::v2i32 to MVT::v4i32.
30867 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30868 SrcVT.getVectorNumElements() * 2);
30869 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30870 DAG.getUNDEF(SrcVT));
30871 } else {
30872 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
30873        "Unexpected source type in LowerBITCAST");
30874 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30875 }
30876
30877 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30878 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30879
30880 if (DstVT == MVT::x86mmx)
30881 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30882
30883 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30884 DAG.getIntPtrConstant(0, dl));
30885}
30886
30887/// Compute the horizontal sum of bytes in V for the elements of VT.
30888///
30889/// Requires V to be a byte vector and VT to be an integer vector type with
30890/// wider elements than V's type. The width of the elements of VT determines
30891/// how many bytes of V are summed horizontally to produce each element of the
30892/// result.
30893static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30894 const X86Subtarget &Subtarget,
30895 SelectionDAG &DAG) {
30896 SDLoc DL(V);
30897 MVT ByteVecVT = V.getSimpleValueType();
30898 MVT EltVT = VT.getVectorElementType();
30899 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
30900        "Expected value to have byte element type.");
30901 assert(EltVT != MVT::i8 &&
30902        "Horizontal byte sum only makes sense for wider elements!");
30903 unsigned VecSize = VT.getSizeInBits();
30904 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30905
30906 // The PSADBW instruction horizontally adds all bytes and leaves the result in
30907 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
30908 if (EltVT == MVT::i64) {
30909 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30910 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30911 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30912 return DAG.getBitcast(VT, V);
30913 }
30914
30915 if (EltVT == MVT::i32) {
30916 // We unpack the low half and high half into i32s interleaved with zeros so
30917 // that we can use PSADBW to horizontally sum them. The most useful part of
30918 // this is that it lines up the results of two PSADBW instructions to be
30919 // two v2i64 vectors which concatenated are the 4 population counts. We can
30920 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
30921 SDValue Zeros = DAG.getConstant(0, DL, VT);
30922 SDValue V32 = DAG.getBitcast(VT, V);
30923 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30924 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30925
30926 // Do the horizontal sums into two v2i64s.
30927 Zeros = DAG.getConstant(0, DL, ByteVecVT);
30928 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30929 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30930 DAG.getBitcast(ByteVecVT, Low), Zeros);
30931 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30932 DAG.getBitcast(ByteVecVT, High), Zeros);
30933
30934 // Merge them together.
30935 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30936 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30937 DAG.getBitcast(ShortVecVT, Low),
30938 DAG.getBitcast(ShortVecVT, High));
30939
30940 return DAG.getBitcast(VT, V);
30941 }
30942
30943 // The only element type left is i16.
30944 assert(EltVT == MVT::i16 && "Unknown how to handle type");
30945
30946 // To obtain pop count for each i16 element starting from the pop count for
30947 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
30948 // right by 8. It is important to shift as i16s as i8 vector shift isn't
30949 // directly supported.
30950 SDValue ShifterV = DAG.getConstant(8, DL, VT);
30951 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30952 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30953 DAG.getBitcast(ByteVecVT, V));
30954 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30955}
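
A scalar model of the PSADBW building block used above (helper name illustrative): with a zero second operand, the sum of absolute differences in each 64-bit lane is simply the sum of its eight bytes.

#include <cstdint>

static uint64_t psadbw_lane_vs_zero(uint64_t Lane) {
  uint64_t Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += (Lane >> (8 * I)) & 0xFF;   // |byte - 0| == byte
  return Sum;                          // the result occupies the i64 lane
}
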
30956
30957static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30958 const X86Subtarget &Subtarget,
30959 SelectionDAG &DAG) {
30960 MVT VT = Op.getSimpleValueType();
30961 MVT EltVT = VT.getVectorElementType();
30962 int NumElts = VT.getVectorNumElements();
30963 (void)EltVT;
30964 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30965
30966 // Implement a lookup table in register by using an algorithm based on:
30967 // http://wm.ite.pl/articles/sse-popcount.html
30968 //
30969 // The general idea is that every lower byte nibble in the input vector is an
30970 // index into an in-register pre-computed pop count table. We then split up the
30971 // input vector into two new ones: (1) a vector with only the shifted-right
30972 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
30973 // masked out higher ones) for each byte. PSHUFB is used separately with both
30974 // to index the in-register table. Next, both are added and the result is an
30975 // i8 vector where each element contains the pop count for its input byte.
30976 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30977 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30978 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30979 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30980
30981 SmallVector<SDValue, 64> LUTVec;
30982 for (int i = 0; i < NumElts; ++i)
30983 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30984 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30985 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30986
30987 // High nibbles
30988 SDValue FourV = DAG.getConstant(4, DL, VT);
30989 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30990
30991 // Low nibbles
30992 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30993
30994 // The input vector is used as the shuffle mask that index elements into the
30995 // LUT. After counting low and high nibbles, add the vector to obtain the
30996 // final pop count per i8 element.
30997 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30998 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30999 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31000}
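
The same nibble-LUT idea in scalar form (illustrative only): the vector code uses PSHUFB to perform this 16-entry table lookup for every byte's low and high nibble in parallel.

#include <cstdint>

static uint8_t popcount8_lut(uint8_t X) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return LUT[X & 0x0F] + LUT[X >> 4];  // low-nibble count + high-nibble count
}
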
31001
31002// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31003// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31004static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
31005 SelectionDAG &DAG) {
31006 MVT VT = Op.getSimpleValueType();
31007 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
31008        "Unknown CTPOP type to handle");
31009 SDLoc DL(Op.getNode());
31010 SDValue Op0 = Op.getOperand(0);
31011
31012 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31013 if (Subtarget.hasVPOPCNTDQ()) {
31014 unsigned NumElems = VT.getVectorNumElements();
31015 assert((VT.getVectorElementType() == MVT::i8 ||
31016         VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31017 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31018 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31019 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31020 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31021 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31022 }
31023 }
31024
31025 // Decompose 256-bit ops into smaller 128-bit ops.
31026 if (VT.is256BitVector() && !Subtarget.hasInt256())
31027 return splitVectorIntUnary(Op, DAG);
31028
31029 // Decompose 512-bit ops into smaller 256-bit ops.
31030 if (VT.is512BitVector() && !Subtarget.hasBWI())
31031 return splitVectorIntUnary(Op, DAG);
31032
31033 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31034 if (VT.getScalarType() != MVT::i8) {
31035 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31036 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31037 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31038 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31039 }
31040
31041 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31042 if (!Subtarget.hasSSSE3())
31043 return SDValue();
31044
31045 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31046}
31047
31048static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
31049 SelectionDAG &DAG) {
31050 assert(Op.getSimpleValueType().isVector() &&
31051        "We only do custom lowering for vector population count.");
31052 return LowerVectorCTPOP(Op, Subtarget, DAG);
31053}
31054
31055static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31056 MVT VT = Op.getSimpleValueType();
31057 SDValue In = Op.getOperand(0);
31058 SDLoc DL(Op);
31059
31060 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31061 // perform the BITREVERSE.
31062 if (!VT.isVector()) {
31063 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31064 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31065 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31066 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31067 DAG.getIntPtrConstant(0, DL));
31068 }
31069
31070 int NumElts = VT.getVectorNumElements();
31071 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31072
31073 // Decompose 256-bit ops into smaller 128-bit ops.
31074 if (VT.is256BitVector())
31075 return splitVectorIntUnary(Op, DAG);
31076
31077 assert(VT.is128BitVector() &&
31078        "Only 128-bit vector bitreverse lowering supported.");
31079
31080 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31081 // perform the BSWAP in the shuffle.
31082 // It's best to shuffle using the second operand as this will implicitly allow
31083 // memory folding for multiple vectors.
31084 SmallVector<SDValue, 16> MaskElts;
31085 for (int i = 0; i != NumElts; ++i) {
31086 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31087 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31088 int PermuteByte = SourceByte | (2 << 5);
31089 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31090 }
31091 }
31092
31093 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31094 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31095 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31096 Res, Mask);
31097 return DAG.getBitcast(VT, Res);
31098}
31099
31100static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31101 SelectionDAG &DAG) {
31102 MVT VT = Op.getSimpleValueType();
31103
31104 if (Subtarget.hasXOP() && !VT.is512BitVector())
31105 return LowerBITREVERSE_XOP(Op, DAG);
31106
31107 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31108
31109 SDValue In = Op.getOperand(0);
31110 SDLoc DL(Op);
31111
31112 assert(VT.getScalarType() == MVT::i8 &&
31113        "Only byte vector BITREVERSE supported");
31114
31115 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
31116 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
31117 return splitVectorIntUnary(Op, DAG);
31118
31119 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31120 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
31121 return splitVectorIntUnary(Op, DAG);
31122
31123 unsigned NumElts = VT.getVectorNumElements();
31124
31125 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31126 if (Subtarget.hasGFNI()) {
31127 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31128 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31129 Matrix = DAG.getBitcast(VT, Matrix);
31130 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31131 DAG.getTargetConstant(0, DL, MVT::i8));
31132 }
31133
31134 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31135 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31136 // 0-15 value (moved to the other nibble).
31137 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31138 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31139 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31140
31141 const int LoLUT[16] = {
31142 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31143 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31144 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31145 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31146 const int HiLUT[16] = {
31147 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31148 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31149 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31150 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31151
31152 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31153 for (unsigned i = 0; i < NumElts; ++i) {
31154 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31155 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31156 }
31157
31158 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31159 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31160 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31161 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31162 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31163}
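// A minimal standalone sketch (not from X86ISelLowering.cpp) of the nibble-LUT
// scheme above, applied to a single byte with the same two tables: each PSHUFB
// becomes an array lookup and the final OR merges the two reversed nibbles.
#include <cstdint>
#include <cstdio>

static uint8_t reverse_byte(uint8_t B) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                    0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                    0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[B & 0xF] | HiLUT[B >> 4]; // two "PSHUFB lookups" plus an OR
}

int main() { std::printf("%02x\n", reverse_byte(0x2C)); } // prints 34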
31164
31165static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31166 SelectionDAG &DAG) {
31167 SDLoc DL(Op);
31168 SDValue X = Op.getOperand(0);
31169 MVT VT = Op.getSimpleValueType();
31170
31171 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31172 if (VT == MVT::i8 ||
31173 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31174 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31175 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31176 DAG.getConstant(0, DL, MVT::i8));
31177 // Copy the inverse of the parity flag into a register with setcc.
31178 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31179 // Extend to the original type.
31180 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31181 }
31182
31183 // If we have POPCNT, use the default expansion.
31184 if (Subtarget.hasPOPCNT())
31185 return SDValue();
31186
31187 if (VT == MVT::i64) {
31188 // Xor the high and low 32-bit halves together using a 32-bit operation.
31189 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31190 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31191 DAG.getConstant(32, DL, MVT::i8)));
31192 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31193 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31194 }
31195
31196 if (VT != MVT::i16) {
31197 // Xor the high and low 16-bits together using a 32-bit operation.
31198 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31199 DAG.getConstant(16, DL, MVT::i8));
31200 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31201 } else {
31202 // If the input is 16-bits, we need to extend to use an i32 shift below.
31203 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31204 }
31205
31206 // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
31207 // This should allow an h-reg to be used to save a shift.
31208 SDValue Hi = DAG.getNode(
31209 ISD::TRUNCATE, DL, MVT::i8,
31210 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31211 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31212 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31213 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31214
31215 // Copy the inverse of the parity flag into a register with setcc.
31216 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31217 // Extend to the original type.
31218 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31219}
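// A minimal standalone sketch (not from X86ISelLowering.cpp) of the fold used
// above: XOR the halves down to one byte, whose parity equals the parity of the
// whole value; __builtin_popcount (a GCC/Clang builtin) stands in for the
// SETNP/SETP flag test that the lowering emits.
#include <cstdint>
#include <cstdio>

static unsigned parity64(uint64_t X) {
  uint32_t V = uint32_t(X) ^ uint32_t(X >> 32); // i64 -> i32 fold
  V ^= V >> 16;                                 // i32 -> i16 fold
  uint8_t B = uint8_t(V) ^ uint8_t(V >> 8);     // final 8-bit XOR sets PF on x86
  return __builtin_popcount(B) & 1;             // 1 when the bit count is odd
}

int main() { std::printf("%u\n", parity64(0x8000000000000001ULL)); } // prints 0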
31220
31221static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31222 const X86Subtarget &Subtarget) {
31223 unsigned NewOpc = 0;
31224 switch (N->getOpcode()) {
31225 case ISD::ATOMIC_LOAD_ADD:
31226 NewOpc = X86ISD::LADD;
31227 break;
31228 case ISD::ATOMIC_LOAD_SUB:
31229 NewOpc = X86ISD::LSUB;
31230 break;
31231 case ISD::ATOMIC_LOAD_OR:
31232 NewOpc = X86ISD::LOR;
31233 break;
31234 case ISD::ATOMIC_LOAD_XOR:
31235 NewOpc = X86ISD::LXOR;
31236 break;
31237 case ISD::ATOMIC_LOAD_AND:
31238 NewOpc = X86ISD::LAND;
31239 break;
31240 default:
31241 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31242 }
31243
31244 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31245
31246 return DAG.getMemIntrinsicNode(
31247 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31248 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31249 /*MemVT=*/N->getSimpleValueType(0), MMO);
31250}
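// A minimal standalone sketch (not from X86ISelLowering.cpp) of the source
// pattern this helper targets: atomicrmw operations whose result is ignored,
// which the backend turns into plain LOCK-prefixed memory instructions.
#include <atomic>

void rmw_result_unused(std::atomic<int> &A) {
  A.fetch_add(1);   // LADD -> lock add
  A.fetch_sub(1);   // LSUB -> lock sub
  A.fetch_or(2);    // LOR  -> lock or
  A.fetch_xor(4);   // LXOR -> lock xor
  A.fetch_and(~8);  // LAND -> lock and
}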
31251
31252/// Lower atomic_load_ops into LOCK-prefixed operations.
31253static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31254 const X86Subtarget &Subtarget) {
31255 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31256 SDValue Chain = N->getOperand(0);
31257 SDValue LHS = N->getOperand(1);
31258 SDValue RHS = N->getOperand(2);
31259 unsigned Opc = N->getOpcode();
31260 MVT VT = N->getSimpleValueType(0);
31261 SDLoc DL(N);
31262
31263 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31264 // can only be lowered when the result is unused. They should have already
31265 // been transformed into a cmpxchg loop in AtomicExpand.
31266 if (N->hasAnyUseOfValue(0)) {
31267 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31268 // select LXADD if LOCK_SUB can't be selected.
31269 if (Opc == ISD::ATOMIC_LOAD_SUB) {
31270 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
31271 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31272 RHS, AN->getMemOperand());
31273 }
31274 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31275 "Used AtomicRMW ops other than Add should have been expanded!");
31276 return N;
31277 }
31278
31279 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31280 // The core idea here is that since the memory location isn't actually
31281 // changing, all we need is a lowering for the *ordering* impacts of the
31282 // atomicrmw. As such, we can choose a different operation and memory
31283 // location to minimize impact on other code.
31284 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
31285 // On X86, the only ordering which actually requires an instruction is
31286 // seq_cst which isn't SingleThread; everything else just needs to be preserved
31287 // during codegen and then dropped. Note that we expect (but don't assume)
31288 // that orderings other than seq_cst and acq_rel have been canonicalized to
31289 // a store or load.
31290 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31291 AN->getSyncScopeID() == SyncScope::System) {
31292 // Prefer a locked operation against a stack location to minimize cache
31293 // traffic. This assumes that stack locations are very likely to be
31294 // accessed only by the owning thread.
31295 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31296 assert(!N->hasAnyUseOfValue(0));
31297 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31298 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31299 DAG.getUNDEF(VT), NewChain);
31300 }
31301 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31302 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
31303 assert(!N->hasAnyUseOfValue(0));
31304 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31305 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31306 DAG.getUNDEF(VT), NewChain);
31307 }
31308
31309 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31310 // RAUW the chain, but don't worry about the result, as it's unused.
31311 assert(!N->hasAnyUseOfValue(0));
31312 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31313 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31314 DAG.getUNDEF(VT), LockOp.getValue(1));
31315}
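// A minimal standalone sketch (not from X86ISelLowering.cpp) of the "idempotent
// atomicrmw" case above: an RMW of 0 that leaves memory unchanged and whose
// result is unused, so only its ordering has to be lowered.
#include <atomic>

void ordering_only(std::atomic<int> &A) {
  // Roughly atomicrmw or %A, 0 seq_cst: lowered above to a locked stack op for
  // seq_cst at system scope, or to a compiler-only MEMBARRIER otherwise.
  (void)A.fetch_or(0, std::memory_order_seq_cst);
}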
31316
31317static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31318 const X86Subtarget &Subtarget) {
31319 auto *Node = cast<AtomicSDNode>(Op.getNode());
31320 SDLoc dl(Node);
31321 EVT VT = Node->getMemoryVT();
31322
31323 bool IsSeqCst =
31324 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31325 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31326
31327 // If this store is not sequentially consistent and the type is legal
31328 // we can just keep it.
31329 if (!IsSeqCst && IsTypeLegal)
31330 return Op;
31331
31332 if (VT == MVT::i64 && !IsTypeLegal) {
31333 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31334 // is enabled.
31335 bool NoImplicitFloatOps =
31336 DAG.getMachineFunction().getFunction().hasFnAttribute(
31337 Attribute::NoImplicitFloat);
31338 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31339 SDValue Chain;
31340 if (Subtarget.hasSSE1()) {
31341 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
31342 Node->getOperand(2));
31343 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31344 SclToVec = DAG.getBitcast(StVT, SclToVec);
31345 SDVTList Tys = DAG.getVTList(MVT::Other);
31346 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31347 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31348 MVT::i64, Node->getMemOperand());
31349 } else if (Subtarget.hasX87()) {
31350 // First load this into an 80-bit X87 register using a stack temporary.
31351 // This will put the whole integer into the significand.
31352 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31353 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31354 MachinePointerInfo MPI =
31355 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31356 Chain =
31357 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
31358 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31359 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31360 SDValue LdOps[] = {Chain, StackPtr};
31361 SDValue Value =
31362 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31363 /*Align*/ None, MachineMemOperand::MOLoad);
31364 Chain = Value.getValue(1);
31365
31366 // Now use an FIST to do the atomic store.
31367 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31368 Chain =
31369 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31370 StoreOps, MVT::i64, Node->getMemOperand());
31371 }
31372
31373 if (Chain) {
31374 // If this is a sequentially consistent store, also emit an appropriate
31375 // barrier.
31376 if (IsSeqCst)
31377 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31378
31379 return Chain;
31380 }
31381 }
31382 }
31383
31384 // Convert seq_cst store -> xchg
31385 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31386 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31387 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
31388 Node->getMemoryVT(),
31389 Node->getOperand(0),
31390 Node->getOperand(1), Node->getOperand(2),
31391 Node->getMemOperand());
31392 return Swap.getValue(1);
31393}
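// A minimal standalone sketch (not from X86ISelLowering.cpp) of the conversion
// above: a sequentially consistent store on x86-64 is typically emitted as an
// XCHG so it cannot be reordered with later loads.
#include <atomic>

void publish(std::atomic<long> &Flag) {
  Flag.store(1, std::memory_order_seq_cst); // seq_cst store -> xchg
}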
31394
31395static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
31396 SDNode *N = Op.getNode();
31397 MVT VT = N->getSimpleValueType(0);
31398 unsigned Opc = Op.getOpcode();
31399
31400 // Let legalize expand this if it isn't a legal type yet.
31401 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31402 return SDValue();
31403
31404 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31405 SDLoc DL(N);
31406
31407 // Set the carry flag.
31408 SDValue Carry = Op.getOperand(2);
31409 EVT CarryVT = Carry.getValueType();
31410 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31411 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31412
31413 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
31414 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31415 Op.getOperand(0), Op.getOperand(1),
31416 Carry.getValue(1));
31417
31418 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31419 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31420 Sum.getValue(1), DL, DAG);
31421 if (N->getValueType(1) == MVT::i1)
31422 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31423
31424 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31425}
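// A minimal standalone sketch (not from X86ISelLowering.cpp) of the semantics
// being lowered above. The lowering re-creates the CPU carry flag from a 0/1
// carry value by computing Carry + (-1), which overflows exactly when Carry was
// 1, so ADC/SBB can consume it; the scalar equivalent of addcarry is:
#include <cstdint>
#include <cstdio>

static uint64_t addcarry64(uint64_t A, uint64_t B, unsigned CarryIn,
                           unsigned &CarryOut) {
  uint64_t Sum = A + B + CarryIn;
  CarryOut = (Sum < A) || (Sum == A && CarryIn); // unsigned overflow check
  return Sum;
}

int main() {
  unsigned C;
  uint64_t S = addcarry64(~0ULL, 0, 1, C);
  std::printf("%llu %u\n", (unsigned long long)S, C); // prints 0 1
}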
31426
31427static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31428 SelectionDAG &DAG) {
31429 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31430
31431 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31432 // which returns the values as { float, float } (in XMM0) or
31433 // { double, double } (which is returned in XMM0, XMM1).
31434 SDLoc dl(Op);
31435 SDValue Arg = Op.getOperand(0);
31436 EVT ArgVT = Arg.getValueType();
31437 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31438
31439 TargetLowering::ArgListTy Args;
31440 TargetLowering::ArgListEntry Entry;
31441
31442 Entry.Node = Arg;
31443 Entry.Ty = ArgTy;
31444 Entry.IsSExt = false;
31445 Entry.IsZExt = false;
31446 Args.push_back(Entry);
31447
31448 bool isF64 = ArgVT == MVT::f64;
31449 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31450 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31451 // the results are returned via SRet in memory.
31452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31453 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31454 const char *LibcallName = TLI.getLibcallName(LC);
31455 SDValue Callee =
31456 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31457
31458 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31459 : (Type *)FixedVectorType::get(ArgTy, 4);
31460
31461 TargetLowering::CallLoweringInfo CLI(DAG);
31462 CLI.setDebugLoc(dl)
31463 .setChain(DAG.getEntryNode())
31464 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31465
31466 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31467
31468 if (isF64)
31469 // Returned in xmm0 and xmm1.
31470 return CallResult.first;
31471
31472 // Returned in bits 0:31 and 32:63 of xmm0.
31473 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31474 CallResult.first, DAG.getIntPtrConstant(0, dl));
31475 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31476 CallResult.first, DAG.getIntPtrConstant(1, dl));
31477 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31478 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31479}
31480
31481/// Widen a vector input to a vector of NVT. The
31482/// input vector must have the same element type as NVT.
31483static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31484 bool FillWithZeroes = false) {
31485 // Check if InOp already has the right width.
31486 MVT InVT = InOp.getSimpleValueType();
31487 if (InVT == NVT)
31488 return InOp;
31489
31490 if (InOp.isUndef())
31491 return DAG.getUNDEF(NVT);
31492
31493 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31494 "input and widen element type must match");
31495
31496 unsigned InNumElts = InVT.getVectorNumElements();
31497 unsigned WidenNumElts = NVT.getVectorNumElements();
31498 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31499 "Unexpected request for vector widening");
31500
31501 SDLoc dl(InOp);
31502 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31503 InOp.getNumOperands() == 2) {
31504 SDValue N1 = InOp.getOperand(1);
31505 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31506 N1.isUndef()) {
31507 InOp = InOp.getOperand(0);
31508 InVT = InOp.getSimpleValueType();
31509 InNumElts = InVT.getVectorNumElements();
31510 }
31511 }
31512 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31513 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31514 SmallVector<SDValue, 16> Ops;
31515 for (unsigned i = 0; i < InNumElts; ++i)
31516 Ops.push_back(InOp.getOperand(i));
31517
31518 EVT EltVT = InOp.getOperand(0).getValueType();
31519
31520 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31521 DAG.getUNDEF(EltVT);
31522 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31523 Ops.push_back(FillVal);
31524 return DAG.getBuildVector(NVT, dl, Ops);
31525 }
31526 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31527 DAG.getUNDEF(NVT);
31528 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31529 InOp, DAG.getIntPtrConstant(0, dl));
31530}
31531
31532static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31533 SelectionDAG &DAG) {
31534 assert(Subtarget.hasAVX512() &&
31535 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31536
31537 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31538 SDValue Src = N->getValue();
31539 MVT VT = Src.getSimpleValueType();
31540 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31541 SDLoc dl(Op);
31542
31543 SDValue Scale = N->getScale();
31544 SDValue Index = N->getIndex();
31545 SDValue Mask = N->getMask();
31546 SDValue Chain = N->getChain();
31547 SDValue BasePtr = N->getBasePtr();
31548
31549 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31550 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31551 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31552 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31554 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31555 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31556 SDVTList VTs = DAG.getVTList(MVT::Other);
31557 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31558 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31559 N->getMemoryVT(), N->getMemOperand());
31560 }
31561 return SDValue();
31562 }
31563
31564 MVT IndexVT = Index.getSimpleValueType();
31565
31566 // If the index is v2i32, we're being called by type legalization and we
31567 // should just let the default handling take care of it.
31568 if (IndexVT == MVT::v2i32)
31569 return SDValue();
31570
31571 // If we don't have VLX and neither the data nor the index is 512 bits, we
31572 // need to widen until one is.
31573 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31574 !Index.getSimpleValueType().is512BitVector()) {
31575 // Determine how much we need to widen by to get a 512-bit type.
31576 unsigned Factor = std::min(512/VT.getSizeInBits(),
31577 512/IndexVT.getSizeInBits());
31578 unsigned NumElts = VT.getVectorNumElements() * Factor;
31579
31580 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31581 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31582 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31583
31584 Src = ExtendToType(Src, VT, DAG);
31585 Index = ExtendToType(Index, IndexVT, DAG);
31586 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31587 }
31588
31589 SDVTList VTs = DAG.getVTList(MVT::Other);
31590 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31591 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31592 N->getMemoryVT(), N->getMemOperand());
31593}
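// A minimal standalone sketch (not from X86ISelLowering.cpp) of the widening
// factor above, worked through for a hypothetical v4i32 scatter with a v4i64
// index on AVX-512 without VLX.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned DataBits = 128, IndexBits = 256, NumElts = 4; // v4i32 data, v4i64 index
  unsigned Factor = std::min(512 / DataBits, 512 / IndexBits); // = min(4, 2) = 2
  std::printf("widen %u -> %u elements\n", NumElts, NumElts * Factor); // 4 -> 8
  // v4i32/v4i64 become v8i32/v8i64, so the index is now a full 512-bit type.
}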
31594
31595static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31596 SelectionDAG &DAG) {
31597
31598 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31599 MVT VT = Op.getSimpleValueType();
31600 MVT ScalarVT = VT.getScalarType();
31601 SDValue Mask = N->getMask();
31602 MVT MaskVT = Mask.getSimpleValueType();
31603 SDValue PassThru = N->getPassThru();
31604 SDLoc dl(Op);
31605
31606 // Handle AVX masked loads which don't support passthru other than 0.
31607 if (MaskVT.getVectorElementType() != MVT::i1) {
31608 // We also allow undef in the isel pattern.
31609 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31610 return Op;
31611
31612 SDValue NewLoad = DAG.getMaskedLoad(
31613 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31614 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31615 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31616 N->isExpandingLoad());
31617 // Emit a blend.
31618 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31619 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31620 }
31621
31622 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31623 "Expanding masked load is supported on AVX-512 target only!");
31624
31625 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31626 "Expanding masked load is supported for 32 and 64-bit types only!");
31627
31628 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31629 "Cannot lower masked load op.");
31630
31631 assert((ScalarVT.getSizeInBits() >= 32 ||
31632 (Subtarget.hasBWI() &&
31633 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31634 "Unsupported masked load op.");
31635
31636 // This operation is legal for targets with VLX, but without
31637 // VLX the vector should be widened to 512 bits.
31638 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31639 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31640 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31641
31642 // Mask element has to be i1.
31643 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31644 "Unexpected mask type");
31645
31646 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31647
31648 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31649 SDValue NewLoad = DAG.getMaskedLoad(
31650 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31651 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31652 N->getExtensionType(), N->isExpandingLoad());
31653
31654 SDValue Extract =
31655 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31656 DAG.getIntPtrConstant(0, dl));
31657 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31658 return DAG.getMergeValues(RetOps, dl);
31659}
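// A minimal standalone sketch (not from X86ISelLowering.cpp) of the AVX
// (non-k-register) path above: load with a zero passthru, then blend the
// requested passthru back in per lane, mirroring the VSELECT that is emitted.
#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> Mem = {10, 20, 30, 40}, PassThru = {-1, -2, -3, -4};
  std::array<bool, 4> Mask = {true, false, true, false};
  std::array<int, 4> Loaded{}, Result{};
  for (int i = 0; i < 4; ++i) Loaded[i] = Mask[i] ? Mem[i] : 0;                // masked load, passthru 0
  for (int i = 0; i < 4; ++i) Result[i] = Mask[i] ? Loaded[i] : PassThru[i];   // VSELECT blend
  for (int v : Result) std::printf("%d ", v);                                  // prints: 10 -2 30 -4
  std::printf("\n");
}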
31660
31661static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31662 SelectionDAG &DAG) {
31663 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31664 SDValue DataToStore = N->getValue();
31665 MVT VT = DataToStore.getSimpleValueType();
31666 MVT ScalarVT = VT.getScalarType();
31667 SDValue Mask = N->getMask();
31668 SDLoc dl(Op);
31669
31670 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31671 "Expanding masked load is supported on AVX-512 target only!");
31672
31673 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31674 "Expanding masked load is supported for 32 and 64-bit types only!");
31675
31676 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31677 "Cannot lower masked store op.");
31678
31679 assert((ScalarVT.getSizeInBits() >= 32 ||
31680 (Subtarget.hasBWI() &&
31681 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31682 "Unsupported masked store op.");
31683
31684 // This operation is legal for targets with VLX, but without
31685 // VLX the vector should be widened to 512 bits.
31686 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31687 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31688
31689 // Mask element has to be i1.
31690 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31691 "Unexpected mask type");
31692
31693 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31694
31695 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31696 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31697 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31698 N->getOffset(), Mask, N->getMemoryVT(),
31699 N->getMemOperand(), N->getAddressingMode(),
31700 N->isTruncatingStore(), N->isCompressingStore());
31701}
31702
31703static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31704 SelectionDAG &DAG) {
31705 assert(Subtarget.hasAVX2() &&
31706 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31707
31708 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31709 SDLoc dl(Op);
31710 MVT VT = Op.getSimpleValueType();
31711 SDValue Index = N->getIndex();
31712 SDValue Mask = N->getMask();
31713 SDValue PassThru = N->getPassThru();
31714 MVT IndexVT = Index.getSimpleValueType();
31715
31716 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
31717
31718 // If the index is v2i32, we're being called by type legalization.
31719 if (IndexVT == MVT::v2i32)
31720 return SDValue();
31721
31722 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
31723 // need to widen until one is.
31724 MVT OrigVT = VT;
31725 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31726 !IndexVT.is512BitVector()) {
31727 // Determine how much we need to widen by to get a 512-bit type.
31728 unsigned Factor = std::min(512/VT.getSizeInBits(),
31729 512/IndexVT.getSizeInBits());
31730
31731 unsigned NumElts = VT.getVectorNumElements() * Factor;
31732
31733 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31734 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31735 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31736
31737 PassThru = ExtendToType(PassThru, VT, DAG);
31738 Index = ExtendToType(Index, IndexVT, DAG);
31739 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31740 }
31741
31742 // Break dependency on the data register.
31743 if (PassThru.isUndef())
31744 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
31745
31746 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
31747 N->getScale() };
31748 SDValue NewGather = DAG.getMemIntrinsicNode(
31749 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
31750 N->getMemOperand());
31751 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
31752 NewGather, DAG.getIntPtrConstant(0, dl));
31753 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
31754}
31755
31756static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
31757 SDLoc dl(Op);
31758 SDValue Src = Op.getOperand(0);
31759 MVT DstVT = Op.getSimpleValueType();
31760
31761 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
31762 unsigned SrcAS = N->getSrcAddressSpace();
31763
31764 assert(SrcAS != N->getDestAddressSpace() &&
31765 "addrspacecast must be between different address spaces");
31766
31767 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
31768 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
31769 } else if (DstVT == MVT::i64) {
31770 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
31771 } else if (DstVT == MVT::i32) {
31772 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
31773 } else {
31774 report_fatal_error("Bad address space in addrspacecast");
31775 }
31776 return Op;
31777}
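// A minimal standalone sketch (not from X86ISelLowering.cpp) of the scalar
// effect of the branches above, assuming the ptr32 address spaces model
// unsigned and signed 32-bit pointers: widening the unsigned kind zero-extends,
// widening the signed kind sign-extends, and narrowing to 32 bits truncates.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t UPtr32 = 0x80001000u;
  int32_t SPtr32 = (int32_t)0x80001000u;
  std::printf("%016llx\n", (unsigned long long)(uint64_t)UPtr32);          // 0000000080001000
  std::printf("%016llx\n", (unsigned long long)(uint64_t)(int64_t)SPtr32); // ffffffff80001000
}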
31778
31779SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
31780 SelectionDAG &DAG) const {
31781 // TODO: Eventually, the lowering of these nodes should be informed by or
31782 // deferred to the GC strategy for the function in which they appear. For
31783 // now, however, they must be lowered to something. Since they are logically
31784 // no-ops in the case of a null GC strategy (or a GC strategy which does not
31785 // require special handling for these nodes), lower them as literal NOOPs for
31786 // the time being.
31787 SmallVector<SDValue, 2> Ops;
31788
31789 Ops.push_back(Op.getOperand(0));
31790 if (Op->getGluedNode())
31791 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
31792
31793 SDLoc OpDL(Op);
31794 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
31795 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
31796
31797 return NOOP;
31798}
31799
31800// Custom split CVTPS2PH with wide types.
31801static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
31802 SDLoc dl(Op);
31803 EVT VT = Op.getValueType();
31804 SDValue Lo, Hi;
31805 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
31806 EVT LoVT, HiVT;
31807 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31808 SDValue RC = Op.getOperand(1);
31809 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
31810 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
31811 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31812}
31813
31814/// Provide custom lowering hooks for some operations.
31815SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
31816 switch (Op.getOpcode()) {
31817 default: llvm_unreachable("Should not custom lower this!");
31818 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
31819 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
31820 return LowerCMP_SWAP(Op, Subtarget, DAG);
31821 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
31822 case ISD::ATOMIC_LOAD_ADD:
31823 case ISD::ATOMIC_LOAD_SUB:
31824 case ISD::ATOMIC_LOAD_OR:
31825 case ISD::ATOMIC_LOAD_XOR:
31826 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
31827 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
31828 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
31829 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
31830 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
31831 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
31832 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
31833 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
31834 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
31835 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
31836 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
31837 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
31838 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
31839 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
31840 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
31841 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
31842 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
31843 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
31844 case ISD::SHL_PARTS:
31845 case ISD::SRA_PARTS:
31846 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
31847 case ISD::FSHL:
31848 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
31849 case ISD::STRICT_SINT_TO_FP:
31850 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
31851 case ISD::STRICT_UINT_TO_FP:
31852 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
31853 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
31854 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
31855 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
31856 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
31857 case ISD::ZERO_EXTEND_VECTOR_INREG:
31858 case ISD::SIGN_EXTEND_VECTOR_INREG:
31859 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
31860 case ISD::FP_TO_SINT:
31861 case ISD::STRICT_FP_TO_SINT:
31862 case ISD::FP_TO_UINT:
31863 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
31864 case ISD::FP_TO_SINT_SAT:
31865 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
31866 case ISD::FP_EXTEND:
31867 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
31868 case ISD::FP_ROUND:
31869 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
31870 case ISD::FP16_TO_FP:
31871 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
31872 case ISD::FP_TO_FP16:
31873 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
31874 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
31875 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
31876 case ISD::FADD:
31877 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
31878 case ISD::FROUND: return LowerFROUND(Op, DAG);
31879 case ISD::FABS:
31880 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
31881 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
31882 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
31883 case ISD::LRINT:
31884 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
31885 case ISD::SETCC:
31886 case ISD::STRICT_FSETCC:
31887 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
31888 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
31889 case ISD::SELECT: return LowerSELECT(Op, DAG);
31890 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
31891 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
31892 case ISD::VASTART: return LowerVASTART(Op, DAG);
31893 case ISD::VAARG: return LowerVAARG(Op, DAG);
31894 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
31895 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31896 case ISD::INTRINSIC_VOID:
31897 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31898 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
31899 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
31900 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
31901 case ISD::FRAME_TO_ARGS_OFFSET:
31902 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31903 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31904 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
31905 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
31906 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
31907 case ISD::EH_SJLJ_SETUP_DISPATCH:
31908 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31909 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
31910 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
31911 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
31912 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
31913 case ISD::CTLZ:
31914 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
31915 case ISD::CTTZ:
31916 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
31917 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
31918 case ISD::MULHS:
31919 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
31920 case ISD::ROTL:
31921 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
31922 case ISD::SRA:
31923 case ISD::SRL:
31924 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
31925 case ISD::SADDO:
31926 case ISD::UADDO:
31927 case ISD::SSUBO:
31928 case ISD::USUBO: return LowerXALUO(Op, DAG);
31929 case ISD::SMULO:
31930 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
31931 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31932 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
31933 case ISD::SADDO_CARRY:
31934 case ISD::SSUBO_CARRY:
31935 case ISD::ADDCARRY:
31936 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
31937 case ISD::ADD:
31938 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
31939 case ISD::UADDSAT:
31940 case ISD::SADDSAT:
31941 case ISD::USUBSAT:
31942 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31943 case ISD::SMAX:
31944 case ISD::SMIN:
31945 case ISD::UMAX:
31946 case ISD::UMIN: return LowerMINMAX(Op, DAG);
31947 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
31948 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
31949 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
31950 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
31951 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
31952 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
31953 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
31954 case ISD::GC_TRANSITION_START:
31955 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
31956 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
31957 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
31958 }
31959}
31960
31961/// Replace a node with an illegal result type with a new node built out of
31962/// custom code.
31963void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31964 SmallVectorImpl<SDValue>&Results,
31965 SelectionDAG &DAG) const {
31966 SDLoc dl(N);
31967 switch (N->getOpcode()) {
31968 default:
31969#ifndef NDEBUG
31970 dbgs() << "ReplaceNodeResults: ";
31971 N->dump(&DAG);
31972#endif
31973 llvm_unreachable("Do not know how to custom type legalize this operation!");
31974 case X86ISD::CVTPH2PS: {
31975 EVT VT = N->getValueType(0);
31976 SDValue Lo, Hi;
31977 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31978 EVT LoVT, HiVT;
31979 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31980 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31981 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31982 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31983 Results.push_back(Res);
31984 return;
31985 }
31986 case X86ISD::STRICT_CVTPH2PS: {
31987 EVT VT = N->getValueType(0);
31988 SDValue Lo, Hi;
31989 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31990 EVT LoVT, HiVT;
31991 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31992 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31993 {N->getOperand(0), Lo});
31994 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31995 {N->getOperand(0), Hi});
31996 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31997 Lo.getValue(1), Hi.getValue(1));
31998 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31999 Results.push_back(Res);
32000 Results.push_back(Chain);
32001 return;
32002 }
32003 case X86ISD::CVTPS2PH:
32004 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32005 return;
32006 case ISD::CTPOP: {
32007 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32008 // Use a v2i64 if possible.
32009 bool NoImplicitFloatOps =
32010 DAG.getMachineFunction().getFunction().hasFnAttribute(
32011 Attribute::NoImplicitFloat);
32012 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32013 SDValue Wide =
32014 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32015 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32016 // The bit count fits in 32 bits; extract it as i32 and then zero
32017 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32018 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32019 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32020 DAG.getIntPtrConstant(0, dl));
32021 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32022 Results.push_back(Wide);
32023 }
32024 return;
32025 }
32026 case ISD::MUL: {
32027 EVT VT = N->getValueType(0);
32028 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32029 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32030 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32031 // elements are needed.
32032 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32033 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32034 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32035 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32036 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32037 unsigned NumConcats = 16 / VT.getVectorNumElements();
32038 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32039 ConcatOps[0] = Res;
32040 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32041 Results.push_back(Res);
32042 return;
32043 }
32044 case X86ISD::VPMADDWD: {
32045 // Legalize types for X86ISD::VPMADDWD by widening.
32046 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32047
32048 EVT VT = N->getValueType(0);
32049 EVT InVT = N->getOperand(0).getValueType();
32050 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32051 "Expected a VT that divides into 128 bits.");
32052 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32053 "Unexpected type action!");
32054 unsigned NumConcat = 128 / InVT.getSizeInBits();
32055
32056 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32057 InVT.getVectorElementType(),
32058 NumConcat * InVT.getVectorNumElements());
32059 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32060 VT.getVectorElementType(),
32061 NumConcat * VT.getVectorNumElements());
32062
32063 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32064 Ops[0] = N->getOperand(0);
32065 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32066 Ops[0] = N->getOperand(1);
32067 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32068
32069 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32070 Results.push_back(Res);
32071 return;
32072 }
32073 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32074 case X86ISD::FMINC:
32075 case X86ISD::FMIN:
32076 case X86ISD::FMAXC:
32077 case X86ISD::FMAX: {
32078 EVT VT = N->getValueType(0);
32079 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32080 SDValue UNDEF = DAG.getUNDEF(VT);
32081 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32082 N->getOperand(0), UNDEF);
32083 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32084 N->getOperand(1), UNDEF);
32085 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32086 return;
32087 }
32088 case ISD::SDIV:
32089 case ISD::UDIV:
32090 case ISD::SREM:
32091 case ISD::UREM: {
32092 EVT VT = N->getValueType(0);
32093 if (VT.isVector()) {
32094 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32095 "Unexpected type action!");
32096 // If this RHS is a constant splat vector we can widen this and let
32097 // division/remainder by constant optimize it.
32098 // TODO: Can we do something for non-splat?
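// Note that the widened divisor built below is the splat constant in every
// lane, so the undef padding lanes of the numerator never divide by undef,
// and the usual divide-by-constant lowering can handle the whole widened op.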
32099 APInt SplatVal;
32100 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32101 unsigned NumConcats = 128 / VT.getSizeInBits();
32102 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32103 Ops0[0] = N->getOperand(0);
32104 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32105 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32106 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32107 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32108 Results.push_back(Res);
32109 }
32110 return;
32111 }
32112
32113 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32114 Results.push_back(V);
32115 return;
32116 }
32117 case ISD::TRUNCATE: {
32118 MVT VT = N->getSimpleValueType(0);
32119 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32120 return;
32121
32122 // The generic legalizer will try to widen the input type to the same
32123 // number of elements as the widened result type. But this isn't always
32124 // the best thing so do some custom legalization to avoid some cases.
32125 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32126 SDValue In = N->getOperand(0);
32127 EVT InVT = In.getValueType();
32128
32129 unsigned InBits = InVT.getSizeInBits();
32130 if (128 % InBits == 0) {
32131 // 128 bit and smaller inputs should avoid truncate altogether and
32132 // just use a build_vector that will become a shuffle.
32133 // TODO: Widen and use a shuffle directly?
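// For example, a v2i64 -> v2i32 truncate is handled here by extracting both
// i64 elements, truncating each scalar to i32, and building a v4i32 (the
// widened result type) whose upper lanes are undef.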
32134 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
32135 EVT EltVT = VT.getVectorElementType();
32136 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32137 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
32138 // Use the original element count so we don't do more scalar opts than
32139 // necessary.
32140 unsigned MinElts = VT.getVectorNumElements();
32141 for (unsigned i=0; i < MinElts; ++i) {
32142 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
32143 DAG.getIntPtrConstant(i, dl));
32144 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
32145 }
32146 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
32147 return;
32148 }
32149 // With AVX512 there are some cases that can use a target specific
32150 // truncate node to go from 256/512 to less than 128 with zeros in the
32151 // upper elements of the 128 bit result.
32152 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32153 // We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
32154 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32155 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32156 return;
32157 }
32158 // There's one case we can widen to 512 bits and use VTRUNC.
32159 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32160 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32161 DAG.getUNDEF(MVT::v4i64));
32162 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32163 return;
32164 }
32165 }
32166 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32167 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32168 isTypeLegal(MVT::v4i64)) {
32169 // Input needs to be split and output needs to widened. Let's use two
32170 // VTRUNCs, and shuffle their results together into the wider type.
32171 SDValue Lo, Hi;
32172 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32173
32174 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32175 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32176 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32177 { 0, 1, 2, 3, 16, 17, 18, 19,
32178 -1, -1, -1, -1, -1, -1, -1, -1 });
32179 Results.push_back(Res);
32180 return;
32181 }
32182
32183 return;
32184 }
32185 case ISD::ANY_EXTEND:
32186 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32187 // It's intended to custom handle the input type.
32188 assert(N->getValueType(0) == MVT::v8i8 &&
32189 "Do not know how to legalize this Node");
32190 return;
32191 case ISD::SIGN_EXTEND:
32192 case ISD::ZERO_EXTEND: {
32193 EVT VT = N->getValueType(0);
32194 SDValue In = N->getOperand(0);
32195 EVT InVT = In.getValueType();
32196 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32197 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32198 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32199 "Unexpected type action!");
32200 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32201 // Custom split this so we can extend i8/i16->i32 invec. This is better
32202 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32203 // sra, then an extension from i32 to i64 using pcmpgt. By custom splitting
32204 // we allow the sra from the extend to i32 to be shared by the split.
32205 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32206
32207 // Fill a vector with sign bits for each element.
32208 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32209 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
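// (0 > x) per lane is all-ones exactly when x is negative, so each lane of
// SignBits is the 32-bit sign extension of the matching input lane.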
32210
32211 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32212 // to v2i64.
32213 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32214 {0, 4, 1, 5});
32215 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32216 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32217 {2, 6, 3, 7});
32218 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32219
32220 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32221 Results.push_back(Res);
32222 return;
32223 }
32224
32225 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32226 if (!InVT.is128BitVector()) {
32227 // Not a 128 bit vector, but maybe type legalization will promote
32228 // it to 128 bits.
32229 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32230 return;
32231 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32232 if (!InVT.is128BitVector())
32233 return;
32234
32235 // Promote the input to 128 bits. Type legalization will turn this into
32236 // zext_inreg/sext_inreg.
32237 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32238 }
32239
32240 // Perform custom splitting instead of the two stage extend we would get
32241 // by default.
32242 EVT LoVT, HiVT;
32243 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32244 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32245
32246 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32247
32248 // We need to shift the input over by half the number of elements.
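// For a v16i8 input, for example, this builds the mask {8..15, undef x 8},
// moving the upper half of the input into the low elements so the
// EXTEND_VECTOR_INREG for Hi sees the original upper-half elements.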
32249 unsigned NumElts = InVT.getVectorNumElements();
32250 unsigned HalfNumElts = NumElts / 2;
32251 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32252 for (unsigned i = 0; i != HalfNumElts; ++i)
32253 ShufMask[i] = i + HalfNumElts;
32254
32255 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32256 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32257
32258 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32259 Results.push_back(Res);
32260 }
32261 return;
32262 }
32263 case ISD::FP_TO_SINT:
32264 case ISD::STRICT_FP_TO_SINT:
32265 case ISD::FP_TO_UINT:
32266 case ISD::STRICT_FP_TO_UINT: {
32267 bool IsStrict = N->isStrictFPOpcode();
32268 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32269 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32270 EVT VT = N->getValueType(0);
32271 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32272 EVT SrcVT = Src.getValueType();
32273
32274 if (VT.isVector() && Subtarget.hasFP16() &&
32275 SrcVT.getVectorElementType() == MVT::f16) {
32276 EVT EleVT = VT.getVectorElementType();
32277 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32278
32279 if (SrcVT != MVT::v8f16) {
32280 SDValue Tmp =
32281 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32282 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32283 Ops[0] = Src;
32284 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32285 }
32286
32287 SDValue Res, Chain;
32288 if (IsStrict) {
32289 unsigned Opc =
32290 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32291 Res =
32292 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32293 Chain = Res.getValue(1);
32294 } else {
32295 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32296 Res = DAG.getNode(Opc, dl, ResVT, Src);
32297 }
32298
32299 // TODO: Need to add exception check code for strict FP.
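// For an i8 element type the v8i16 result computed above is truncated to
// v8i8 here and then padded with undef below to land in a full 128-bit
// vector.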
32300 if (EleVT.getSizeInBits() < 16) {
32301 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32302 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32303
32304 // Now widen to 128 bits.
32305 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32306 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32307 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32308 ConcatOps[0] = Res;
32309 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32310 }
32311
32312 Results.push_back(Res);
32313 if (IsStrict)
32314 Results.push_back(Chain);
32315
32316 return;
32317 }
32318
32319 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32320 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32321 "Unexpected type action!");
32322
32323 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32324 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32325 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32326 VT.getVectorNumElements());
32327 SDValue Res;
32328 SDValue Chain;
32329 if (IsStrict) {
32330 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32331 {N->getOperand(0), Src});
32332 Chain = Res.getValue(1);
32333 } else
32334 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32335
32336 // Preserve what we know about the size of the original result. If the
32337 // result is v2i32, we have to manually widen the assert.
32338 if (PromoteVT == MVT::v2i32)
32339 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32340 DAG.getUNDEF(MVT::v2i32));
32341
32342 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32343 Res.getValueType(), Res,
32344 DAG.getValueType(VT.getVectorElementType()));
32345
32346 if (PromoteVT == MVT::v2i32)
32347 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32348 DAG.getIntPtrConstant(0, dl));
32349
32350 // Truncate back to the original width.
32351 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32352
32353 // Now widen to 128 bits.
32354 unsigned NumConcats = 128 / VT.getSizeInBits();
32355 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32356 VT.getVectorNumElements() * NumConcats);
32357 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32358 ConcatOps[0] = Res;
32359 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32360 Results.push_back(Res);
32361 if (IsStrict)
32362 Results.push_back(Chain);
32363 return;
32364 }
32365
32366
32367 if (VT == MVT::v2i32) {
32368 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32369 "Strict unsigned conversion requires AVX512");
32370 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32371 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32372 "Unexpected type action!");
32373 if (Src.getValueType() == MVT::v2f64) {
32374 if (!IsSigned && !Subtarget.hasAVX512()) {
32375 SDValue Res =
32376 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32377 Results.push_back(Res);
32378 return;
32379 }
32380
32381 unsigned Opc;
32382 if (IsStrict)
32383 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32384 else
32385 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32386
32387 // If we have VLX we can emit a target specific FP_TO_UINT node.
32388 if (!IsSigned && !Subtarget.hasVLX()) {
32389 // Otherwise we can defer to the generic legalizer which will widen
32390 // the input as well. This will be further widened during op
32391 // legalization to v8i32<-v8f64.
32392 // For strict nodes we'll need to widen ourselves.
32393 // FIXME: Fix the type legalizer to safely widen strict nodes?
32394 if (!IsStrict)
32395 return;
32396 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32397 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32398 Opc = N->getOpcode();
32399 }
32400 SDValue Res;
32401 SDValue Chain;
32402 if (IsStrict) {
32403 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32404 {N->getOperand(0), Src});
32405 Chain = Res.getValue(1);
32406 } else {
32407 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32408 }
32409 Results.push_back(Res);
32410 if (IsStrict)
32411 Results.push_back(Chain);
32412 return;
32413 }
32414
32415 // Custom widen strict v2f32->v2i32 by padding with zeros.
32416 // FIXME: Should generic type legalizer do this?
32417 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32419 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32420 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32421 {N->getOperand(0), Src});
32422 Results.push_back(Res);
32423 Results.push_back(Res.getValue(1));
32424 return;
32425 }
32426
32427 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32428 // so early out here.
32429 return;
32430 }
32431
32432 assert(!VT.isVector() && "Vectors should have been handled above!");
32433
32434 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32435 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32436 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32437 assert(!Subtarget.is64Bit() && "i64 should be legal");
32438 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32439 // If we use a 128-bit result we might need to use a target specific node.
32440 unsigned SrcElts =
32441 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32442 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32443 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32444 unsigned Opc = N->getOpcode();
32445 if (NumElts != SrcElts) {
32446 if (IsStrict)
32447 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32448 else
32449 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32450 }
32451
32452 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32453 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32454 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32455 ZeroIdx);
32456 SDValue Chain;
32457 if (IsStrict) {
32458 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32459 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32460 Chain = Res.getValue(1);
32461 } else
32462 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32463 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32464 Results.push_back(Res);
32465 if (IsStrict)
32466 Results.push_back(Chain);
32467 return;
32468 }
32469
32470 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32471 SDValue Chain;
32472 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32473 Results.push_back(V);
32474 if (IsStrict)
32475 Results.push_back(Chain);
32476 return;
32477 }
32478
32479 SDValue Chain;
32480 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32481 Results.push_back(V);
32482 if (IsStrict)
32483 Results.push_back(Chain);
32484 }
32485 return;
32486 }
32487 case ISD::LRINT:
32488 case ISD::LLRINT: {
32489 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32490 Results.push_back(V);
32491 return;
32492 }
32493
32494 case ISD::SINT_TO_FP:
32495 case ISD::STRICT_SINT_TO_FP:
32496 case ISD::UINT_TO_FP:
32497 case ISD::STRICT_UINT_TO_FP: {
32498 bool IsStrict = N->isStrictFPOpcode();
32499 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32500 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32501 EVT VT = N->getValueType(0);
32502 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32503 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32504 Subtarget.hasVLX()) {
32505 if (Src.getValueType().getVectorElementType() == MVT::i16)
32506 return;
32507
32508 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32509 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32510 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32511 : DAG.getUNDEF(MVT::v2i32));
32512 if (IsStrict) {
32513 unsigned Opc =
32514 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32515 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32516 {N->getOperand(0), Src});
32517 Results.push_back(Res);
32518 Results.push_back(Res.getValue(1));
32519 } else {
32520 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32521 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32522 }
32523 return;
32524 }
32525 if (VT != MVT::v2f32)
32526 return;
32527 EVT SrcVT = Src.getValueType();
32528 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32529 if (IsStrict) {
32530 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32531 : X86ISD::STRICT_CVTUI2P;
32532 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32533 {N->getOperand(0), Src});
32534 Results.push_back(Res);
32535 Results.push_back(Res.getValue(1));
32536 } else {
32537 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32538 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
32539 }
32540 return;
32541 }
32542 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
32543 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
32544 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
32545 SDValue One = DAG.getConstant(1, dl, SrcVT);
32546 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
32547 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
32548 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
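// When the top bit of the input is set (negative as a signed i64), Sign is
// the value halved with the lost low bit ORed back in (round to odd). The
// halved value is converted as signed below and then doubled with an FADD;
// rounding to odd on the halving step keeps the final f32 correctly rounded.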
32549 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
32550 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
32551 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
32552 for (int i = 0; i != 2; ++i) {
32553 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
32554 SignSrc, DAG.getIntPtrConstant(i, dl));
32555 if (IsStrict)
32556 SignCvts[i] =
32557 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
32558 {N->getOperand(0), Elt});
32559 else
32560 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
32561 };
32562 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
32563 SDValue Slow, Chain;
32564 if (IsStrict) {
32565 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32566 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
32567 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
32568 {Chain, SignCvt, SignCvt});
32569 Chain = Slow.getValue(1);
32570 } else {
32571 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
32572 }
32573 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
32574 IsNeg =
32575 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
32576 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
32577 Results.push_back(Cvt);
32578 if (IsStrict)
32579 Results.push_back(Chain);
32580 return;
32581 }
32582
32583 if (SrcVT != MVT::v2i32)
32584 return;
32585
32586 if (IsSigned || Subtarget.hasAVX512()) {
32587 if (!IsStrict)
32588 return;
32589
32590 // Custom widen strict v2i32->v2f32 to avoid scalarization.
32591 // FIXME: Should generic type legalizer do this?
32592 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32593 DAG.getConstant(0, dl, MVT::v2i32));
32594 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
32595 {N->getOperand(0), Src});
32596 Results.push_back(Res);
32597 Results.push_back(Res.getValue(1));
32598 return;
32599 }
32600
32601 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32602 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
32603 SDValue VBias =
32604 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
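// 0x4330000000000000 is the IEEE-754 encoding of 2^52. ORing the
// zero-extended 32-bit value into the low mantissa bits of 2^52 gives the
// double 2^52 + x exactly, so subtracting VBias recovers x as an exact f64,
// which VFPROUND then narrows to f32.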
32605 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
32606 DAG.getBitcast(MVT::v2i64, VBias));
32607 Or = DAG.getBitcast(MVT::v2f64, Or);
32608 if (IsStrict) {
32609 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
32610 {N->getOperand(0), Or, VBias});
32611 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
32612 {MVT::v4f32, MVT::Other},
32613 {Sub.getValue(1), Sub});
32614 Results.push_back(Res);
32615 Results.push_back(Res.getValue(1));
32616 } else {
32617 // TODO: Are there any fast-math-flags to propagate here?
32618 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
32619 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
32620 }
32621 return;
32622 }
32623 case ISD::STRICT_FP_ROUND:
32624 case ISD::FP_ROUND: {
32625 bool IsStrict = N->isStrictFPOpcode();
32626 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32627 EVT VT = N->getValueType(0);
32628 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
32629 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
32630 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
32631 : DAG.getUNDEF(MVT::v2f32);
32632 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
32633 }
32634 if (!isTypeLegal(Src.getValueType()))
32635 return;
32636 SDValue V;
32637 if (IsStrict)
32638 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
32639 {N->getOperand(0), Src});
32640 else
32641 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
32642 Results.push_back(V);
32643 if (IsStrict)
32644 Results.push_back(V.getValue(1));
32645 return;
32646 }
32647 case ISD::FP_EXTEND:
32648 case ISD::STRICT_FP_EXTEND: {
32649 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
32650 // No other ValueType for FP_EXTEND should reach this point.
32651 assert(N->getValueType(0) == MVT::v2f32 &&
32652 "Do not know how to legalize this Node");
32653 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
32654 return;
32655 bool IsStrict = N->isStrictFPOpcode();
32656 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32657 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
32658 : DAG.getUNDEF(MVT::v2f16);
32659 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
32660 if (IsStrict)
32661 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
32662 {N->getOperand(0), V});
32663 else
32664 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
32665 Results.push_back(V);
32666 if (IsStrict)
32667 Results.push_back(V.getValue(1));
32668 return;
32669 }
32670 case ISD::INTRINSIC_W_CHAIN: {
32671 unsigned IntNo = N->getConstantOperandVal(1);
32672 switch (IntNo) {
32673 default : llvm_unreachable("Do not know how to custom type "
32674 "legalize this intrinsic operation!");
32675 case Intrinsic::x86_rdtsc:
32676 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
32677 Results);
32678 case Intrinsic::x86_rdtscp:
32679 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
32680 Results);
32681 case Intrinsic::x86_rdpmc:
32682 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
32683 Results);
32684 return;
32685 case Intrinsic::x86_xgetbv:
32686 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
32687 Results);
32688 return;
32689 }
32690 }
32691 case ISD::READCYCLECOUNTER: {
32692 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
32693 }
32694 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
32695 EVT T = N->getValueType(0);
32696 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
32697 bool Regs64bit = T == MVT::i128;
32698 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
32699 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
32700 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
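// CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and the
// replacement in ECX:EBX (RCX:RBX), return the loaded value in EDX:EAX
// (RDX:RAX), and set ZF on success; the copies below and the COND_E read of
// EFLAGS follow that register contract.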
32701 SDValue cpInL, cpInH;
32702 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32703 DAG.getConstant(0, dl, HalfT));
32704 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32705 DAG.getConstant(1, dl, HalfT));
32706 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
32707 Regs64bit ? X86::RAX : X86::EAX,
32708 cpInL, SDValue());
32709 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
32710 Regs64bit ? X86::RDX : X86::EDX,
32711 cpInH, cpInL.getValue(1));
32712 SDValue swapInL, swapInH;
32713 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32714 DAG.getConstant(0, dl, HalfT));
32715 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32716 DAG.getConstant(1, dl, HalfT));
32717 swapInH =
32718 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
32719 swapInH, cpInH.getValue(1));
32720
32721 // In 64-bit mode we might need the base pointer in RBX, but we can't know
32722 // until later. So we keep the RBX input in a vreg and use a custom
32723 // inserter.
32724 // Since RBX will be a reserved register, the register allocator will not
32725 // make sure its value is properly saved and restored around this
32726 // live-range.
32727 SDValue Result;
32728 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32729 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
32730 if (Regs64bit) {
32731 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
32732 swapInH.getValue(1)};
32733 Result =
32734 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
32735 } else {
32736 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
32737 swapInH.getValue(1));
32738 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
32739 swapInL.getValue(1)};
32740 Result =
32741 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
32742 }
32743
32744 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
32745 Regs64bit ? X86::RAX : X86::EAX,
32746 HalfT, Result.getValue(1));
32747 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
32748 Regs64bit ? X86::RDX : X86::EDX,
32749 HalfT, cpOutL.getValue(2));
32750 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
32751
32752 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
32753 MVT::i32, cpOutH.getValue(2));
32754 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
32755 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
32756
32757 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
32758 Results.push_back(Success);
32759 Results.push_back(EFLAGS.getValue(1));
32760 return;
32761 }
32762 case ISD::ATOMIC_LOAD: {
32763 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32764 bool NoImplicitFloatOps =
32765 DAG.getMachineFunction().getFunction().hasFnAttribute(
32766 Attribute::NoImplicitFloat);
32767 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32768 auto *Node = cast<AtomicSDNode>(N);
32769 if (Subtarget.hasSSE1()) {
32770 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
32771 // Then extract the lower 64-bits.
32772 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32773 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
32774 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32775 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32776 MVT::i64, Node->getMemOperand());
32777 if (Subtarget.hasSSE2()) {
32778 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
32779 DAG.getIntPtrConstant(0, dl));
32780 Results.push_back(Res);
32781 Results.push_back(Ld.getValue(1));
32782 return;
32783 }
32784 // We use an alternative sequence for SSE1 that extracts as v2f32 and
32785 // then casts to i64. This avoids a 128-bit stack temporary being
32786 // created by type legalization if we were to cast v4f32->v2i64.
32787 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
32788 DAG.getIntPtrConstant(0, dl));
32789 Res = DAG.getBitcast(MVT::i64, Res);
32790 Results.push_back(Res);
32791 Results.push_back(Ld.getValue(1));
32792 return;
32793 }
32794 if (Subtarget.hasX87()) {
32795 // First load this into an 80-bit X87 register. This will put the whole
32796 // integer into the significand.
32797 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32798 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32799 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
32800 dl, Tys, Ops, MVT::i64,
32801 Node->getMemOperand());
32802 SDValue Chain = Result.getValue(1);
32803
32804 // Now store the X87 register to a stack temporary and convert to i64.
32805 // This store is not atomic and doesn't need to be.
32806 // FIXME: We don't need a stack temporary if the result of the load
32807 // is already being stored. We could just directly store there.
32808 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32809 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32810 MachinePointerInfo MPI =
32811 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32812 SDValue StoreOps[] = { Chain, Result, StackPtr };
32813 Chain = DAG.getMemIntrinsicNode(
32814 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
32815 MPI, None /*Align*/, MachineMemOperand::MOStore);
32816
32817 // Finally load the value back from the stack temporary and return it.
32818 // This load is not atomic and doesn't need to be.
32819 // This load will be further type legalized.
32820 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
32821 Results.push_back(Result);
32822 Results.push_back(Result.getValue(1));
32823 return;
32824 }
32825 }
32826 // TODO: Use MOVLPS when SSE1 is available?
32827 // Delegate to generic TypeLegalization. Situations we can really handle
32828 // should have already been dealt with by AtomicExpandPass.cpp.
32829 break;
32830 }
32831 case ISD::ATOMIC_SWAP:
32832 case ISD::ATOMIC_LOAD_ADD:
32833 case ISD::ATOMIC_LOAD_SUB:
32834 case ISD::ATOMIC_LOAD_AND:
32835 case ISD::ATOMIC_LOAD_OR:
32836 case ISD::ATOMIC_LOAD_XOR:
32837 case ISD::ATOMIC_LOAD_NAND:
32838 case ISD::ATOMIC_LOAD_MIN:
32839 case ISD::ATOMIC_LOAD_MAX:
32840 case ISD::ATOMIC_LOAD_UMIN:
32841 case ISD::ATOMIC_LOAD_UMAX:
32842 // Delegate to generic TypeLegalization. Situations we can really handle
32843 // should have already been dealt with by AtomicExpandPass.cpp.
32844 break;
32845
32846 case ISD::BITCAST: {
32847 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32848 EVT DstVT = N->getValueType(0);
32849 EVT SrcVT = N->getOperand(0).getValueType();
32850
32851 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
32852 // we can split using the k-register rather than memory.
32853 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
32854 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32855 SDValue Lo, Hi;
32856 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32857 Lo = DAG.getBitcast(MVT::i32, Lo);
32858 Hi = DAG.getBitcast(MVT::i32, Hi);
32859 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
32860 Results.push_back(Res);
32861 return;
32862 }
32863
32864 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
32865 // FIXME: Use v4f32 for SSE1?
32866 assert(Subtarget.hasSSE2() && "Requires SSE2");
32867 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
32868 "Unexpected type action!");
32869 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
32870 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
32871 N->getOperand(0));
32872 Res = DAG.getBitcast(WideVT, Res);
32873 Results.push_back(Res);
32874 return;
32875 }
32876
32877 return;
32878 }
32879 case ISD::MGATHER: {
32880 EVT VT = N->getValueType(0);
32881 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32882 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32883 auto *Gather = cast<MaskedGatherSDNode>(N);
32884 SDValue Index = Gather->getIndex();
32885 if (Index.getValueType() != MVT::v2i64)
32886 return;
32887 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32888 "Unexpected type action!");
32889 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32890 SDValue Mask = Gather->getMask();
32891 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32892 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32893 Gather->getPassThru(),
32894 DAG.getUNDEF(VT));
32895 if (!Subtarget.hasVLX()) {
32896 // We need to widen the mask, but the instruction will only use 2
32897 // of its elements. So we can use undef.
32898 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32899 DAG.getUNDEF(MVT::v2i1));
32900 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32901 }
32902 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32903 Gather->getBasePtr(), Index, Gather->getScale() };
32904 SDValue Res = DAG.getMemIntrinsicNode(
32905 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32906 Gather->getMemoryVT(), Gather->getMemOperand());
32907 Results.push_back(Res);
32908 Results.push_back(Res.getValue(1));
32909 return;
32910 }
32911 return;
32912 }
32913 case ISD::LOAD: {
32914 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
32915 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
32916 // cast since type legalization will try to use an i64 load.
32917 MVT VT = N->getSimpleValueType(0);
32918 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32919 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32920 "Unexpected type action!");
32921 if (!ISD::isNON_EXTLoad(N))
32922 return;
32923 auto *Ld = cast<LoadSDNode>(N);
32924 if (Subtarget.hasSSE2()) {
32925 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32926 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32927 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32928 Ld->getMemOperand()->getFlags());
32929 SDValue Chain = Res.getValue(1);
32930 MVT VecVT = MVT::getVectorVT(LdVT, 2);
32931 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32932 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32933 Res = DAG.getBitcast(WideVT, Res);
32934 Results.push_back(Res);
32935 Results.push_back(Chain);
32936 return;
32937 }
32938 assert(Subtarget.hasSSE1() && "Expected SSE");
32939 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32940 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32941 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32942 MVT::i64, Ld->getMemOperand());
32943 Results.push_back(Res);
32944 Results.push_back(Res.getValue(1));
32945 return;
32946 }
32947 case ISD::ADDRSPACECAST: {
32948 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32949 Results.push_back(V);
32950 return;
32951 }
32952 case ISD::BITREVERSE:
32953 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32954 assert(Subtarget.hasXOP() && "Expected XOP");
32955 // We can use VPPERM by copying to a vector register and back. We'll need
32956 // to move the scalar in two i32 pieces.
32957 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32958 return;
32959 }
32960}
32961
32962const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
32963 switch ((X86ISD::NodeType)Opcode) {
32964 case X86ISD::FIRST_NUMBER: break;
32965#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
32966 NODE_NAME_CASE(BSF)
32967 NODE_NAME_CASE(BSR)
32968 NODE_NAME_CASE(FSHL)
32969 NODE_NAME_CASE(FSHR)
32970 NODE_NAME_CASE(FAND)
32971 NODE_NAME_CASE(FANDN)
32972 NODE_NAME_CASE(FOR)
32973 NODE_NAME_CASE(FXOR)
32974 NODE_NAME_CASE(FILD)
32975 NODE_NAME_CASE(FIST)
32976 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
32977 NODE_NAME_CASE(FLD)
32978 NODE_NAME_CASE(FST)
32979 NODE_NAME_CASE(CALL)
32980 NODE_NAME_CASE(CALL_RVMARKER)
32981 NODE_NAME_CASE(BT)
32982 NODE_NAME_CASE(CMP)
32983 NODE_NAME_CASE(FCMP)
32984 NODE_NAME_CASE(STRICT_FCMP)
32985 NODE_NAME_CASE(STRICT_FCMPS)
32986 NODE_NAME_CASE(COMI)
32987 NODE_NAME_CASE(UCOMI)
32988 NODE_NAME_CASE(CMPM)
32989 NODE_NAME_CASE(CMPMM)
32990 NODE_NAME_CASE(STRICT_CMPM)
32991 NODE_NAME_CASE(CMPMM_SAE)
32992 NODE_NAME_CASE(SETCC)
32993 NODE_NAME_CASE(SETCC_CARRY)
32994 NODE_NAME_CASE(FSETCC)
32995 NODE_NAME_CASE(FSETCCM)
32996 NODE_NAME_CASE(FSETCCM_SAE)
32997 NODE_NAME_CASE(CMOV)
32998 NODE_NAME_CASE(BRCOND)
32999 NODE_NAME_CASE(RET_FLAG)
33000 NODE_NAME_CASE(IRET)
33001 NODE_NAME_CASE(REP_STOS)
33002 NODE_NAME_CASE(REP_MOVS)
33003 NODE_NAME_CASE(GlobalBaseReg)
33004 NODE_NAME_CASE(Wrapper)
33005 NODE_NAME_CASE(WrapperRIP)
33006 NODE_NAME_CASE(MOVQ2DQ)
33007 NODE_NAME_CASE(MOVDQ2Q)
33008 NODE_NAME_CASE(MMX_MOVD2W)
33009 NODE_NAME_CASE(MMX_MOVW2D)
33010 NODE_NAME_CASE(PEXTRB)
33011 NODE_NAME_CASE(PEXTRW)
33012 NODE_NAME_CASE(INSERTPS)
33013 NODE_NAME_CASE(PINSRB)
33014 NODE_NAME_CASE(PINSRW)
33015 NODE_NAME_CASE(PSHUFB)
33016 NODE_NAME_CASE(ANDNP)
33017 NODE_NAME_CASE(BLENDI)
33018 NODE_NAME_CASE(BLENDV)
33019 NODE_NAME_CASE(HADD)
33020 NODE_NAME_CASE(HSUB)
33021 NODE_NAME_CASE(FHADD)
33022 NODE_NAME_CASE(FHSUB)
33023 NODE_NAME_CASE(CONFLICT)
33024 NODE_NAME_CASE(FMAX)
33025 NODE_NAME_CASE(FMAXS)
33026 NODE_NAME_CASE(FMAX_SAE)
33027 NODE_NAME_CASE(FMAXS_SAE)
33028 NODE_NAME_CASE(FMIN)
33029 NODE_NAME_CASE(FMINS)
33030 NODE_NAME_CASE(FMIN_SAE)
33031 NODE_NAME_CASE(FMINS_SAE)
33032 NODE_NAME_CASE(FMAXC)
33033 NODE_NAME_CASE(FMINC)
33034 NODE_NAME_CASE(FRSQRT)
33035 NODE_NAME_CASE(FRCP)
33036 NODE_NAME_CASE(EXTRQI)
33037 NODE_NAME_CASE(INSERTQI)
33038 NODE_NAME_CASE(TLSADDR)
33039 NODE_NAME_CASE(TLSBASEADDR)
33040 NODE_NAME_CASE(TLSCALL)
33041 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33042 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33043 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33044 NODE_NAME_CASE(EH_RETURN)
33045 NODE_NAME_CASE(TC_RETURN)
33046 NODE_NAME_CASE(FNSTCW16m)
33047 NODE_NAME_CASE(FLDCW16m)
33048 NODE_NAME_CASE(LCMPXCHG_DAG)
33049 NODE_NAME_CASE(LCMPXCHG8_DAG)
33050 NODE_NAME_CASE(LCMPXCHG16_DAG)
33051 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33052 NODE_NAME_CASE(LADD)
33053 NODE_NAME_CASE(LSUB)
33054 NODE_NAME_CASE(LOR)
33055 NODE_NAME_CASE(LXOR)
33056 NODE_NAME_CASE(LAND)
33057 NODE_NAME_CASE(LBTS)
33058 NODE_NAME_CASE(LBTC)
33059 NODE_NAME_CASE(LBTR)
33060 NODE_NAME_CASE(VZEXT_MOVL)
33061 NODE_NAME_CASE(VZEXT_LOAD)
33062 NODE_NAME_CASE(VEXTRACT_STORE)
33063 NODE_NAME_CASE(VTRUNC)
33064 NODE_NAME_CASE(VTRUNCS)
33065 NODE_NAME_CASE(VTRUNCUS)
33066 NODE_NAME_CASE(VMTRUNC)
33067 NODE_NAME_CASE(VMTRUNCS)
33068 NODE_NAME_CASE(VMTRUNCUS)
33069 NODE_NAME_CASE(VTRUNCSTORES)
33070 NODE_NAME_CASE(VTRUNCSTOREUS)
33071 NODE_NAME_CASE(VMTRUNCSTORES)
33072 NODE_NAME_CASE(VMTRUNCSTOREUS)
33073 NODE_NAME_CASE(VFPEXT)
33074 NODE_NAME_CASE(STRICT_VFPEXT)
33075 NODE_NAME_CASE(VFPEXT_SAE)
33076 NODE_NAME_CASE(VFPEXTS)
33077 NODE_NAME_CASE(VFPEXTS_SAE)
33078 NODE_NAME_CASE(VFPROUND)
33079 NODE_NAME_CASE(STRICT_VFPROUND)
33080 NODE_NAME_CASE(VMFPROUND)
33081 NODE_NAME_CASE(VFPROUND_RND)
33082 NODE_NAME_CASE(VFPROUNDS)
33083 NODE_NAME_CASE(VFPROUNDS_RND)
33084 NODE_NAME_CASE(VSHLDQ)
33085 NODE_NAME_CASE(VSRLDQ)
33086 NODE_NAME_CASE(VSHL)
33087 NODE_NAME_CASE(VSRL)
33088 NODE_NAME_CASE(VSRA)
33089 NODE_NAME_CASE(VSHLI)
33090 NODE_NAME_CASE(VSRLI)
33091 NODE_NAME_CASE(VSRAI)
33092 NODE_NAME_CASE(VSHLV)
33093 NODE_NAME_CASE(VSRLV)
33094 NODE_NAME_CASE(VSRAV)
33095 NODE_NAME_CASE(VROTLI)
33096 NODE_NAME_CASE(VROTRI)
33097 NODE_NAME_CASE(VPPERM)
33098 NODE_NAME_CASE(CMPP)
33099 NODE_NAME_CASE(STRICT_CMPP)
33100 NODE_NAME_CASE(PCMPEQ)
33101 NODE_NAME_CASE(PCMPGT)
33102 NODE_NAME_CASE(PHMINPOS)
33103 NODE_NAME_CASE(ADD)
33104 NODE_NAME_CASE(SUB)
33105 NODE_NAME_CASE(ADC)
33106 NODE_NAME_CASE(SBB)
33107 NODE_NAME_CASE(SMUL)
33108 NODE_NAME_CASE(UMUL)
33109 NODE_NAME_CASE(OR)
33110 NODE_NAME_CASE(XOR)
33111 NODE_NAME_CASE(AND)
33112 NODE_NAME_CASE(BEXTR)
33113 NODE_NAME_CASE(BEXTRI)
33114 NODE_NAME_CASE(BZHI)
33115 NODE_NAME_CASE(PDEP)
33116 NODE_NAME_CASE(PEXT)
33117 NODE_NAME_CASE(MUL_IMM)
33118 NODE_NAME_CASE(MOVMSK)
33119 NODE_NAME_CASE(PTEST)
33120 NODE_NAME_CASE(TESTP)
33121 NODE_NAME_CASE(KORTEST)
33122 NODE_NAME_CASE(KTEST)
33123 NODE_NAME_CASE(KADD)
33124 NODE_NAME_CASE(KSHIFTL)
33125 NODE_NAME_CASE(KSHIFTR)
33126 NODE_NAME_CASE(PACKSS)
33127 NODE_NAME_CASE(PACKUS)
33128 NODE_NAME_CASE(PALIGNR)
33129 NODE_NAME_CASE(VALIGN)
33130 NODE_NAME_CASE(VSHLD)
33131 NODE_NAME_CASE(VSHRD)
33132 NODE_NAME_CASE(VSHLDV)
33133 NODE_NAME_CASE(VSHRDV)
33134 NODE_NAME_CASE(PSHUFD)
33135 NODE_NAME_CASE(PSHUFHW)
33136 NODE_NAME_CASE(PSHUFLW)
33137 NODE_NAME_CASE(SHUFP)
33138 NODE_NAME_CASE(SHUF128)
33139 NODE_NAME_CASE(MOVLHPS)
33140 NODE_NAME_CASE(MOVHLPS)
33141 NODE_NAME_CASE(MOVDDUP)
33142 NODE_NAME_CASE(MOVSHDUP)
33143 NODE_NAME_CASE(MOVSLDUP)
33144 NODE_NAME_CASE(MOVSD)
33145 NODE_NAME_CASE(MOVSS)
33146 NODE_NAME_CASE(MOVSH)
33147 NODE_NAME_CASE(UNPCKL)
33148 NODE_NAME_CASE(UNPCKH)
33149 NODE_NAME_CASE(VBROADCAST)
33150 NODE_NAME_CASE(VBROADCAST_LOAD)
33151 NODE_NAME_CASE(VBROADCASTM)
33152 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33153 NODE_NAME_CASE(VPERMILPV)
33154 NODE_NAME_CASE(VPERMILPI)
33155 NODE_NAME_CASE(VPERM2X128)
33156 NODE_NAME_CASE(VPERMV)
33157 NODE_NAME_CASE(VPERMV3)
33158 NODE_NAME_CASE(VPERMI)
33159 NODE_NAME_CASE(VPTERNLOG)
33160 NODE_NAME_CASE(VFIXUPIMM)
33161 NODE_NAME_CASE(VFIXUPIMM_SAE)
33162 NODE_NAME_CASE(VFIXUPIMMS)
33163 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33164 NODE_NAME_CASE(VRANGE)
33165 NODE_NAME_CASE(VRANGE_SAE)
33166 NODE_NAME_CASE(VRANGES)
33167 NODE_NAME_CASE(VRANGES_SAE)
33168 NODE_NAME_CASE(PMULUDQ)
33169 NODE_NAME_CASE(PMULDQ)
33170 NODE_NAME_CASE(PSADBW)
33171 NODE_NAME_CASE(DBPSADBW)
33172 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33173 NODE_NAME_CASE(VAARG_64)
33174 NODE_NAME_CASE(VAARG_X32)
33175 NODE_NAME_CASE(DYN_ALLOCA)
33176 NODE_NAME_CASE(MEMBARRIER)
33177 NODE_NAME_CASE(MFENCE)
33178 NODE_NAME_CASE(SEG_ALLOCA)
33179 NODE_NAME_CASE(PROBED_ALLOCA)
33180 NODE_NAME_CASE(RDRAND)
33181 NODE_NAME_CASE(RDSEED)
33182 NODE_NAME_CASE(RDPKRU)
33183 NODE_NAME_CASE(WRPKRU)
33184 NODE_NAME_CASE(VPMADDUBSW)
33185 NODE_NAME_CASE(VPMADDWD)
33186 NODE_NAME_CASE(VPSHA)
33187 NODE_NAME_CASE(VPSHL)
33188 NODE_NAME_CASE(VPCOM)
33189 NODE_NAME_CASE(VPCOMU)
33190 NODE_NAME_CASE(VPERMIL2)
33191 NODE_NAME_CASE(FMSUB)
33192 NODE_NAME_CASE(STRICT_FMSUB)
33193 NODE_NAME_CASE(FNMADD)
33194 NODE_NAME_CASE(STRICT_FNMADD)
33195 NODE_NAME_CASE(FNMSUB)
33196 NODE_NAME_CASE(STRICT_FNMSUB)
33197 NODE_NAME_CASE(FMADDSUB)
33198 NODE_NAME_CASE(FMSUBADD)
33199 NODE_NAME_CASE(FMADD_RND)
33200 NODE_NAME_CASE(FNMADD_RND)
33201 NODE_NAME_CASE(FMSUB_RND)
33202 NODE_NAME_CASE(FNMSUB_RND)
33203 NODE_NAME_CASE(FMADDSUB_RND)
33204 NODE_NAME_CASE(FMSUBADD_RND)
33205 NODE_NAME_CASE(VFMADDC)
33206 NODE_NAME_CASE(VFMADDC_RND)
33207 NODE_NAME_CASE(VFCMADDC)
33208 NODE_NAME_CASE(VFCMADDC_RND)
33209 NODE_NAME_CASE(VFMULC)
33210 NODE_NAME_CASE(VFMULC_RND)
33211 NODE_NAME_CASE(VFCMULC)
33212 NODE_NAME_CASE(VFCMULC_RND)
33213 NODE_NAME_CASE(VFMULCSH)
33214 NODE_NAME_CASE(VFMULCSH_RND)
33215 NODE_NAME_CASE(VFCMULCSH)
33216 NODE_NAME_CASE(VFCMULCSH_RND)
33217 NODE_NAME_CASE(VFMADDCSH)
33218 NODE_NAME_CASE(VFMADDCSH_RND)
33219 NODE_NAME_CASE(VFCMADDCSH)
33220 NODE_NAME_CASE(VFCMADDCSH_RND)
33221 NODE_NAME_CASE(VPMADD52H)
33222 NODE_NAME_CASE(VPMADD52L)
33223 NODE_NAME_CASE(VRNDSCALE)
33224 NODE_NAME_CASE(STRICT_VRNDSCALE)
33225 NODE_NAME_CASE(VRNDSCALE_SAE)
33226 NODE_NAME_CASE(VRNDSCALES)
33227 NODE_NAME_CASE(VRNDSCALES_SAE)
33228 NODE_NAME_CASE(VREDUCE)
33229 NODE_NAME_CASE(VREDUCE_SAE)
33230 NODE_NAME_CASE(VREDUCES)
33231 NODE_NAME_CASE(VREDUCES_SAE)
33232 NODE_NAME_CASE(VGETMANT)
33233 NODE_NAME_CASE(VGETMANT_SAE)
33234 NODE_NAME_CASE(VGETMANTS)
33235 NODE_NAME_CASE(VGETMANTS_SAE)
33236 NODE_NAME_CASE(PCMPESTR)
33237 NODE_NAME_CASE(PCMPISTR)
33238 NODE_NAME_CASE(XTEST)
33239 NODE_NAME_CASE(COMPRESS)
33240 NODE_NAME_CASE(EXPAND)
33241 NODE_NAME_CASE(SELECTS)
33242 NODE_NAME_CASE(ADDSUB)
33243 NODE_NAME_CASE(RCP14)
33244 NODE_NAME_CASE(RCP14S)
33245 NODE_NAME_CASE(RCP28)
33246 NODE_NAME_CASE(RCP28_SAE)
33247 NODE_NAME_CASE(RCP28S)
33248 NODE_NAME_CASE(RCP28S_SAE)
33249 NODE_NAME_CASE(EXP2)
33250 NODE_NAME_CASE(EXP2_SAE)
33251 NODE_NAME_CASE(RSQRT14)
33252 NODE_NAME_CASE(RSQRT14S)
33253 NODE_NAME_CASE(RSQRT28)
33254 NODE_NAME_CASE(RSQRT28_SAE)
33255 NODE_NAME_CASE(RSQRT28S)
33256 NODE_NAME_CASE(RSQRT28S_SAE)
33257 NODE_NAME_CASE(FADD_RND)
33258 NODE_NAME_CASE(FADDS)
33259 NODE_NAME_CASE(FADDS_RND)
33260 NODE_NAME_CASE(FSUB_RND)
33261 NODE_NAME_CASE(FSUBS)
33262 NODE_NAME_CASE(FSUBS_RND)
33263 NODE_NAME_CASE(FMUL_RND)
33264 NODE_NAME_CASE(FMULS)
33265 NODE_NAME_CASE(FMULS_RND)
33266 NODE_NAME_CASE(FDIV_RND)
33267 NODE_NAME_CASE(FDIVS)
33268 NODE_NAME_CASE(FDIVS_RND)
33269 NODE_NAME_CASE(FSQRT_RND)
33270 NODE_NAME_CASE(FSQRTS)
33271 NODE_NAME_CASE(FSQRTS_RND)
33272 NODE_NAME_CASE(FGETEXP)
33273 NODE_NAME_CASE(FGETEXP_SAE)
33274 NODE_NAME_CASE(FGETEXPS)
33275 NODE_NAME_CASE(FGETEXPS_SAE)
33276 NODE_NAME_CASE(SCALEF)
33277 NODE_NAME_CASE(SCALEF_RND)
33278 NODE_NAME_CASE(SCALEFS)
33279 NODE_NAME_CASE(SCALEFS_RND)
33280 NODE_NAME_CASE(MULHRS)
33281 NODE_NAME_CASE(SINT_TO_FP_RND)
33282 NODE_NAME_CASE(UINT_TO_FP_RND)
33283 NODE_NAME_CASE(CVTTP2SI)
33284 NODE_NAME_CASE(CVTTP2UI)
33285 NODE_NAME_CASE(STRICT_CVTTP2SI)
33286 NODE_NAME_CASE(STRICT_CVTTP2UI)
33287 NODE_NAME_CASE(MCVTTP2SI)
33288 NODE_NAME_CASE(MCVTTP2UI)
33289 NODE_NAME_CASE(CVTTP2SI_SAE)
33290 NODE_NAME_CASE(CVTTP2UI_SAE)
33291 NODE_NAME_CASE(CVTTS2SI)
33292 NODE_NAME_CASE(CVTTS2UI)
33293 NODE_NAME_CASE(CVTTS2SI_SAE)
33294 NODE_NAME_CASE(CVTTS2UI_SAE)
33295 NODE_NAME_CASE(CVTSI2P)
33296 NODE_NAME_CASE(CVTUI2P)
33297 NODE_NAME_CASE(STRICT_CVTSI2P)
33298 NODE_NAME_CASE(STRICT_CVTUI2P)
33299 NODE_NAME_CASE(MCVTSI2P)
33300 NODE_NAME_CASE(MCVTUI2P)
33301 NODE_NAME_CASE(VFPCLASS)
33302 NODE_NAME_CASE(VFPCLASSS)
33303 NODE_NAME_CASE(MULTISHIFT)
33304 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33305 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33306 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33307 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33308 NODE_NAME_CASE(CVTPS2PH)
33309 NODE_NAME_CASE(STRICT_CVTPS2PH)
33310 NODE_NAME_CASE(MCVTPS2PH)
33311 NODE_NAME_CASE(CVTPH2PS)
33312 NODE_NAME_CASE(STRICT_CVTPH2PS)
33313 NODE_NAME_CASE(CVTPH2PS_SAE)
33314 NODE_NAME_CASE(CVTP2SI)
33315 NODE_NAME_CASE(CVTP2UI)
33316 NODE_NAME_CASE(MCVTP2SI)
33317 NODE_NAME_CASE(MCVTP2UI)
33318 NODE_NAME_CASE(CVTP2SI_RND)
33319 NODE_NAME_CASE(CVTP2UI_RND)
33320 NODE_NAME_CASE(CVTS2SI)
33321 NODE_NAME_CASE(CVTS2UI)
33322 NODE_NAME_CASE(CVTS2SI_RND)
33323 NODE_NAME_CASE(CVTS2UI_RND)
33324 NODE_NAME_CASE(CVTNE2PS2BF16)
33325 NODE_NAME_CASE(CVTNEPS2BF16)
33326 NODE_NAME_CASE(MCVTNEPS2BF16)
33327 NODE_NAME_CASE(DPBF16PS)
33328 NODE_NAME_CASE(LWPINS)
33329 NODE_NAME_CASE(MGATHER)
33330 NODE_NAME_CASE(MSCATTER)
33331 NODE_NAME_CASE(VPDPBUSD)
33332 NODE_NAME_CASE(VPDPBUSDS)
33333 NODE_NAME_CASE(VPDPWSSD)
33334 NODE_NAME_CASE(VPDPWSSDS)
33335 NODE_NAME_CASE(VPSHUFBITQMB)
33336 NODE_NAME_CASE(GF2P8MULB)
33337 NODE_NAME_CASE(GF2P8AFFINEQB)
33338 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33339 NODE_NAME_CASE(NT_CALL)
33340 NODE_NAME_CASE(NT_BRIND)
33341 NODE_NAME_CASE(UMWAIT)
33342 NODE_NAME_CASE(TPAUSE)
33343 NODE_NAME_CASE(ENQCMD)
33344 NODE_NAME_CASE(ENQCMDS)
33345 NODE_NAME_CASE(VP2INTERSECT)
33346 NODE_NAME_CASE(AESENC128KL)
33347 NODE_NAME_CASE(AESDEC128KL)
33348 NODE_NAME_CASE(AESENC256KL)
33349 NODE_NAME_CASE(AESDEC256KL)
33350 NODE_NAME_CASE(AESENCWIDE128KL)
33351 NODE_NAME_CASE(AESDECWIDE128KL)
33352 NODE_NAME_CASE(AESENCWIDE256KL)
33353 NODE_NAME_CASE(AESDECWIDE256KL)
33354 NODE_NAME_CASE(TESTUI)
33355 }
33356 return nullptr;
33357#undef NODE_NAME_CASE
33358}
33359
33360/// Return true if the addressing mode represented by AM is legal for this
33361/// target, for a load/store of the specified type.
33362bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33363 const AddrMode &AM, Type *Ty,
33364 unsigned AS,
33365 Instruction *I) const {
33366 // X86 supports extremely general addressing modes.
33367 CodeModel::Model M = getTargetMachine().getCodeModel();
33368
33369 // X86 allows a sign-extended 32-bit immediate field as a displacement.
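// For example, a BaseOffs of 2147483648 (1LL << 31) does not fit the signed
// 32-bit displacement field and therefore cannot be folded into the address.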
33370 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33371 return false;
33372
33373 if (AM.BaseGV) {
33374 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33375
33376 // If a reference to this global requires an extra load, we can't fold it.
33377 if (isGlobalStubReference(GVFlags))
33378 return false;
33379
33380 // If BaseGV requires a register for the PIC base, we cannot also have a
33381 // BaseReg specified.
33382 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33383 return false;
33384
33385 // If lower 4G is not available, then we must use rip-relative addressing.
33386 if ((M != CodeModel::Small || isPositionIndependent()) &&
33387 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33388 return false;
33389 }
33390
33391 switch (AM.Scale) {
33392 case 0:
33393 case 1:
33394 case 2:
33395 case 4:
33396 case 8:
33397 // These scales always work.
33398 break;
33399 case 3:
33400 case 5:
33401 case 9:
33402 // These scales are formed with basereg+scalereg. Only accept if there is
33403 // no basereg yet.
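// For example, x*3 can be formed as lea (%rax,%rax,2), %rcx: the scaled
// register occupies both the base and index slots of the SIB byte, leaving
// no room for a separate base register.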
33404 if (AM.HasBaseReg)
33405 return false;
33406 break;
33407 default: // Other stuff never works.
33408 return false;
33409 }
33410
33411 return true;
33412}
33413
33414bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33415 unsigned Bits = Ty->getScalarSizeInBits();
33416
33417 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33418 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33419 if (Subtarget.hasXOP() &&
33420 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33421 return false;
33422
33423 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33424 // shifts just as cheap as scalar ones.
33425 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33426 return false;
33427
33428 // AVX512BW has shifts such as vpsllvw.
33429 if (Subtarget.hasBWI() && Bits == 16)
33430 return false;
33431
33432 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33433 // fully general vector.
33434 return true;
33435}
33436
33437bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33438 switch (Opcode) {
33439 // These are non-commutative binops.
33440 // TODO: Add more X86ISD opcodes once we have test coverage.
33441 case X86ISD::ANDNP:
33442 case X86ISD::PCMPGT:
33443 case X86ISD::FMAX:
33444 case X86ISD::FMIN:
33445 case X86ISD::FANDN:
33446 case X86ISD::VPSHA:
33447 case X86ISD::VPSHL:
33448 case X86ISD::VSHLV:
33449 case X86ISD::VSRLV:
33450 case X86ISD::VSRAV:
33451 return true;
33452 }
33453
33454 return TargetLoweringBase::isBinOp(Opcode);
33455}
33456
33457bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33458 switch (Opcode) {
33459 // TODO: Add more X86ISD opcodes once we have test coverage.
33460 case X86ISD::PCMPEQ:
33461 case X86ISD::PMULDQ:
33462 case X86ISD::PMULUDQ:
33463 case X86ISD::FMAXC:
33464 case X86ISD::FMINC:
33465 case X86ISD::FAND:
33466 case X86ISD::FOR:
33467 case X86ISD::FXOR:
33468 return true;
33469 }
33470
33471 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33472}
33473
33474bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33475 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33476 return false;
33477 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33478 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33479 return NumBits1 > NumBits2;
33480}
33481
33482bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
33483 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33484 return false;
33485
33486 if (!isTypeLegal(EVT::getEVT(Ty1)))
33487 return false;
33488
33489 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
33490
33491 // Assuming the caller doesn't have a zeroext or signext return parameter,
33492 // truncation all the way down to i1 is valid.
33493 return true;
33494}
33495
33496bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
33497 return isInt<32>(Imm);
33498}
33499
33500bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
33501 // Can also use sub to handle negated immediates.
33502 return isInt<32>(Imm);
33503}
33504
33505bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
33506 return isInt<32>(Imm);
33507}
33508
33509bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
33510 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
33511 return false;
33512 unsigned NumBits1 = VT1.getSizeInBits();
33513 unsigned NumBits2 = VT2.getSizeInBits();
33514 return NumBits1 > NumBits2;
33515}
33516
33517bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
33518 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
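// For example, 'movl %ecx, %eax' clears bits 63:32 of %rax, so widening an
// i32 value to i64 requires no extra instruction.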
33519 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
33520}
33521
33522bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
33523 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33524 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
33525}
33526
33527bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
33528 EVT VT1 = Val.getValueType();
33529 if (isZExtFree(VT1, VT2))
33530 return true;
33531
33532 if (Val.getOpcode() != ISD::LOAD)
33533 return false;
33534
33535 if (!VT1.isSimple() || !VT1.isInteger() ||
33536 !VT2.isSimple() || !VT2.isInteger())
33537 return false;
33538
33539 switch (VT1.getSimpleVT().SimpleTy) {
33540 default: break;
33541 case MVT::i8:
33542 case MVT::i16:
33543 case MVT::i32:
33544 // X86 has 8, 16, and 32-bit zero-extending loads.
33545 return true;
33546 }
33547
33548 return false;
33549}
33550
33551bool X86TargetLowering::shouldSinkOperands(Instruction *I,
33552 SmallVectorImpl<Use *> &Ops) const {
33553 using namespace llvm::PatternMatch;
33554
33555 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
33556 if (!VTy)
33557 return false;
33558
33559 if (I->getOpcode() == Instruction::Mul &&
33560 VTy->getElementType()->isIntegerTy(64)) {
33561 for (auto &Op : I->operands()) {
33562 // Make sure we are not already sinking this operand
33563 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
33564 continue;
33565
33566 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
33567 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
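// For illustration, a sext_inreg of the low 32 bits appears in IR as
//   %t = shl <2 x i64> %x, <i64 32, i64 32>
//   %s = ashr <2 x i64> %t, <i64 32, i64 32>
// and sinking it next to the multiply lets instruction selection form PMULDQ.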
33568 if (Subtarget.hasSSE41() &&
33569 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
33570 m_SpecificInt(32)))) {
33571 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
33572 Ops.push_back(&Op);
33573 } else if (Subtarget.hasSSE2() &&
33574 match(Op.get(),
33575 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
33576 Ops.push_back(&Op);
33577 }
33578 }
33579
33580 return !Ops.empty();
33581 }
33582
33583 // A uniform shift amount in a vector shift or funnel shift may be much
33584 // cheaper than a generic variable vector shift, so make that pattern visible
33585 // to SDAG by sinking the shuffle instruction next to the shift.
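// For illustration, a 'shl <4 x i32> %x, %amt' where %amt is a shufflevector
// splat of a single scalar can then be selected as one PSLLD that takes its
// count from a scalar (xmm) operand.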
33586 int ShiftAmountOpNum = -1;
33587 if (I->isShift())
33588 ShiftAmountOpNum = 1;
33589 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
33590 if (II->getIntrinsicID() == Intrinsic::fshl ||
33591 II->getIntrinsicID() == Intrinsic::fshr)
33592 ShiftAmountOpNum = 2;
33593 }
33594
33595 if (ShiftAmountOpNum == -1)
33596 return false;
33597
33598 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
33599 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
33600 isVectorShiftByScalarCheap(I->getType())) {
33601 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
33602 return true;
33603 }
33604
33605 return false;
33606}
33607
33608bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
33609 if (!Subtarget.is64Bit())
33610 return false;
33611 return TargetLowering::shouldConvertPhiType(From, To);
33612}
33613
33614bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
33615 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
33616 return false;
33617
33618 EVT SrcVT = ExtVal.getOperand(0).getValueType();
33619
33620 // There is no extending load for vXi1.
33621 if (SrcVT.getScalarType() == MVT::i1)
33622 return false;
33623
33624 return true;
33625}
33626
33627bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
33628 EVT VT) const {
33629 if (!Subtarget.hasAnyFMA())
33630 return false;
33631
33632 VT = VT.getScalarType();
33633
33634 if (!VT.isSimple())
33635 return false;
33636
33637 switch (VT.getSimpleVT().SimpleTy) {
33638 case MVT::f16:
33639 return Subtarget.hasFP16();
33640 case MVT::f32:
33641 case MVT::f64:
33642 return true;
33643 default:
33644 break;
33645 }
33646
33647 return false;
33648}
33649
33650bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
33651 // i16 instructions are longer (0x66 prefix) and potentially slower.
33652 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
33653}
33654
33655bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
33656 EVT VT) const {
33657 // TODO: This is too general. There are cases where pre-AVX512 codegen would
33658 // benefit. The transform may also be profitable for scalar code.
33659 if (!Subtarget.hasAVX512())
33660 return false;
33661 if (!Subtarget.hasVLX() && !VT.is512BitVector())
33662 return false;
33663 if (!VT.isVector())
33664 return false;
33665
33666 return true;
33667}
33668
33669/// Targets can use this to indicate that they only support *some*
33670/// VECTOR_SHUFFLE operations, those with specific masks.
33671/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
33672/// are assumed to be legal.
33673bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
33674 if (!VT.isSimple())
33675 return false;
33676
33677 // Not for i1 vectors
33678 if (VT.getSimpleVT().getScalarType() == MVT::i1)
33679 return false;
33680
33681 // Very little shuffling can be done for 64-bit vectors right now.
33682 if (VT.getSimpleVT().getSizeInBits() == 64)
33683 return false;
33684
33685 // We only care that the types being shuffled are legal. The lowering can
33686 // handle any possible shuffle mask that results.
33687 return isTypeLegal(VT.getSimpleVT());
33688}
33689
33690bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
33691 EVT VT) const {
33692 // Don't convert an 'and' into a shuffle that we don't directly support.
33693 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
33694 if (!Subtarget.hasAVX2())
33695 if (VT == MVT::v32i8 || VT == MVT::v16i16)
33696 return false;
33697
33698 // Just delegate to the generic legality, clear masks aren't special.
33699 return isShuffleMaskLegal(Mask, VT);
33700}
33701
33702bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
33703 // If the subtarget is using thunks, we need to not generate jump tables.
33704 if (Subtarget.useIndirectThunkBranches())
33705 return false;
33706
33707 // Otherwise, fallback on the generic logic.
33708 return TargetLowering::areJTsAllowed(Fn);
33709}
33710
33711//===----------------------------------------------------------------------===//
33712// X86 Scheduler Hooks
33713//===----------------------------------------------------------------------===//
33714
33715// Returns true if EFLAGS is consumed after this iterator in the rest of the
33716// basic block or any successors of the basic block.
33717static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
33718 MachineBasicBlock *BB) {
33719 // Scan forward through BB for a use/def of EFLAGS.
33720 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
33721 if (mi.readsRegister(X86::EFLAGS))
33722 return true;
33723 // If we found a def, we can stop searching.
33724 if (mi.definesRegister(X86::EFLAGS))
33725 return false;
33726 }
33727
33728 // If we hit the end of the block, check whether EFLAGS is live into a
33729 // successor.
33730 for (MachineBasicBlock *Succ : BB->successors())
33731 if (Succ->isLiveIn(X86::EFLAGS))
33732 return true;
33733
33734 return false;
33735}
33736
33737/// Utility function to emit xbegin specifying the start of an RTM region.
33738static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
33739 const TargetInstrInfo *TII) {
33740 const DebugLoc &DL = MI.getDebugLoc();
33741
33742 const BasicBlock *BB = MBB->getBasicBlock();
33743 MachineFunction::iterator I = ++MBB->getIterator();
33744
33745 // For the v = xbegin(), we generate
33746 //
33747 // thisMBB:
33748 // xbegin sinkMBB
33749 //
33750 // mainMBB:
33751 // s0 = -1
33752 //
33753 // fallBB:
33754 // eax = # XABORT_DEF
33755 // s1 = eax
33756 //
33757 // sinkMBB:
33758 // v = phi(s0/mainBB, s1/fallBB)
33759
33760 MachineBasicBlock *thisMBB = MBB;
33761 MachineFunction *MF = MBB->getParent();
33762 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33763 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33764 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33765 MF->insert(I, mainMBB);
33766 MF->insert(I, fallMBB);
33767 MF->insert(I, sinkMBB);
33768
33769 if (isEFLAGSLiveAfter(MI, MBB)) {
33770 mainMBB->addLiveIn(X86::EFLAGS);
33771 fallMBB->addLiveIn(X86::EFLAGS);
33772 sinkMBB->addLiveIn(X86::EFLAGS);
33773 }
33774
33775 // Transfer the remainder of BB and its successor edges to sinkMBB.
33776 sinkMBB->splice(sinkMBB->begin(), MBB,
33777 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33778 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33779
33780 MachineRegisterInfo &MRI = MF->getRegInfo();
33781 Register DstReg = MI.getOperand(0).getReg();
33782 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33783 Register mainDstReg = MRI.createVirtualRegister(RC);
33784 Register fallDstReg = MRI.createVirtualRegister(RC);
33785
33786 // thisMBB:
33787 // xbegin fallMBB
33788 // # fallthrough to mainMBB
33789 // # on abort, jump to fallMBB
33790 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
33791 thisMBB->addSuccessor(mainMBB);
33792 thisMBB->addSuccessor(fallMBB);
33793
33794 // mainMBB:
33795 // mainDstReg := -1
33796 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
33797 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33798 mainMBB->addSuccessor(sinkMBB);
33799
33800 // fallMBB:
33801 // ; pseudo instruction to model hardware's definition from XABORT
33802 // EAX := XABORT_DEF
33803 // fallDstReg := EAX
33804 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
33805 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
33806 .addReg(X86::EAX);
33807 fallMBB->addSuccessor(sinkMBB);
33808
33809 // sinkMBB:
33810 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
33811 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
33812 .addReg(mainDstReg).addMBB(mainMBB)
33813 .addReg(fallDstReg).addMBB(fallMBB);
33814
33815 MI.eraseFromParent();
33816 return sinkMBB;
33817}
33818
33819MachineBasicBlock *
33820X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
33821 MachineBasicBlock *MBB) const {
33822 // Emit va_arg instruction on X86-64.
33823
33824 // Operands to this pseudo-instruction:
33825 // 0 ) Output : destination address (reg)
33826 // 1-5) Input : va_list address (addr, i64mem)
33827 // 6 ) ArgSize : Size (in bytes) of vararg type
33828 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
33829 // 8 ) Align : Alignment of type
33830 // 9 ) EFLAGS (implicit-def)
33831
33832 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
33833 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
33834
33835 Register DestReg = MI.getOperand(0).getReg();
33836 MachineOperand &Base = MI.getOperand(1);
33837 MachineOperand &Scale = MI.getOperand(2);
33838 MachineOperand &Index = MI.getOperand(3);
33839 MachineOperand &Disp = MI.getOperand(4);
33840 MachineOperand &Segment = MI.getOperand(5);
33841 unsigned ArgSize = MI.getOperand(6).getImm();
33842 unsigned ArgMode = MI.getOperand(7).getImm();
33843 Align Alignment = Align(MI.getOperand(8).getImm());
33844
33845 MachineFunction *MF = MBB->getParent();
33846
33847 // Memory Reference
33848 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
33849
33850 MachineMemOperand *OldMMO = MI.memoperands().front();
33851
33852 // Clone the MMO into two separate MMOs for loading and storing
33853 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
33854 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
33855 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
33856 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
33857
33858 // Machine Information
33859 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33860 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
33861 const TargetRegisterClass *AddrRegClass =
33862 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
33863 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
33864 const DebugLoc &DL = MI.getDebugLoc();
33865
33866 // struct va_list {
33867 // i32 gp_offset
33868 // i32 fp_offset
33869 // i64 overflow_area (address)
33870 // i64 reg_save_area (address)
33871 // }
33872 // sizeof(va_list) = 24
33873 // alignment(va_list) = 8
33874
33875 unsigned TotalNumIntRegs = 6;
33876 unsigned TotalNumXMMRegs = 8;
33877 bool UseGPOffset = (ArgMode == 1);
33878 bool UseFPOffset = (ArgMode == 2);
33879 unsigned MaxOffset = TotalNumIntRegs * 8 +
33880 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
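// For example, MaxOffset = 6*8 = 48 when only gp_offset is used, and
// 48 + 8*16 = 176 when fp_offset is used, matching the integer and XMM
// register save areas of the SysV AMD64 ABI.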
33881
33882 /* Align ArgSize to a multiple of 8 */
33883 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
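// For example, ArgSize = 12 gives ArgSizeA8 = (12 + 7) & ~7 = 16.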
33884 bool NeedsAlign = (Alignment > 8);
33885
33886 MachineBasicBlock *thisMBB = MBB;
33887 MachineBasicBlock *overflowMBB;
33888 MachineBasicBlock *offsetMBB;
33889 MachineBasicBlock *endMBB;
33890
33891 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
33892 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
33893 unsigned OffsetReg = 0;
33894
33895 if (!UseGPOffset && !UseFPOffset) {
33896 // If we only pull from the overflow region, we don't create a branch.
33897 // We don't need to alter control flow.
33898 OffsetDestReg = 0; // unused
33899 OverflowDestReg = DestReg;
33900
33901 offsetMBB = nullptr;
33902 overflowMBB = thisMBB;
33903 endMBB = thisMBB;
33904 } else {
33905 // First emit code to check if gp_offset (or fp_offset) is below the bound.
33906 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33907 // If not, pull from overflow_area. (branch to overflowMBB)
33908 //
33909 // thisMBB
33910 //   |      .
33911 //   |         .
33912 // offsetMBB   overflowMBB
33913 //   |         .
33914 //   |      .
33915 // endMBB
33916
33917 // Registers for the PHI in endMBB
33918 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33919 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33920
33921 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33922 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33923 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33924 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33925
33926 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33927
33928 // Insert the new basic blocks
33929 MF->insert(MBBIter, offsetMBB);
33930 MF->insert(MBBIter, overflowMBB);
33931 MF->insert(MBBIter, endMBB);
33932
33933 // Transfer the remainder of MBB and its successor edges to endMBB.
33934 endMBB->splice(endMBB->begin(), thisMBB,
33935 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
33936 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
33937
33938 // Make offsetMBB and overflowMBB successors of thisMBB
33939 thisMBB->addSuccessor(offsetMBB);
33940 thisMBB->addSuccessor(overflowMBB);
33941
33942 // endMBB is a successor of both offsetMBB and overflowMBB
33943 offsetMBB->addSuccessor(endMBB);
33944 overflowMBB->addSuccessor(endMBB);
33945
33946 // Load the offset value into a register
33947 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33948 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
33949 .add(Base)
33950 .add(Scale)
33951 .add(Index)
33952 .addDisp(Disp, UseFPOffset ? 4 : 0)
33953 .add(Segment)
33954 .setMemRefs(LoadOnlyMMO);
33955
33956 // Check if there is enough room left to pull this argument.
33957 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
33958 .addReg(OffsetReg)
33959 .addImm(MaxOffset + 8 - ArgSizeA8);
33960
33961 // Branch to "overflowMBB" if offset >= max
33962 // Fall through to "offsetMBB" otherwise
33963 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
33964 .addMBB(overflowMBB).addImm(X86::COND_AE);
33965 }
33966
33967 // In offsetMBB, emit code to use the reg_save_area.
33968 if (offsetMBB) {
33969 assert(OffsetReg != 0);
33970
33971 // Read the reg_save_area address.
33972 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
33973 BuildMI(
33974 offsetMBB, DL,
33975 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33976 RegSaveReg)
33977 .add(Base)
33978 .add(Scale)
33979 .add(Index)
33980 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
33981 .add(Segment)
33982 .setMemRefs(LoadOnlyMMO);
33983
33984 if (Subtarget.isTarget64BitLP64()) {
33985 // Zero-extend the offset
33986 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
33987 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
33988 .addImm(0)
33989 .addReg(OffsetReg)
33990 .addImm(X86::sub_32bit);
33991
33992 // Add the offset to the reg_save_area to get the final address.
33993 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
33994 .addReg(OffsetReg64)
33995 .addReg(RegSaveReg);
33996 } else {
33997 // Add the offset to the reg_save_area to get the final address.
33998 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
33999 .addReg(OffsetReg)
34000 .addReg(RegSaveReg);
34001 }
34002
34003 // Compute the offset for the next argument
34004 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34005 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
34006 .addReg(OffsetReg)
34007 .addImm(UseFPOffset ? 16 : 8);
34008
34009 // Store it back into the va_list.
34010 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
34011 .add(Base)
34012 .add(Scale)
34013 .add(Index)
34014 .addDisp(Disp, UseFPOffset ? 4 : 0)
34015 .add(Segment)
34016 .addReg(NextOffsetReg)
34017 .setMemRefs(StoreOnlyMMO);
34018
34019 // Jump to endMBB
34020 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
34021 .addMBB(endMBB);
34022 }
34023
34024 //
34025 // Emit code to use overflow area
34026 //
34027
34028 // Load the overflow_area address into a register.
34029 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34030 BuildMI(overflowMBB, DL,
34031 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34032 OverflowAddrReg)
34033 .add(Base)
34034 .add(Scale)
34035 .add(Index)
34036 .addDisp(Disp, 8)
34037 .add(Segment)
34038 .setMemRefs(LoadOnlyMMO);
34039
34040 // If we need to align it, do so. Otherwise, just copy the address
34041 // to OverflowDestReg.
34042 if (NeedsAlign) {
34043 // Align the overflow address
34044 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34045
34046 // aligned_addr = (addr + (align-1)) & ~(align-1)
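// For example, with a 16-byte alignment this is (addr + 15) & ~15, so an
// address of 0x1004 is rounded up to 0x1010.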
34047 BuildMI(
34048 overflowMBB, DL,
34049 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34050 TmpReg)
34051 .addReg(OverflowAddrReg)
34052 .addImm(Alignment.value() - 1);
34053
34054 BuildMI(
34055 overflowMBB, DL,
34056 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34057 OverflowDestReg)
34058 .addReg(TmpReg)
34059 .addImm(~(uint64_t)(Alignment.value() - 1));
34060 } else {
34061 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
34062 .addReg(OverflowAddrReg);
34063 }
34064
34065 // Compute the next overflow address after this argument.
34066 // (the overflow address should be kept 8-byte aligned)
34067 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34068 BuildMI(
34069 overflowMBB, DL,
34070 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34071 NextAddrReg)
34072 .addReg(OverflowDestReg)
34073 .addImm(ArgSizeA8);
34074
34075 // Store the new overflow address.
34076 BuildMI(overflowMBB, DL,
34077 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34078 .add(Base)
34079 .add(Scale)
34080 .add(Index)
34081 .addDisp(Disp, 8)
34082 .add(Segment)
34083 .addReg(NextAddrReg)
34084 .setMemRefs(StoreOnlyMMO);
34085
34086 // If we branched, emit the PHI to the front of endMBB.
34087 if (offsetMBB) {
34088 BuildMI(*endMBB, endMBB->begin(), DL,
34089 TII->get(X86::PHI), DestReg)
34090 .addReg(OffsetDestReg).addMBB(offsetMBB)
34091 .addReg(OverflowDestReg).addMBB(overflowMBB);
34092 }
34093
34094 // Erase the pseudo instruction
34095 MI.eraseFromParent();
34096
34097 return endMBB;
34098}
34099
34100// The EFLAGS operand of SelectItr might be missing a kill marker
34101// because there were multiple uses of EFLAGS, and ISel didn't know
34102// which to mark. Figure out whether SelectItr should have had a
34103// kill marker, and set it if it should. Returns the correct kill
34104// marker value.
34105static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34106 MachineBasicBlock* BB,
34107 const TargetRegisterInfo* TRI) {
34108 if (isEFLAGSLiveAfter(SelectItr, BB))
34109 return false;
34110
34111 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34112 // out. SelectMI should have a kill flag on EFLAGS.
34113 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34114 return true;
34115}
34116
34117// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34118// together with other CMOV pseudo-opcodes into a single basic-block with
34119// conditional jump around it.
34120static bool isCMOVPseudo(MachineInstr &MI) {
34121 switch (MI.getOpcode()) {
34122 case X86::CMOV_FR16X:
34123 case X86::CMOV_FR32:
34124 case X86::CMOV_FR32X:
34125 case X86::CMOV_FR64:
34126 case X86::CMOV_FR64X:
34127 case X86::CMOV_GR8:
34128 case X86::CMOV_GR16:
34129 case X86::CMOV_GR32:
34130 case X86::CMOV_RFP32:
34131 case X86::CMOV_RFP64:
34132 case X86::CMOV_RFP80:
34133 case X86::CMOV_VR64:
34134 case X86::CMOV_VR128:
34135 case X86::CMOV_VR128X:
34136 case X86::CMOV_VR256:
34137 case X86::CMOV_VR256X:
34138 case X86::CMOV_VR512:
34139 case X86::CMOV_VK1:
34140 case X86::CMOV_VK2:
34141 case X86::CMOV_VK4:
34142 case X86::CMOV_VK8:
34143 case X86::CMOV_VK16:
34144 case X86::CMOV_VK32:
34145 case X86::CMOV_VK64:
34146 return true;
34147
34148 default:
34149 return false;
34150 }
34151}
34152
34153// Helper function, which inserts PHI functions into SinkMBB:
34154// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34155// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
34156// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
34157// the last PHI function inserted.
34158static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34159 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34160 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34161 MachineBasicBlock *SinkMBB) {
34162 MachineFunction *MF = TrueMBB->getParent();
34163 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34164 const DebugLoc &DL = MIItBegin->getDebugLoc();
34165
34166 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34167 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34168
34169 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34170
34171 // As we are creating the PHIs, we have to be careful if there is more than
34172 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34173 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34174 // That also means that PHI construction must work forward from earlier to
34175 // later, and that the code must maintain a mapping from earlier PHI's
34176 // destination registers, and the registers that went into the PHI.
34177 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34178 MachineInstrBuilder MIB;
34179
34180 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34181 Register DestReg = MIIt->getOperand(0).getReg();
34182 Register Op1Reg = MIIt->getOperand(1).getReg();
34183 Register Op2Reg = MIIt->getOperand(2).getReg();
34184
34185 // If this CMOV we are generating is the opposite condition from
34186 // the jump we generated, then we have to swap the operands for the
34187 // PHI that is going to be generated.
34188 if (MIIt->getOperand(3).getImm() == OppCC)
34189 std::swap(Op1Reg, Op2Reg);
34190
34191 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
34192 Op1Reg = RegRewriteTable[Op1Reg].first;
34193
34194 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
34195 Op2Reg = RegRewriteTable[Op2Reg].second;
34196
34197 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
34198 .addReg(Op1Reg)
34199 .addMBB(FalseMBB)
34200 .addReg(Op2Reg)
34201 .addMBB(TrueMBB);
34202
34203 // Add this PHI to the rewrite table.
34204 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34205 }
34206
34207 return MIB;
34208}
34209
34210 // Lower cascaded selects of the form (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
34211MachineBasicBlock *
34212X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34213 MachineInstr &SecondCascadedCMOV,
34214 MachineBasicBlock *ThisMBB) const {
34215 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34216 const DebugLoc &DL = FirstCMOV.getDebugLoc();
34217
34218 // We lower cascaded CMOVs such as
34219 //
34220 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34221 //
34222 // to two successive branches.
34223 //
34224 // Without this, we would add a PHI between the two jumps, which ends up
34225 // creating a few copies all around. For instance, for
34226 //
34227 // (sitofp (zext (fcmp une)))
34228 //
34229 // we would generate:
34230 //
34231 // ucomiss %xmm1, %xmm0
34232 // movss <1.0f>, %xmm0
34233 // movaps %xmm0, %xmm1
34234 // jne .LBB5_2
34235 // xorps %xmm1, %xmm1
34236 // .LBB5_2:
34237 // jp .LBB5_4
34238 // movaps %xmm1, %xmm0
34239 // .LBB5_4:
34240 // retq
34241 //
34242 // because this custom-inserter would have generated:
34243 //
34244 //   A
34245 //   | \
34246 //   |  B
34247 //   | /
34248 //   C
34249 //   | \
34250 //   |  D
34251 //   | /
34252 //   E
34253 //
34254 // A: X = ...; Y = ...
34255 // B: empty
34256 // C: Z = PHI [X, A], [Y, B]
34257 // D: empty
34258 // E: PHI [X, C], [Z, D]
34259 //
34260 // If we lower both CMOVs in a single step, we can instead generate:
34261 //
34262 //   A
34263 //   | \
34264 //   |  C
34265 //   | /|
34266 //   |/ |
34267 //   |  |
34268 //   |  D
34269 //   | /
34270 //   E
34271 //
34272 // A: X = ...; Y = ...
34273 // D: empty
34274 // E: PHI [X, A], [X, C], [Y, D]
34275 //
34276 // Which, in our sitofp/fcmp example, gives us something like:
34277 //
34278 // ucomiss %xmm1, %xmm0
34279 // movss <1.0f>, %xmm0
34280 // jne .LBB5_4
34281 // jp .LBB5_4
34282 // xorps %xmm0, %xmm0
34283 // .LBB5_4:
34284 // retq
34285 //
34286
34287 // We lower cascaded CMOV into two successive branches to the same block.
34288 // EFLAGS is used by both, so mark it as live in the second.
34289 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34290 MachineFunction *F = ThisMBB->getParent();
34291 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34292 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34293 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34294
34295 MachineFunction::iterator It = ++ThisMBB->getIterator();
34296 F->insert(It, FirstInsertedMBB);
34297 F->insert(It, SecondInsertedMBB);
34298 F->insert(It, SinkMBB);
34299
34300 // For a cascaded CMOV, we lower it to two successive branches to
34301 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34302 // the FirstInsertedMBB.
34303 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34304
34305 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34306 // live into the sink and copy blocks.
34307 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34308 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
34309 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34310 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34311 SinkMBB->addLiveIn(X86::EFLAGS);
34312 }
34313
34314 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34315 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34316 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34317 ThisMBB->end());
34318 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34319
34320 // Fallthrough block for ThisMBB.
34321 ThisMBB->addSuccessor(FirstInsertedMBB);
34322 // The true block target of the first branch is always SinkMBB.
34323 ThisMBB->addSuccessor(SinkMBB);
34324 // Fallthrough block for FirstInsertedMBB.
34325 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34326 // The true block for the branch of FirstInsertedMBB.
34327 FirstInsertedMBB->addSuccessor(SinkMBB);
34328 // This is fallthrough.
34329 SecondInsertedMBB->addSuccessor(SinkMBB);
34330
34331 // Create the conditional branch instructions.
34332 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34333 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34334
34335 X86::CondCode SecondCC =
34336 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34337 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
34338
34339 // SinkMBB:
34340 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34341 Register DestReg = FirstCMOV.getOperand(0).getReg();
34342 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34343 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34344 MachineInstrBuilder MIB =
34345 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
34346 .addReg(Op1Reg)
34347 .addMBB(SecondInsertedMBB)
34348 .addReg(Op2Reg)
34349 .addMBB(ThisMBB);
34350
34351 // The edge from FirstInsertedMBB provides the same incoming value as the
34352 // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
34353 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34354 // Copy the PHI result to the register defined by the second CMOV.
34355 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
34356 TII->get(TargetOpcode::COPY),
34357 SecondCascadedCMOV.getOperand(0).getReg())
34358 .addReg(FirstCMOV.getOperand(0).getReg());
34359
34360 // Now remove the CMOVs.
34361 FirstCMOV.eraseFromParent();
34362 SecondCascadedCMOV.eraseFromParent();
34363
34364 return SinkMBB;
34365}
34366
34367MachineBasicBlock *
34368X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34369 MachineBasicBlock *ThisMBB) const {
34370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34371 const DebugLoc &DL = MI.getDebugLoc();
34372
34373 // To "insert" a SELECT_CC instruction, we actually have to insert the
34374 // diamond control-flow pattern. The incoming instruction knows the
34375 // destination vreg to set, the condition code register to branch on, the
34376 // true/false values to select between and a branch opcode to use.
34377
34378 // ThisMBB:
34379 // ...
34380 // TrueVal = ...
34381 // cmpTY ccX, r1, r2
34382 // bCC copy1MBB
34383 // fallthrough --> FalseMBB
34384
34385 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34386 // as described above, by inserting a BB, and then making a PHI at the join
34387 // point to select the true and false operands of the CMOV in the PHI.
34388 //
34389 // The code also handles two different cases of multiple CMOV opcodes
34390 // in a row.
34391 //
34392 // Case 1:
34393 // In this case, there are multiple CMOVs in a row, all which are based on
34394 // the same condition setting (or the exact opposite condition setting).
34395 // In this case we can lower all the CMOVs using a single inserted BB, and
34396 // then make a number of PHIs at the join point to model the CMOVs. The only
34397 // trickiness here, is that in a case like:
34398 //
34399 // t2 = CMOV cond1 t1, f1
34400 // t3 = CMOV cond1 t2, f2
34401 //
34402 // when rewriting this into PHIs, we have to perform some renaming on the
34403 // temps since you cannot have a PHI operand refer to a PHI result earlier
34404 // in the same block. The "simple" but wrong lowering would be:
34405 //
34406 // t2 = PHI t1(BB1), f1(BB2)
34407 // t3 = PHI t2(BB1), f2(BB2)
34408 //
34409 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34410 // renaming is to note that on the path through BB1, t2 is really just a
34411 // copy of t1, and do that renaming, properly generating:
34412 //
34413 // t2 = PHI t1(BB1), f1(BB2)
34414 // t3 = PHI t1(BB1), f2(BB2)
34415 //
34416 // Case 2:
34417 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34418 // function - EmitLoweredCascadedSelect.
34419
34420 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34421 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34422 MachineInstr *LastCMOV = &MI;
34423 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
34424
34425 // Check for case 1, where there are multiple CMOVs with the same condition
34426 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34427 // number of jumps the most.
34428
34429 if (isCMOVPseudo(MI)) {
34430 // See if we have a string of CMOVS with the same condition. Skip over
34431 // intervening debug insts.
34432 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34433 (NextMIIt->getOperand(3).getImm() == CC ||
34434 NextMIIt->getOperand(3).getImm() == OppCC)) {
34435 LastCMOV = &*NextMIIt;
34436 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34437 }
34438 }
34439
34440 // Check for case 2, but only if we didn't already find case 1, as
34441 // indicated by LastCMOV == MI.
34442 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34443 NextMIIt->getOpcode() == MI.getOpcode() &&
34444 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34445 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34446 NextMIIt->getOperand(1).isKill()) {
34447 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34448 }
34449
34450 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34451 MachineFunction *F = ThisMBB->getParent();
34452 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34453 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34454
34455 MachineFunction::iterator It = ++ThisMBB->getIterator();
34456 F->insert(It, FalseMBB);
34457 F->insert(It, SinkMBB);
34458
34459 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34460 // live into the sink and copy blocks.
34461 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34462 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
34463 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34464 FalseMBB->addLiveIn(X86::EFLAGS);
34465 SinkMBB->addLiveIn(X86::EFLAGS);
34466 }
34467
34468 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
34469 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
34470 MachineBasicBlock::iterator(LastCMOV));
34471 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
34472 if (MI.isDebugInstr())
34473 SinkMBB->push_back(MI.removeFromParent());
34474
34475 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34476 SinkMBB->splice(SinkMBB->end(), ThisMBB,
34477 std::next(MachineBasicBlock::iterator(LastCMOV)),
34478 ThisMBB->end());
34479 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34480
34481 // Fallthrough block for ThisMBB.
34482 ThisMBB->addSuccessor(FalseMBB);
34483 // The true block target of the first (or only) branch is always SinkMBB.
34484 ThisMBB->addSuccessor(SinkMBB);
34485 // Fallthrough block for FalseMBB.
34486 FalseMBB->addSuccessor(SinkMBB);
34487
34488 // Create the conditional branch instruction.
34489 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
34490
34491 // SinkMBB:
34492 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
34493 // ...
34494 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
34495 MachineBasicBlock::iterator MIItEnd =
34496 std::next(MachineBasicBlock::iterator(LastCMOV));
34497 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
34498
34499 // Now remove the CMOV(s).
34500 ThisMBB->erase(MIItBegin, MIItEnd);
34501
34502 return SinkMBB;
34503}
34504
34505static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
34506 if (IsLP64) {
34507 if (isInt<8>(Imm))
34508 return X86::SUB64ri8;
34509 return X86::SUB64ri32;
34510 } else {
34511 if (isInt<8>(Imm))
34512 return X86::SUB32ri8;
34513 return X86::SUB32ri;
34514 }
34515}
34516
34517MachineBasicBlock *
34518X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
34519 MachineBasicBlock *MBB) const {
34520 MachineFunction *MF = MBB->getParent();
34521 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34522 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
34523 const DebugLoc &DL = MI.getDebugLoc();
34524 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34525
34526 const unsigned ProbeSize = getStackProbeSize(*MF);
34527
34528 MachineRegisterInfo &MRI = MF->getRegInfo();
34529 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34530 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34531 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34532
34533 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34534 MF->insert(MBBIter, testMBB);
34535 MF->insert(MBBIter, blockMBB);
34536 MF->insert(MBBIter, tailMBB);
34537
34538 Register sizeVReg = MI.getOperand(1).getReg();
34539
34540 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
34541
34542 Register TmpStackPtr = MRI.createVirtualRegister(
34543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34544 Register FinalStackPtr = MRI.createVirtualRegister(
34545 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34546
34547 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
34548 .addReg(physSPReg);
34549 {
34550 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
34551 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
34552 .addReg(TmpStackPtr)
34553 .addReg(sizeVReg);
34554 }
34555
34556 // test rsp size
34557
34558 BuildMI(testMBB, DL,
34559 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
34560 .addReg(FinalStackPtr)
34561 .addReg(physSPReg);
34562
34563 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
34564 .addMBB(tailMBB)
34565 .addImm(X86::COND_GE);
34566 testMBB->addSuccessor(blockMBB);
34567 testMBB->addSuccessor(tailMBB);
34568
34569 // Touch the block, then extend it. This is the opposite order from a
34570 // static probe, where we allocate then touch; it avoids having to probe
34571 // the tail of the static alloca. Possible scenarios are:
34572 //
34573 //       + ---- <- ------------ <- ------------- <- ------------ +
34574 //       |                                                       |
34575 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
34576 //                                                               |                                                               |
34577 //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
34578 //
34579 // The property we want to enforce is to never have more than [page alloc] between two probes.
34580
34581 const unsigned XORMIOpc =
34582 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
34583 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
34584 .addImm(0);
34585
34586 BuildMI(blockMBB, DL,
34587 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
34588 .addReg(physSPReg)
34589 .addImm(ProbeSize);
34590
34591
34592 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
34593 blockMBB->addSuccessor(testMBB);
34594
34595 // Replace original instruction by the expected stack ptr
34596 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
34597 .addReg(FinalStackPtr);
34598
34599 tailMBB->splice(tailMBB->end(), MBB,
34600 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34601 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
34602 MBB->addSuccessor(testMBB);
34603
34604 // Delete the original pseudo instruction.
34605 MI.eraseFromParent();
34606
34607 // And we're done.
34608 return tailMBB;
34609}
34610
34611MachineBasicBlock *
34612X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
34613 MachineBasicBlock *BB) const {
34614 MachineFunction *MF = BB->getParent();
34615 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34616 const DebugLoc &DL = MI.getDebugLoc();
34617 const BasicBlock *LLVM_BB = BB->getBasicBlock();
34618
34619 assert(MF->shouldSplitStack());
34620
34621 const bool Is64Bit = Subtarget.is64Bit();
34622 const bool IsLP64 = Subtarget.isTarget64BitLP64();
34623
34624 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
34625 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
34626
34627 // BB:
34628 // ... [Till the alloca]
34629 // If stacklet is not large enough, jump to mallocMBB
34630 //
34631 // bumpMBB:
34632 // Allocate by subtracting from RSP
34633 // Jump to continueMBB
34634 //
34635 // mallocMBB:
34636 // Allocate by call to runtime
34637 //
34638 // continueMBB:
34639 // ...
34640 // [rest of original BB]
34641 //
34642
34643 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34644 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34645 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34646
34647 MachineRegisterInfo &MRI = MF->getRegInfo();
34648 const TargetRegisterClass *AddrRegClass =
34649 getRegClassFor(getPointerTy(MF->getDataLayout()));
34650
34651 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34652 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34653 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
34654 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
34655 sizeVReg = MI.getOperand(1).getReg(),
34656 physSPReg =
34657 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
34658
34659 MachineFunction::iterator MBBIter = ++BB->getIterator();
34660
34661 MF->insert(MBBIter, bumpMBB);
34662 MF->insert(MBBIter, mallocMBB);
34663 MF->insert(MBBIter, continueMBB);
34664
34665 continueMBB->splice(continueMBB->begin(), BB,
34666 std::next(MachineBasicBlock::iterator(MI)), BB->end());
34667 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
34668
34669 // Add code to the main basic block to check if the stack limit has been hit,
34670 // and if so, jump to mallocMBB otherwise to bumpMBB.
34671 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
34672 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
34673 .addReg(tmpSPVReg).addReg(sizeVReg);
34674 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
34675 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
34676 .addReg(SPLimitVReg);
34677 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
34678
34679 // bumpMBB simply decreases the stack pointer, since we know the current
34680 // stacklet has enough space.
34681 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
34682 .addReg(SPLimitVReg);
34683 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
34684 .addReg(SPLimitVReg);
34685 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34686
34687 // Calls into a routine in libgcc to allocate more space from the heap.
34688 const uint32_t *RegMask =
34689 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
34690 if (IsLP64) {
34691 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
34692 .addReg(sizeVReg);
34693 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34694 .addExternalSymbol("__morestack_allocate_stack_space")
34695 .addRegMask(RegMask)
34696 .addReg(X86::RDI, RegState::Implicit)
34697 .addReg(X86::RAX, RegState::ImplicitDefine);
34698 } else if (Is64Bit) {
34699 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
34700 .addReg(sizeVReg);
34701 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34702 .addExternalSymbol("__morestack_allocate_stack_space")
34703 .addRegMask(RegMask)
34704 .addReg(X86::EDI, RegState::Implicit)
34705 .addReg(X86::EAX, RegState::ImplicitDefine);
34706 } else {
34707 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
34708 .addImm(12);
34709 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
34710 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
34711 .addExternalSymbol("__morestack_allocate_stack_space")
34712 .addRegMask(RegMask)
34713 .addReg(X86::EAX, RegState::ImplicitDefine);
34714 }
34715
34716 if (!Is64Bit)
34717 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
34718 .addImm(16);
34719
34720 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
34721 .addReg(IsLP64 ? X86::RAX : X86::EAX);
34722 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34723
34724 // Set up the CFG correctly.
34725 BB->addSuccessor(bumpMBB);
34726 BB->addSuccessor(mallocMBB);
34727 mallocMBB->addSuccessor(continueMBB);
34728 bumpMBB->addSuccessor(continueMBB);
34729
34730 // Take care of the PHI nodes.
34731 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
34732 MI.getOperand(0).getReg())
34733 .addReg(mallocPtrVReg)
34734 .addMBB(mallocMBB)
34735 .addReg(bumpSPPtrVReg)
34736 .addMBB(bumpMBB);
34737
34738 // Delete the original pseudo instruction.
34739 MI.eraseFromParent();
34740
34741 // And we're done.
34742 return continueMBB;
34743}
34744
34745MachineBasicBlock *
34746X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
34747 MachineBasicBlock *BB) const {
34748 MachineFunction *MF = BB->getParent();
34749 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34750 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
34751 const DebugLoc &DL = MI.getDebugLoc();
34752
34753 assert(!isAsynchronousEHPersonality(
34754 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
34755 "SEH does not use catchret!");
34756
34757 // Only 32-bit EH needs to worry about manually restoring stack pointers.
34758 if (!Subtarget.is32Bit())
34759 return BB;
34760
34761 // C++ EH creates a new target block to hold the restore code, and wires up
34762 // the new block to the return destination with a normal JMP_4.
34763 MachineBasicBlock *RestoreMBB =
34764 MF->CreateMachineBasicBlock(BB->getBasicBlock());
34765 assert(BB->succ_size() == 1);
34766 MF->insert(std::next(BB->getIterator()), RestoreMBB);
34767 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
34768 BB->addSuccessor(RestoreMBB);
34769 MI.getOperand(0).setMBB(RestoreMBB);
34770
34771 // Marking this as an EH pad but not a funclet entry block causes PEI to
34772 // restore stack pointers in the block.
34773 RestoreMBB->setIsEHPad(true);
34774
34775 auto RestoreMBBI = RestoreMBB->begin();
34776 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
34777 return BB;
34778}
34779
34780MachineBasicBlock *
34781X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
34782 MachineBasicBlock *BB) const {
34783 // So, here we replace TLSADDR with the sequence:
34784 // adjust_stackdown -> TLSADDR -> adjust_stackup.
34785 // We need this because TLSADDR is lowered into calls
34786 // inside MC; without the two markers, shrink-wrapping
34787 // may push the prologue/epilogue past them.
34788 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34789 const DebugLoc &DL = MI.getDebugLoc();
34790 MachineFunction &MF = *BB->getParent();
34791
34792 // Emit CALLSEQ_START right before the instruction.
34793 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
34794 MachineInstrBuilder CallseqStart =
34795 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
34796 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
34797
34798 // Emit CALLSEQ_END right after the instruction.
34799 // We don't call erase from parent because we want to keep the
34800 // original instruction around.
34801 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
34802 MachineInstrBuilder CallseqEnd =
34803 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
34804 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
34805
34806 return BB;
34807}
34808
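As a hedged illustration of where this path is exercised: accesses to thread-local variables under a dynamic TLS model are lowered through the TLSADDR-style pseudos wrapped above. The exact TLS model depends on the relocation model and visibility; this snippet is an assumption for illustration only:

  // Illustrative only: a PIC access to a thread_local object is the typical
  // source of the TLS_addr* pseudos that get the CALLSEQ markers above.
  thread_local int Counter = 0;
  int bumpCounter() {
    return ++Counter; // may lower to TLSADDR -> call to __tls_get_addr on ELF
  }
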
34809MachineBasicBlock *
34810X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
34811 MachineBasicBlock *BB) const {
34812 // This is pretty easy. We're taking the value that we received from
34813 // our load from the relocation, sticking it in either RDI (x86-64)
34814 // or EAX and doing an indirect call. The return value will then
34815 // be in the normal return register.
34816 MachineFunction *F = BB->getParent();
34817 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34818 const DebugLoc &DL = MI.getDebugLoc();
34819
34820 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
34821 assert(MI.getOperand(3).isGlobal() && "This should be a global");
34822
34823 // Get a register mask for the lowered call.
34824 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
34825 // proper register mask.
34826 const uint32_t *RegMask =
34827 Subtarget.is64Bit() ?
34828 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
34829 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
34830 if (Subtarget.is64Bit()) {
34831 MachineInstrBuilder MIB =
34832 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
34833 .addReg(X86::RIP)
34834 .addImm(0)
34835 .addReg(0)
34836 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34837 MI.getOperand(3).getTargetFlags())
34838 .addReg(0);
34839 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
34840 addDirectMem(MIB, X86::RDI);
34841 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
34842 } else if (!isPositionIndependent()) {
34843 MachineInstrBuilder MIB =
34844 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34845 .addReg(0)
34846 .addImm(0)
34847 .addReg(0)
34848 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34849 MI.getOperand(3).getTargetFlags())
34850 .addReg(0);
34851 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34852 addDirectMem(MIB, X86::EAX);
34853 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34854 } else {
34855 MachineInstrBuilder MIB =
34856 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34857 .addReg(TII->getGlobalBaseReg(F))
34858 .addImm(0)
34859 .addReg(0)
34860 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34861 MI.getOperand(3).getTargetFlags())
34862 .addReg(0);
34863 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34864 addDirectMem(MIB, X86::EAX);
34865 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34866 }
34867
34868 MI.eraseFromParent(); // The pseudo instruction is gone now.
34869 return BB;
34870}
34871
34872static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
34873 switch (RPOpc) {
34874 case X86::INDIRECT_THUNK_CALL32:
34875 return X86::CALLpcrel32;
34876 case X86::INDIRECT_THUNK_CALL64:
34877 return X86::CALL64pcrel32;
34878 case X86::INDIRECT_THUNK_TCRETURN32:
34879 return X86::TCRETURNdi;
34880 case X86::INDIRECT_THUNK_TCRETURN64:
34881 return X86::TCRETURNdi64;
34882 }
34883 llvm_unreachable("not indirect thunk opcode");
34884}
34885
34886static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34887 unsigned Reg) {
34888 if (Subtarget.useRetpolineExternalThunk()) {
34889 // When using an external thunk for retpolines, we pick names that match the
34890 // names GCC happens to use as well. This helps simplify the implementation
34891 // of the thunks for kernels where they have no easy ability to create
34892 // aliases and are doing non-trivial configuration of the thunk's body. For
34893 // example, the Linux kernel will do boot-time hot patching of the thunk
34894 // bodies and cannot easily export aliases of these to loaded modules.
34895 //
34896 // Note that at any point in the future, we may need to change the semantics
34897 // of how we implement retpolines and at that time will likely change the
34898 // name of the called thunk. Essentially, there is no hard guarantee that
34899 // LLVM will generate calls to specific thunks; we merely make a best-effort
34900 // attempt to help out kernels and other systems where duplicating the
34901 // thunks is costly.
34902 switch (Reg) {
34903 case X86::EAX:
34904 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34905 return "__x86_indirect_thunk_eax";
34906 case X86::ECX:
34907 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34908 return "__x86_indirect_thunk_ecx";
34909 case X86::EDX:
34910 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34911 return "__x86_indirect_thunk_edx";
34912 case X86::EDI:
34913 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34914 return "__x86_indirect_thunk_edi";
34915 case X86::R11:
34916 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34917 return "__x86_indirect_thunk_r11";
34918 }
34919 llvm_unreachable("unexpected reg for external indirect thunk");
34920 }
34921
34922 if (Subtarget.useRetpolineIndirectCalls() ||
34923 Subtarget.useRetpolineIndirectBranches()) {
34924 // When targeting an internal COMDAT thunk use an LLVM-specific name.
34925 switch (Reg) {
34926 case X86::EAX:
34927 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34928 return "__llvm_retpoline_eax";
34929 case X86::ECX:
34930 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34931 return "__llvm_retpoline_ecx";
34932 case X86::EDX:
34933 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34934 return "__llvm_retpoline_edx";
34935 case X86::EDI:
34936 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34937 return "__llvm_retpoline_edi";
34938 case X86::R11:
34939 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34940 return "__llvm_retpoline_r11";
34941 }
34942 llvm_unreachable("unexpected reg for retpoline");
34943 }
34944
34945 if (Subtarget.useLVIControlFlowIntegrity()) {
34946 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34947 return "__llvm_lvi_thunk_r11";
34948 }
34949 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
34950}
34951
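For orientation, a hedged example of code that ends up calling one of the thunk symbols returned above when indirect-branch hardening is enabled; the specific flags (e.g. -mretpoline) and the chosen scratch register are assumptions, not guarantees from this listing:

  // Illustrative only: with retpoline-style mitigations enabled, this indirect
  // call is retargeted to a thunk such as __llvm_retpoline_r11 (64-bit) or one
  // of the __llvm_retpoline_e?x variants (32-bit) named above.
  int dispatch(int (*Handler)(int), int Arg) {
    return Handler(Arg); // indirect call routed through the thunk
  }
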
34952MachineBasicBlock *
34953X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
34954 MachineBasicBlock *BB) const {
34955 // Copy the virtual register into the R11 physical register and
34956 // call the retpoline thunk.
34957 const DebugLoc &DL = MI.getDebugLoc();
34958 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34959 Register CalleeVReg = MI.getOperand(0).getReg();
34960 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
34961
34962 // Find an available scratch register to hold the callee. On 64-bit, we can
34963 // just use R11, but we scan for uses anyway to ensure we don't generate
34964 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
34965 // already a register use operand to the call to hold the callee. If none
34966 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
34967 // register and ESI is the base pointer to realigned stack frames with VLAs.
34968 SmallVector<unsigned, 3> AvailableRegs;
34969 if (Subtarget.is64Bit())
34970 AvailableRegs.push_back(X86::R11);
34971 else
34972 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
34973
34974 // Zero out any registers that are already used.
34975 for (const auto &MO : MI.operands()) {
34976 if (MO.isReg() && MO.isUse())
34977 for (unsigned &Reg : AvailableRegs)
34978 if (Reg == MO.getReg())
34979 Reg = 0;
34980 }
34981
34982 // Choose the first remaining non-zero available register.
34983 unsigned AvailableReg = 0;
34984 for (unsigned MaybeReg : AvailableRegs) {
34985 if (MaybeReg) {
34986 AvailableReg = MaybeReg;
34987 break;
34988 }
34989 }
34990 if (!AvailableReg)
34991 report_fatal_error("calling convention incompatible with retpoline, no "
34992 "available registers");
34993
34994 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
34995
34996 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
34997 .addReg(CalleeVReg);
34998 MI.getOperand(0).ChangeToES(Symbol);
34999 MI.setDesc(TII->get(Opc));
35000 MachineInstrBuilder(*BB->getParent(), &MI)
35001 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35002 return BB;
35003}
35004
35005/// SetJmp implies future control flow change upon calling the corresponding
35006/// LongJmp.
35007/// Instead of using the 'return' instruction, the long jump fixes the stack and
35008/// performs an indirect branch. To do so it uses the registers that were stored
35009/// in the jump buffer (when calling SetJmp).
35010/// In case the shadow stack is enabled we need to fix it as well, because some
35011/// return addresses will be skipped.
35012/// The function will save the SSP for future fixing in the function
35013/// emitLongJmpShadowStackFix.
35014/// \sa emitLongJmpShadowStackFix
35015/// \param [in] MI The temporary Machine Instruction for the builtin.
35016/// \param [in] MBB The Machine Basic Block that will be modified.
35017void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35018 MachineBasicBlock *MBB) const {
35019 const DebugLoc &DL = MI.getDebugLoc();
35020 MachineFunction *MF = MBB->getParent();
35021 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35022 MachineRegisterInfo &MRI = MF->getRegInfo();
35023 MachineInstrBuilder MIB;
35024
35025 // Memory Reference.
35026 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35027 MI.memoperands_end());
35028
35029 // Initialize a register with zero.
35030 MVT PVT = getPointerTy(MF->getDataLayout());
35031 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35032 Register ZReg = MRI.createVirtualRegister(PtrRC);
35033 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35034 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
35035 .addDef(ZReg)
35036 .addReg(ZReg, RegState::Undef)
35037 .addReg(ZReg, RegState::Undef);
35038
35039 // Read the current SSP Register value to the zeroed register.
35040 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35041 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35042 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35043
35044 // Write the SSP register value to the third pointer-sized slot of the input memory buffer.
35045 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35046 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
35047 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35048 const unsigned MemOpndSlot = 1;
35049 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35050 if (i == X86::AddrDisp)
35051 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35052 else
35053 MIB.add(MI.getOperand(MemOpndSlot + i));
35054 }
35055 MIB.addReg(SSPCopyReg);
35056 MIB.setMemRefs(MMOs);
35057}
35058
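To make the buffer offsets used above and in the longjmp lowering below easier to follow, here is a sketch of the implied jump-buffer layout; the struct and its field names are purely illustrative, not a definition from this file:

  // Illustrative layout of the builtin setjmp buffer as these lowerings use it.
  // Each slot is PVT.getStoreSize() bytes wide (4 on i386, 8 on x86-64); the
  // builtin buffer has five such slots, the last of which is not touched here.
  struct SjLjBufferSketch {
    void *FramePtr;       // slot 0: reloaded into FP by emitEHSjLjLongJmp
    void *RestoreLabel;   // slot 1 (LabelOffset): address of restoreMBB
    void *StackPtr;       // slot 2 (SPOffset): reloaded into SP
    void *ShadowStackPtr; // slot 3 (SSPOffset): written here, fixed on longjmp
  };
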
35059MachineBasicBlock *
35060X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35061 MachineBasicBlock *MBB) const {
35062 const DebugLoc &DL = MI.getDebugLoc();
35063 MachineFunction *MF = MBB->getParent();
35064 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35065 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35066 MachineRegisterInfo &MRI = MF->getRegInfo();
35067
35068 const BasicBlock *BB = MBB->getBasicBlock();
35069 MachineFunction::iterator I = ++MBB->getIterator();
35070
35071 // Memory Reference
35072 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35073 MI.memoperands_end());
35074
35075 unsigned DstReg;
35076 unsigned MemOpndSlot = 0;
35077
35078 unsigned CurOp = 0;
35079
35080 DstReg = MI.getOperand(CurOp++).getReg();
35081 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35082 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35083 (void)TRI;
35084 Register mainDstReg = MRI.createVirtualRegister(RC);
35085 Register restoreDstReg = MRI.createVirtualRegister(RC);
35086
35087 MemOpndSlot = CurOp;
35088
35089 MVT PVT = getPointerTy(MF->getDataLayout());
35090 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35091 "Invalid Pointer Size!");
35092
35093 // For v = setjmp(buf), we generate
35094 //
35095 // thisMBB:
35096 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35097 // SjLjSetup restoreMBB
35098 //
35099 // mainMBB:
35100 // v_main = 0
35101 //
35102 // sinkMBB:
35103 // v = phi(main, restore)
35104 //
35105 // restoreMBB:
35106 // if base pointer being used, load it from frame
35107 // v_restore = 1
35108
35109 MachineBasicBlock *thisMBB = MBB;
35110 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35111 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35112 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35113 MF->insert(I, mainMBB);
35114 MF->insert(I, sinkMBB);
35115 MF->push_back(restoreMBB);
35116 restoreMBB->setHasAddressTaken();
35117
35118 MachineInstrBuilder MIB;
35119
35120 // Transfer the remainder of BB and its successor edges to sinkMBB.
35121 sinkMBB->splice(sinkMBB->begin(), MBB,
35122 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35123 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35124
35125 // thisMBB:
35126 unsigned PtrStoreOpc = 0;
35127 unsigned LabelReg = 0;
35128 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35129 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35130 !isPositionIndependent();
35131
35132 // Prepare IP either in reg or imm.
35133 if (!UseImmLabel) {
35134 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35135 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35136 LabelReg = MRI.createVirtualRegister(PtrRC);
35137 if (Subtarget.is64Bit()) {
35138 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
35139 .addReg(X86::RIP)
35140 .addImm(0)
35141 .addReg(0)
35142 .addMBB(restoreMBB)
35143 .addReg(0);
35144 } else {
35145 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35146 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
35147 .addReg(XII->getGlobalBaseReg(MF))
35148 .addImm(0)
35149 .addReg(0)
35150 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35151 .addReg(0);
35152 }
35153 } else
35154 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35155 // Store IP
35156 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
35157 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35158 if (i == X86::AddrDisp)
35159 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35160 else
35161 MIB.add(MI.getOperand(MemOpndSlot + i));
35162 }
35163 if (!UseImmLabel)
35164 MIB.addReg(LabelReg);
35165 else
35166 MIB.addMBB(restoreMBB);
35167 MIB.setMemRefs(MMOs);
35168
35169 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35170 emitSetJmpShadowStackFix(MI, thisMBB);
35171 }
35172
35173 // Setup
35174 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
35175 .addMBB(restoreMBB);
35176
35177 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35178 MIB.addRegMask(RegInfo->getNoPreservedMask());
35179 thisMBB->addSuccessor(mainMBB);
35180 thisMBB->addSuccessor(restoreMBB);
35181
35182 // mainMBB:
35183 // EAX = 0
35184 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
35185 mainMBB->addSuccessor(sinkMBB);
35186
35187 // sinkMBB:
35188 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
35189 TII->get(X86::PHI), DstReg)
35190 .addReg(mainDstReg).addMBB(mainMBB)
35191 .addReg(restoreDstReg).addMBB(restoreMBB);
35192
35193 // restoreMBB:
35194 if (RegInfo->hasBasePointer(*MF)) {
35195 const bool Uses64BitFramePtr =
35196 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35197 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35198 X86FI->setRestoreBasePointer(MF);
35199 Register FramePtr = RegInfo->getFrameRegister(*MF);
35200 Register BasePtr = RegInfo->getBaseRegister();
35201 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35202 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
35203 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35204 .setMIFlag(MachineInstr::FrameSetup);
35205 }
35206 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35207 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35208 restoreMBB->addSuccessor(sinkMBB);
35209
35210 MI.eraseFromParent();
35211 return sinkMBB;
35212}
35213
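A hedged source-level example of what drives this lowering: the GCC-style setjmp/longjmp builtins map onto the EH_SjLj pseudos expanded here (whether a given front end emits them this way in all configurations is an assumption for illustration):

  // Illustrative only: __builtin_setjmp/__builtin_longjmp correspond to the
  // EH_SjLj_SetJmp/EH_SjLj_LongJmp pseudos handled by these routines.
  void *JmpBuf[5]; // five pointer-sized slots, as sketched earlier
  extern void mayLongJmp(); // somewhere inside, __builtin_longjmp(JmpBuf, 1)
  int tryIt() {
    if (__builtin_setjmp(JmpBuf))
      return 1;    // reached via the restore block after a longjmp
    mayLongJmp();
    return 0;      // reached when no longjmp occurred
  }
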
35214/// Fix the shadow stack using the previously saved SSP pointer.
35215/// \sa emitSetJmpShadowStackFix
35216/// \param [in] MI The temporary Machine Instruction for the builtin.
35217/// \param [in] MBB The Machine Basic Block that will be modified.
35218/// \return The sink MBB that will perform the future indirect branch.
35219MachineBasicBlock *
35220X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35221 MachineBasicBlock *MBB) const {
35222 const DebugLoc &DL = MI.getDebugLoc();
35223 MachineFunction *MF = MBB->getParent();
35224 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35225 MachineRegisterInfo &MRI = MF->getRegInfo();
35226
35227 // Memory Reference
35228 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35229 MI.memoperands_end());
35230
35231 MVT PVT = getPointerTy(MF->getDataLayout());
35232 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35233
35234 // checkSspMBB:
35235 // xor vreg1, vreg1
35236 // rdssp vreg1
35237 // test vreg1, vreg1
35238 // je sinkMBB # Jump if Shadow Stack is not supported
35239 // fallMBB:
35240 // mov buf+24/12(%rip), vreg2
35241 // sub vreg1, vreg2
35242 // jbe sinkMBB # No need to fix the Shadow Stack
35243 // fixShadowMBB:
35244 // shr 3/2, vreg2
35245 // incssp vreg2 # fix the SSP according to the lower 8 bits
35246 // shr 8, vreg2
35247 // je sinkMBB
35248 // fixShadowLoopPrepareMBB:
35249 // shl vreg2
35250 // mov 128, vreg3
35251 // fixShadowLoopMBB:
35252 // incssp vreg3
35253 // dec vreg2
35254 // jne fixShadowLoopMBB # Iterate until you finish fixing
35255 // # the Shadow Stack
35256 // sinkMBB:
35257
35258 MachineFunction::iterator I = ++MBB->getIterator();
35259 const BasicBlock *BB = MBB->getBasicBlock();
35260
35261 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35262 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35263 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35264 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35265 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35266 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35267 MF->insert(I, checkSspMBB);
35268 MF->insert(I, fallMBB);
35269 MF->insert(I, fixShadowMBB);
35270 MF->insert(I, fixShadowLoopPrepareMBB);
35271 MF->insert(I, fixShadowLoopMBB);
35272 MF->insert(I, sinkMBB);
35273
35274 // Transfer the remainder of BB and its successor edges to sinkMBB.
35275 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35276 MBB->end());
35277 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35278
35279 MBB->addSuccessor(checkSspMBB);
35280
35281 // Initialize a register with zero.
35282 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35283 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
35284
35285 if (PVT == MVT::i64) {
35286 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35287 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35288 .addImm(0)
35289 .addReg(ZReg)
35290 .addImm(X86::sub_32bit);
35291 ZReg = TmpZReg;
35292 }
35293
35294 // Read the current SSP Register value to the zeroed register.
35295 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35296 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35297 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35298
35299 // Check whether the SSP register value read is zero and jump directly
35300 // to the sink.
35301 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35302 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
35303 .addReg(SSPCopyReg)
35304 .addReg(SSPCopyReg);
35305 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35306 checkSspMBB->addSuccessor(sinkMBB);
35307 checkSspMBB->addSuccessor(fallMBB);
35308
35309 // Reload the previously saved SSP register value.
35310 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35311 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35312 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35313 MachineInstrBuilder MIB =
35314 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
35315 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35316 const MachineOperand &MO = MI.getOperand(i);
35317 if (i == X86::AddrDisp)
35318 MIB.addDisp(MO, SPPOffset);
35319 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35320 // preserve kill flags.
35321 MIB.addReg(MO.getReg());
35322 else
35323 MIB.add(MO);
35324 }
35325 MIB.setMemRefs(MMOs);
35326
35327 // Subtract the current SSP from the previous SSP.
35328 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35329 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35330 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
35331 .addReg(PrevSSPReg)
35332 .addReg(SSPCopyReg);
35333
35334 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35335 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
35336 fallMBB->addSuccessor(sinkMBB);
35337 fallMBB->addSuccessor(fixShadowMBB);
35338
35339 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35340 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35341 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35342 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35343 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
35344 .addReg(SspSubReg)
35345 .addImm(Offset);
35346
35347 // Increase the SSP, looking only at the lower 8 bits of the delta.
35348 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35349 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35350
35351 // Reset the lower 8 bits.
35352 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35353 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
35354 .addReg(SspFirstShrReg)
35355 .addImm(8);
35356
35357 // Jump if the result of the shift is zero.
35358 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35359 fixShadowMBB->addSuccessor(sinkMBB);
35360 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35361
35362 // Do a single shift left.
35363 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
35364 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35365 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
35366 .addReg(SspSecondShrReg);
35367
35368 // Save the value 128 to a register (will be used next with incssp).
35369 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35370 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35371 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
35372 .addImm(128);
35373 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35374
35375 // Since incssp only looks at the lower 8 bits, we might need to do several
35376 // iterations of incssp until we finish fixing the shadow stack.
35377 Register DecReg = MRI.createVirtualRegister(PtrRC);
35378 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35379 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
35380 .addReg(SspAfterShlReg)
35381 .addMBB(fixShadowLoopPrepareMBB)
35382 .addReg(DecReg)
35383 .addMBB(fixShadowLoopMBB);
35384
35385 // Every iteration we increase the SSP by 128.
35386 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
35387
35388 // Every iteration we decrement the counter by 1.
35389 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35390 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
35391
35392 // Jump if the counter is not zero yet.
35393 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
35394 fixShadowLoopMBB->addSuccessor(sinkMBB);
35395 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35396
35397 return sinkMBB;
35398}
35399
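A small worked example of the shadow-stack arithmetic implemented above, assuming a 64-bit target; the concrete delta value is made up for illustration:

  // Illustrative only: fixing a shadow-stack delta of 0x2830 bytes on x86-64.
  // entries   = 0x2830 >> 3 = 0x506   (shr 3; each shadow-stack entry is 8 bytes)
  // incssp    consumes entries & 0xff = 0x06 entries immediately
  // remaining = 0x506 >> 8 = 0x5      (non-zero, so take the fix-up loop)
  // loop      = 0x5 << 1 = 0xA iterations of "incssp 128" = 0x500 entries
  // total     = 0x06 + 0x500 = 0x506 entries popped, matching the delta.
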
35400MachineBasicBlock *
35401X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35402 MachineBasicBlock *MBB) const {
35403 const DebugLoc &DL = MI.getDebugLoc();
35404 MachineFunction *MF = MBB->getParent();
35405 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35406 MachineRegisterInfo &MRI = MF->getRegInfo();
35407
35408 // Memory Reference
35409 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35410 MI.memoperands_end());
35411
35412 MVT PVT = getPointerTy(MF->getDataLayout());
35413 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35414 "Invalid Pointer Size!");
35415
35416 const TargetRegisterClass *RC =
35417 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35418 Register Tmp = MRI.createVirtualRegister(RC);
35419 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35420 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35421 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35422 Register SP = RegInfo->getStackRegister();
35423
35424 MachineInstrBuilder MIB;
35425
35426 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35427 const int64_t SPOffset = 2 * PVT.getStoreSize();
35428
35429 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35430 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
35431
35432 MachineBasicBlock *thisMBB = MBB;
35433
35434 // When the CET shadow stack is enabled, we need to fix the Shadow Stack.
35435 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35436 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35437 }
35438
35439 // Reload FP
35440 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
35441 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35442 const MachineOperand &MO = MI.getOperand(i);
35443 if (MO.isReg()) // Don't add the whole operand, we don't want to
35444 // preserve kill flags.
35445 MIB.addReg(MO.getReg());
35446 else
35447 MIB.add(MO);
35448 }
35449 MIB.setMemRefs(MMOs);
35450
35451 // Reload IP
35452 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
35453 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35454 const MachineOperand &MO = MI.getOperand(i);
35455 if (i == X86::AddrDisp)
35456 MIB.addDisp(MO, LabelOffset);
35457 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35458 // preserve kill flags.
35459 MIB.addReg(MO.getReg());
35460 else
35461 MIB.add(MO);
35462 }
35463 MIB.setMemRefs(MMOs);
35464
35465 // Reload SP
35466 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
35467 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35468 if (i == X86::AddrDisp)
35469 MIB.addDisp(MI.getOperand(i), SPOffset);
35470 else
35471 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
35472 // the last instruction of the expansion.
35473 }
35474 MIB.setMemRefs(MMOs);
35475
35476 // Jump
35477 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
35478
35479 MI.eraseFromParent();
35480 return thisMBB;
35481}
35482
35483void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
35484 MachineBasicBlock *MBB,
35485 MachineBasicBlock *DispatchBB,
35486 int FI) const {
35487 const DebugLoc &DL = MI.getDebugLoc();
35488 MachineFunction *MF = MBB->getParent();
35489 MachineRegisterInfo *MRI = &MF->getRegInfo();
35490 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35491
35492 MVT PVT = getPointerTy(MF->getDataLayout());
35493 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
35494
35495 unsigned Op = 0;
35496 unsigned VR = 0;
35497
35498 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35499 !isPositionIndependent();
35500
35501 if (UseImmLabel) {
35502 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35503 } else {
35504 const TargetRegisterClass *TRC =
35505 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35506 VR = MRI->createVirtualRegister(TRC);
35507 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35508
35509 if (Subtarget.is64Bit())
35510 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
35511 .addReg(X86::RIP)
35512 .addImm(1)
35513 .addReg(0)
35514 .addMBB(DispatchBB)
35515 .addReg(0);
35516 else
35517 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
35518 .addReg(0) /* TII->getGlobalBaseReg(MF) */
35519 .addImm(1)
35520 .addReg(0)
35521 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
35522 .addReg(0);
35523 }
35524
35525 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
35526 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
35527 if (UseImmLabel)
35528 MIB.addMBB(DispatchBB);
35529 else
35530 MIB.addReg(VR);
35531}
35532
35533MachineBasicBlock *
35534X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
35535 MachineBasicBlock *BB) const {
35536 const DebugLoc &DL = MI.getDebugLoc();
35537 MachineFunction *MF = BB->getParent();
35538 MachineRegisterInfo *MRI = &MF->getRegInfo();
35539 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35540 int FI = MF->getFrameInfo().getFunctionContextIndex();
35541
35542 // Get a mapping of the call site numbers to all of the landing pads they're
35543 // associated with.
35544 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
35545 unsigned MaxCSNum = 0;
35546 for (auto &MBB : *MF) {
35547 if (!MBB.isEHPad())
35548 continue;
35549
35550 MCSymbol *Sym = nullptr;
35551 for (const auto &MI : MBB) {
35552 if (MI.isDebugInstr())
35553 continue;
35554
35555 assert(MI.isEHLabel() && "expected EH_LABEL");
35556 Sym = MI.getOperand(0).getMCSymbol();
35557 break;
35558 }
35559
35560 if (!MF->hasCallSiteLandingPad(Sym))
35561 continue;
35562
35563 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
35564 CallSiteNumToLPad[CSI].push_back(&MBB);
35565 MaxCSNum = std::max(MaxCSNum, CSI);
35566 }
35567 }
35568
35569 // Get an ordered list of the machine basic blocks for the jump table.
35570 std::vector<MachineBasicBlock *> LPadList;
35571 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
35572 LPadList.reserve(CallSiteNumToLPad.size());
35573
35574 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
35575 for (auto &LP : CallSiteNumToLPad[CSI]) {
35576 LPadList.push_back(LP);
35577 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
35578 }
35579 }
35580
35581 assert(!LPadList.empty() &&
35582 "No landing pad destinations for the dispatch jump table!");
35583
35584 // Create the MBBs for the dispatch code.
35585
35586 // Shove the dispatch's address into the return slot in the function context.
35587 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
35588 DispatchBB->setIsEHPad(true);
35589
35590 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
35591 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
35592 DispatchBB->addSuccessor(TrapBB);
35593
35594 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
35595 DispatchBB->addSuccessor(DispContBB);
35596
35597 // Insert MBBs.
35598 MF->push_back(DispatchBB);
35599 MF->push_back(DispContBB);
35600 MF->push_back(TrapBB);
35601
35602 // Insert code into the entry block that creates and registers the function
35603 // context.
35604 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
35605
35606 // Create the jump table and associated information
35607 unsigned JTE = getJumpTableEncoding();
35608 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
35609 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
35610
35611 const X86RegisterInfo &RI = TII->getRegisterInfo();
35612 // Add a register mask with no preserved registers. This results in all
35613 // registers being marked as clobbered.
35614 if (RI.hasBasePointer(*MF)) {
35615 const bool FPIs64Bit =
35616 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35617 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
35618 MFI->setRestoreBasePointer(MF);
35619
35620 Register FP = RI.getFrameRegister(*MF);
35621 Register BP = RI.getBaseRegister();
35622 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
35623 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
35624 MFI->getRestoreBasePointerOffset())
35625 .addRegMask(RI.getNoPreservedMask());
35626 } else {
35627 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
35628 .addRegMask(RI.getNoPreservedMask());
35629 }
35630
35631 // IReg is used as an index in a memory operand and therefore can't be SP
35632 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
35633 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
35634 Subtarget.is64Bit() ? 8 : 4);
35635 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
35636 .addReg(IReg)
35637 .addImm(LPadList.size());
35638 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
35639
35640 if (Subtarget.is64Bit()) {
35641 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35642 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
35643
35644 // leaq .LJTI0_0(%rip), BReg
35645 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
35646 .addReg(X86::RIP)
35647 .addImm(1)
35648 .addReg(0)
35649 .addJumpTableIndex(MJTI)
35650 .addReg(0);
35651 // movzx IReg64, IReg
35652 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
35653 .addImm(0)
35654 .addReg(IReg)
35655 .addImm(X86::sub_32bit);
35656
35657 switch (JTE) {
35658 case MachineJumpTableInfo::EK_BlockAddress:
35659 // jmpq *(BReg,IReg64,8)
35660 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
35661 .addReg(BReg)
35662 .addImm(8)
35663 .addReg(IReg64)
35664 .addImm(0)
35665 .addReg(0);
35666 break;
35667 case MachineJumpTableInfo::EK_LabelDifference32: {
35668 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
35669 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
35670 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35671
35672 // movl (BReg,IReg64,4), OReg
35673 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
35674 .addReg(BReg)
35675 .addImm(4)
35676 .addReg(IReg64)
35677 .addImm(0)
35678 .addReg(0);
35679 // movsx OReg64, OReg
35680 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
35681 // addq BReg, OReg64, TReg
35682 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
35683 .addReg(OReg64)
35684 .addReg(BReg);
35685 // jmpq *TReg
35686 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
35687 break;
35688 }
35689 default:
35690 llvm_unreachable("Unexpected jump table encoding");
35691 }
35692 } else {
35693 // jmpl *.LJTI0_0(,IReg,4)
35694 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
35695 .addReg(0)
35696 .addImm(4)
35697 .addReg(IReg)
35698 .addJumpTableIndex(MJTI)
35699 .addReg(0);
35700 }
35701
35702 // Add the jump table entries as successors to the MBB.
35703 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
35704 for (auto &LP : LPadList)
35705 if (SeenMBBs.insert(LP).second)
35706 DispContBB->addSuccessor(LP);
35707
35708 // N.B. the order the invoke BBs are processed in doesn't matter here.
35709 SmallVector<MachineBasicBlock *, 64> MBBLPads;
35710 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
35711 for (MachineBasicBlock *MBB : InvokeBBs) {
35712 // Remove the landing pad successor from the invoke block and replace it
35713 // with the new dispatch block.
35714 // Keep a copy of Successors since it's modified inside the loop.
35715 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
35716 MBB->succ_rend());
35717 // FIXME: Avoid quadratic complexity.
35718 for (auto MBBS : Successors) {
35719 if (MBBS->isEHPad()) {
35720 MBB->removeSuccessor(MBBS);
35721 MBBLPads.push_back(MBBS);
35722 }
35723 }
35724
35725 MBB->addSuccessor(DispatchBB);
35726
35727 // Find the invoke call and mark all of the callee-saved registers as
35728 // 'implicitly defined' so that they're spilled. This prevents instructions
35729 // from being moved to before the EH block, where they would never be
35730 // executed.
35731 for (auto &II : reverse(*MBB)) {
35732 if (!II.isCall())
35733 continue;
35734
35735 DenseMap<unsigned, bool> DefRegs;
35736 for (auto &MOp : II.operands())
35737 if (MOp.isReg())
35738 DefRegs[MOp.getReg()] = true;
35739
35740 MachineInstrBuilder MIB(*MF, &II);
35741 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
35742 unsigned Reg = SavedRegs[RegIdx];
35743 if (!DefRegs[Reg])
35744 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
35745 }
35746
35747 break;
35748 }
35749 }
35750
35751 // Mark all former landing pads as non-landing pads. The dispatch is the only
35752 // landing pad now.
35753 for (auto &LP : MBBLPads)
35754 LP->setIsEHPad(false);
35755
35756 // The instruction is gone now.
35757 MI.eraseFromParent();
35758 return BB;
35759}
35760
35761MachineBasicBlock *
35762X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
35763 MachineBasicBlock *BB) const {
35764 MachineFunction *MF = BB->getParent();
35765 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35766 const DebugLoc &DL = MI.getDebugLoc();
35767
35768 auto TMMImmToTMMReg = [](unsigned Imm) {
35769 assert (Imm < 8 && "Illegal tmm index");
35770 return X86::TMM0 + Imm;
35771 };
35772 switch (MI.getOpcode()) {
35773 default: llvm_unreachable("Unexpected instr type to insert");
35774 case X86::TLS_addr32:
35775 case X86::TLS_addr64:
35776 case X86::TLS_addrX32:
35777 case X86::TLS_base_addr32:
35778 case X86::TLS_base_addr64:
35779 case X86::TLS_base_addrX32:
35780 return EmitLoweredTLSAddr(MI, BB);
35781 case X86::INDIRECT_THUNK_CALL32:
35782 case X86::INDIRECT_THUNK_CALL64:
35783 case X86::INDIRECT_THUNK_TCRETURN32:
35784 case X86::INDIRECT_THUNK_TCRETURN64:
35785 return EmitLoweredIndirectThunk(MI, BB);
35786 case X86::CATCHRET:
35787 return EmitLoweredCatchRet(MI, BB);
35788 case X86::SEG_ALLOCA_32:
35789 case X86::SEG_ALLOCA_64:
35790 return EmitLoweredSegAlloca(MI, BB);
35791 case X86::PROBED_ALLOCA_32:
35792 case X86::PROBED_ALLOCA_64:
35793 return EmitLoweredProbedAlloca(MI, BB);
35794 case X86::TLSCall_32:
35795 case X86::TLSCall_64:
35796 return EmitLoweredTLSCall(MI, BB);
35797 case X86::CMOV_FR32:
35798 case X86::CMOV_FR32X:
35799 case X86::CMOV_FR64:
35800 case X86::CMOV_FR64X:
35801 case X86::CMOV_GR8:
35802 case X86::CMOV_GR16:
35803 case X86::CMOV_GR32:
35804 case X86::CMOV_RFP32:
35805 case X86::CMOV_RFP64:
35806 case X86::CMOV_RFP80:
35807 case X86::CMOV_VR64:
35808 case X86::CMOV_VR128:
35809 case X86::CMOV_VR128X:
35810 case X86::CMOV_VR256:
35811 case X86::CMOV_VR256X:
35812 case X86::CMOV_VR512:
35813 case X86::CMOV_VK1:
35814 case X86::CMOV_VK2:
35815 case X86::CMOV_VK4:
35816 case X86::CMOV_VK8:
35817 case X86::CMOV_VK16:
35818 case X86::CMOV_VK32:
35819 case X86::CMOV_VK64:
35820 return EmitLoweredSelect(MI, BB);
35821
35822 case X86::RDFLAGS32:
35823 case X86::RDFLAGS64: {
35824 unsigned PushF =
35825 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
35826 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
35827 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
35828 // Permit reads of the EFLAGS and DF registers without them being defined.
35829 // This intrinsic exists to read external processor state in flags, such as
35830 // the trap flag, interrupt flag, and direction flag, none of which are
35831 // modeled by the backend.
35832 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
35833 "Unexpected register in operand!");
35834 Push->getOperand(2).setIsUndef();
35835 assert(Push->getOperand(3).getReg() == X86::DF &&
35836 "Unexpected register in operand!");
35837 Push->getOperand(3).setIsUndef();
35838 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
35839
35840 MI.eraseFromParent(); // The pseudo is gone now.
35841 return BB;
35842 }
35843
35844 case X86::WRFLAGS32:
35845 case X86::WRFLAGS64: {
35846 unsigned Push =
35847 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
35848 unsigned PopF =
35849 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
35850 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
35851 BuildMI(*BB, MI, DL, TII->get(PopF));
35852
35853 MI.eraseFromParent(); // The pseudo is gone now.
35854 return BB;
35855 }
35856
35857 case X86::FP32_TO_INT16_IN_MEM:
35858 case X86::FP32_TO_INT32_IN_MEM:
35859 case X86::FP32_TO_INT64_IN_MEM:
35860 case X86::FP64_TO_INT16_IN_MEM:
35861 case X86::FP64_TO_INT32_IN_MEM:
35862 case X86::FP64_TO_INT64_IN_MEM:
35863 case X86::FP80_TO_INT16_IN_MEM:
35864 case X86::FP80_TO_INT32_IN_MEM:
35865 case X86::FP80_TO_INT64_IN_MEM: {
35866 // Change the floating point control register to use "round towards zero"
35867 // mode when truncating to an integer value.
35868 int OrigCWFrameIdx =
35869 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35870 addFrameReference(BuildMI(*BB, MI, DL,
35871 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
35872
35873 // Load the old value of the control word...
35874 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35875 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
35876 OrigCWFrameIdx);
35877
35878 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
35879 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35880 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
35881 .addReg(OldCW, RegState::Kill).addImm(0xC00);
35882
35883 // Extract to 16 bits.
35884 Register NewCW16 =
35885 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35886 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
35887 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35888
35889 // Prepare memory for FLDCW.
35890 int NewCWFrameIdx =
35891 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35892 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
35893 NewCWFrameIdx)
35894 .addReg(NewCW16, RegState::Kill);
35895
35896 // Reload the modified control word now...
35897 addFrameReference(BuildMI(*BB, MI, DL,
35898 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
35899
35900 // Get the X86 opcode to use.
35901 unsigned Opc;
35902 switch (MI.getOpcode()) {
35903 default: llvm_unreachable("illegal opcode!");
35904 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
35905 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
35906 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
35907 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
35908 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
35909 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
35910 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
35911 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
35912 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
35913 }
35914
35915 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35916 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
35917 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
35918
35919 // Reload the original control word now.
35920 addFrameReference(BuildMI(*BB, MI, DL,
35921 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
35922
35923 MI.eraseFromParent(); // The pseudo instruction is gone now.
35924 return BB;
35925 }
35926
35927 // xbegin
35928 case X86::XBEGIN:
35929 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
35930
35931 case X86::VAARG_64:
35932 case X86::VAARG_X32:
35933 return EmitVAARGWithCustomInserter(MI, BB);
35934
35935 case X86::EH_SjLj_SetJmp32:
35936 case X86::EH_SjLj_SetJmp64:
35937 return emitEHSjLjSetJmp(MI, BB);
35938
35939 case X86::EH_SjLj_LongJmp32:
35940 case X86::EH_SjLj_LongJmp64:
35941 return emitEHSjLjLongJmp(MI, BB);
35942
35943 case X86::Int_eh_sjlj_setup_dispatch:
35944 return EmitSjLjDispatchBlock(MI, BB);
35945
35946 case TargetOpcode::STATEPOINT:
35947 // As an implementation detail, STATEPOINT shares the STACKMAP format at
35948 // this point in the process. We diverge later.
35949 return emitPatchPoint(MI, BB);
35950
35951 case TargetOpcode::STACKMAP:
35952 case TargetOpcode::PATCHPOINT:
35953 return emitPatchPoint(MI, BB);
35954
35955 case TargetOpcode::PATCHABLE_EVENT_CALL:
35956 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
35957 return BB;
35958
35959 case X86::LCMPXCHG8B: {
35960 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35961 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
35962 // requires a memory operand. If the current architecture is i686 and the
35963 // current function needs a base pointer - which is ESI on i686 - the
35964 // register allocator would not be able to allocate registers for an address
35965 // of the form X(%reg, %reg, Y): there would never be enough unreserved
35966 // registers during regalloc (without the base pointer, the only option
35967 // would be X(%edi, %esi, Y)).
35968 // We give the register allocator a hand by precomputing the address in
35969 // a new vreg using LEA.
35970
35971 // If it is not i686 or there is no base pointer - nothing to do here.
35972 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
35973 return BB;
35974
35975 // Even though this code does not necessarily need the base pointer to
35976 // be ESI, we check for that. The reason: if this assert fails, something
35977 // has changed in the compiler's base pointer handling, and it most
35978 // probably has to be addressed here as well.
35979 assert(TRI->getBaseRegister() == X86::ESI &&
35980 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
35981 "base pointer in mind");
35982
35983 MachineRegisterInfo &MRI = MF->getRegInfo();
35984 MVT SPTy = getPointerTy(MF->getDataLayout());
35985 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
35986 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
35987
35988 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35989 // Regalloc does not need any help when the memory operand of CMPXCHG8B
35990 // does not use index register.
35991 if (AM.IndexReg == X86::NoRegister)
35992 return BB;
35993
35994 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
35995 // four operand definitions that are E[ABCD] registers. We skip them and
35996 // then insert the LEA.
35997 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
35998 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
35999 RMBBI->definesRegister(X86::EBX) ||
36000 RMBBI->definesRegister(X86::ECX) ||
36001 RMBBI->definesRegister(X86::EDX))) {
36002 ++RMBBI;
36003 }
36004 MachineBasicBlock::iterator MBBI(RMBBI);
36005 addFullAddress(
36006 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
36007
36008 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36009
36010 return BB;
36011 }
36012 case X86::LCMPXCHG16B_NO_RBX: {
36013 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36014 Register BasePtr = TRI->getBaseRegister();
36015 if (TRI->hasBasePointer(*MF) &&
36016 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36017 if (!BB->isLiveIn(BasePtr))
36018 BB->addLiveIn(BasePtr);
36019 // Save RBX into a virtual register.
36020 Register SaveRBX =
36021 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36022 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
36023 .addReg(X86::RBX);
36024 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36025 MachineInstrBuilder MIB =
36026 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36027 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36028 MIB.add(MI.getOperand(Idx));
36029 MIB.add(MI.getOperand(X86::AddrNumOperands));
36030 MIB.addReg(SaveRBX);
36031 } else {
36032 // Simple case, just copy the virtual register to RBX.
36033 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
36034 .add(MI.getOperand(X86::AddrNumOperands));
36035 MachineInstrBuilder MIB =
36036 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
36037 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36038 MIB.add(MI.getOperand(Idx));
36039 }
36040 MI.eraseFromParent();
36041 return BB;
36042 }
36043 case X86::MWAITX: {
36044 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36045 Register BasePtr = TRI->getBaseRegister();
36046 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36047 // If no need to save the base pointer, we generate MWAITXrrr,
36048 // else we generate pseudo MWAITX_SAVE_RBX.
36049 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36050 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
36051 .addReg(MI.getOperand(0).getReg());
36052 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
36053 .addReg(MI.getOperand(1).getReg());
36054 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
36055 .addReg(MI.getOperand(2).getReg());
36056 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
36057 MI.eraseFromParent();
36058 } else {
36059 if (!BB->isLiveIn(BasePtr)) {
36060 BB->addLiveIn(BasePtr);
36061 }
36062 // Parameters can be copied into ECX and EAX but not EBX yet.
36063 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
36064 .addReg(MI.getOperand(0).getReg());
36065 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
36066 .addReg(MI.getOperand(1).getReg());
36067 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36068 // Save RBX into a virtual register.
36069 Register SaveRBX =
36070 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36071 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
36072 .addReg(X86::RBX);
36073 // Generate mwaitx pseudo.
36074 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36075 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
36076 .addDef(Dst) // Destination tied in with SaveRBX.
36077 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36078 .addUse(SaveRBX); // Save of base pointer.
36079 MI.eraseFromParent();
36080 }
36081 return BB;
36082 }
36083 case TargetOpcode::PREALLOCATED_SETUP: {
36084 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36085 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36086 MFI->setHasPreallocatedCall(true);
36087 int64_t PreallocatedId = MI.getOperand(0).getImm();
36088 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36089 assert(StackAdjustment != 0 && "0 stack adjustment");
36090 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36091 << StackAdjustment << "\n");
36092 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
36093 .addReg(X86::ESP)
36094 .addImm(StackAdjustment);
36095 MI.eraseFromParent();
36096 return BB;
36097 }
36098 case TargetOpcode::PREALLOCATED_ARG: {
36099 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36100 int64_t PreallocatedId = MI.getOperand(1).getImm();
36101 int64_t ArgIdx = MI.getOperand(2).getImm();
36102 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36103 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36104 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36105 << ", arg offset " << ArgOffset << "\n");
36106 // stack pointer + offset
36107 addRegOffset(
36108 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
36109 X86::ESP, false, ArgOffset);
36110 MI.eraseFromParent();
36111 return BB;
36112 }
36113 case X86::PTDPBSSD:
36114 case X86::PTDPBSUD:
36115 case X86::PTDPBUSD:
36116 case X86::PTDPBUUD:
36117 case X86::PTDPBF16PS: {
36118 unsigned Opc;
36119 switch (MI.getOpcode()) {
36120 default: llvm_unreachable("illegal opcode!");
36121 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36122 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36123 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36124 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36125 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36126 }
36127
36128 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
36129 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36130 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36131 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36132 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36133
36134 MI.eraseFromParent(); // The pseudo is gone now.
36135 return BB;
36136 }
36137 case X86::PTILEZERO: {
36138 unsigned Imm = MI.getOperand(0).getImm();
36139 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36140 MI.eraseFromParent(); // The pseudo is gone now.
36141 return BB;
36142 }
36143 case X86::PTILELOADD:
36144 case X86::PTILELOADDT1:
36145 case X86::PTILESTORED: {
36146 unsigned Opc;
36147 switch (MI.getOpcode()) {
36148 default: llvm_unreachable("illegal opcode!");
36149 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
36150 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
36151 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
36152 }
36153
36154 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
36155 unsigned CurOp = 0;
36156 if (Opc != X86::TILESTORED)
36157 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36158 RegState::Define);
36159
36160 MIB.add(MI.getOperand(CurOp++)); // base
36161 MIB.add(MI.getOperand(CurOp++)); // scale
36162 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36163 MIB.add(MI.getOperand(CurOp++)); // displacement
36164 MIB.add(MI.getOperand(CurOp++)); // segment
36165
36166 if (Opc == X86::TILESTORED)
36167 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36168 RegState::Undef);
36169
36170 MI.eraseFromParent(); // The pseudo is gone now.
36171 return BB;
36172 }
36173 }
36174}
36175
36176//===----------------------------------------------------------------------===//
36177// X86 Optimization Hooks
36178//===----------------------------------------------------------------------===//
36179
36180bool
36181X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36182 const APInt &DemandedBits,
36183 const APInt &DemandedElts,
36184 TargetLoweringOpt &TLO) const {
36185 EVT VT = Op.getValueType();
36186 unsigned Opcode = Op.getOpcode();
36187 unsigned EltSize = VT.getScalarSizeInBits();
36188
36189 if (VT.isVector()) {
36190 // If the constant is only all sign bits in the active bits, then we should
36191 // extend it to the entire constant to allow it to act as a boolean constant
36192 // vector.
36193 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36194 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36195 return false;
36196 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36197 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36198 continue;
36199 const APInt &Val = V.getConstantOperandAPInt(i);
36200 if (Val.getBitWidth() > Val.getNumSignBits() &&
36201 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36202 return true;
36203 }
36204 return false;
36205 };
36206 // For vectors - if we have a constant, then try to sign extend.
36207 // TODO: Handle AND/ANDN cases.
36208 unsigned ActiveBits = DemandedBits.getActiveBits();
36209 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36210 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
36211 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36212 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36213 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36214 VT.getVectorNumElements());
36215 SDValue NewC =
36216 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36217 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36218 SDValue NewOp =
36219 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36220 return TLO.CombineTo(Op, NewOp);
36221 }
36222 return false;
36223 }
36224
36225 // Only optimize Ands to prevent shrinking a constant that could be
36226 // matched by movzx.
36227 if (Opcode != ISD::AND)
36228 return false;
36229
36230 // Make sure the RHS really is a constant.
36231 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36232 if (!C)
36233 return false;
36234
36235 const APInt &Mask = C->getAPIntValue();
36236
36237 // Clear all non-demanded bits initially.
36238 APInt ShrunkMask = Mask & DemandedBits;
36239
36240 // Find the width of the shrunk mask.
36241 unsigned Width = ShrunkMask.getActiveBits();
36242
36243 // If the mask is all 0s there's nothing to do here.
36244 if (Width == 0)
36245 return false;
36246
36247 // Find the next power of 2 width, rounding up to a byte.
36248 Width = PowerOf2Ceil(std::max(Width, 8U));
36250 // Truncate the width to the element size to handle illegal types.
36250 Width = std::min(Width, EltSize);
36251
36252 // Calculate a possible zero extend mask for this constant.
36253 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
36254
36255 // If we aren't changing the mask, just return true to keep it and prevent
36256 // the caller from optimizing.
36257 if (ZeroExtendMask == Mask)
36258 return true;
36259
36260 // Make sure the new mask can be represented by a combination of mask bits
36261 // and non-demanded bits.
36262 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36263 return false;
36264
36265 // Replace the constant with the zero extend mask.
36266 SDLoc DL(Op);
36267 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36268 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36269 return TLO.CombineTo(Op, NewOp);
36270}
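A concrete example helps here. The sketch below is plain standalone C++ and not part of the LLVM listing; widenAndMask and powerOf2Ceil are made-up names that merely mirror the arithmetic above on a 32-bit scalar. It shows how an AND mask of 0x7F with demanded bits 0x3F is widened to 0xFF (which movzx can then match), while the same mask is left alone when every bit is demanded.

// Illustrative sketch only: the same mask-widening arithmetic as above, on a
// plain 32-bit integer instead of APInt. widenAndMask() and powerOf2Ceil()
// are hypothetical helpers, not LLVM APIs.
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t powerOf2Ceil(uint32_t X) { // behaves like llvm::PowerOf2Ceil
  uint32_t P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

// Returns the widened AND mask, or the original mask if widening is not legal.
static uint32_t widenAndMask(uint32_t Mask, uint32_t DemandedBits) {
  uint32_t Shrunk = Mask & DemandedBits;       // clear non-demanded bits
  if (Shrunk == 0)
    return Mask;                               // nothing to do
  unsigned Width = 32 - __builtin_clz(Shrunk); // active bits (GCC/Clang builtin)
  Width = std::min<unsigned>(powerOf2Ceil(std::max(Width, 8u)), 32u);
  uint64_t ZExtMask = (Width == 32) ? 0xFFFFFFFFull : ((1ull << Width) - 1);
  // The new mask may only differ from the old one in non-demanded positions.
  if ((ZExtMask & ~(uint64_t)(Mask | ~DemandedBits)) != 0)
    return Mask;
  return (uint32_t)ZExtMask;
}

int main() {
  // Only the low 6 bits are demanded, so AND 0x7F can be widened to AND 0xFF,
  // which the backend can fold into a movzx.
  assert(widenAndMask(0x7F, 0x3F) == 0xFF);
  // With every bit demanded, 0x7F cannot legally grow and is left alone.
  assert(widenAndMask(0x7F, 0xFFFFFFFFu) == 0x7F);
  return 0;
}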
36271
36272void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36273 KnownBits &Known,
36274 const APInt &DemandedElts,
36275 const SelectionDAG &DAG,
36276 unsigned Depth) const {
36277 unsigned BitWidth = Known.getBitWidth();
36278 unsigned NumElts = DemandedElts.getBitWidth();
36279 unsigned Opc = Op.getOpcode();
36280 EVT VT = Op.getValueType();
36281 assert((Opc >= ISD::BUILTIN_OP_END ||
36282 Opc == ISD::INTRINSIC_WO_CHAIN ||
36283 Opc == ISD::INTRINSIC_W_CHAIN ||
36284 Opc == ISD::INTRINSIC_VOID) &&
36285 "Should use MaskedValueIsZero if you don't know whether Op"
36286 " is a target node!");
36287
36288 Known.resetAll();
36289 switch (Opc) {
36290 default: break;
36291 case X86ISD::SETCC:
36292 Known.Zero.setBitsFrom(1);
36293 break;
36294 case X86ISD::MOVMSK: {
36295 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36296 Known.Zero.setBitsFrom(NumLoBits);
36297 break;
36298 }
36299 case X86ISD::PEXTRB:
36300 case X86ISD::PEXTRW: {
36301 SDValue Src = Op.getOperand(0);
36302 EVT SrcVT = Src.getValueType();
36303 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36304 Op.getConstantOperandVal(1));
36305 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36306 Known = Known.anyextOrTrunc(BitWidth);
36307 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36308 break;
36309 }
36310 case X86ISD::VSRAI:
36311 case X86ISD::VSHLI:
36312 case X86ISD::VSRLI: {
36313 unsigned ShAmt = Op.getConstantOperandVal(1);
36314 if (ShAmt >= VT.getScalarSizeInBits()) {
36315 Known.setAllZero();
36316 break;
36317 }
36318
36319 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36320 if (Opc == X86ISD::VSHLI) {
36321 Known.Zero <<= ShAmt;
36322 Known.One <<= ShAmt;
36323 // Low bits are known zero.
36324 Known.Zero.setLowBits(ShAmt);
36325 } else if (Opc == X86ISD::VSRLI) {
36326 Known.Zero.lshrInPlace(ShAmt);
36327 Known.One.lshrInPlace(ShAmt);
36328 // High bits are known zero.
36329 Known.Zero.setHighBits(ShAmt);
36330 } else {
36331 Known.Zero.ashrInPlace(ShAmt);
36332 Known.One.ashrInPlace(ShAmt);
36333 }
36334 break;
36335 }
36336 case X86ISD::PACKUS: {
36337 // PACKUS is just a truncation if the upper half is zero.
36338 APInt DemandedLHS, DemandedRHS;
36339 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36340
36341 Known.One = APInt::getAllOnes(BitWidth * 2);
36342 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36343
36344 KnownBits Known2;
36345 if (!!DemandedLHS) {
36346 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36347 Known = KnownBits::commonBits(Known, Known2);
36348 }
36349 if (!!DemandedRHS) {
36350 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36351 Known = KnownBits::commonBits(Known, Known2);
36352 }
36353
36354 if (Known.countMinLeadingZeros() < BitWidth)
36355 Known.resetAll();
36356 Known = Known.trunc(BitWidth);
36357 break;
36358 }
36359 case X86ISD::VBROADCAST: {
36360 SDValue Src = Op.getOperand(0);
36361 if (!Src.getSimpleValueType().isVector()) {
36362 Known = DAG.computeKnownBits(Src, Depth + 1);
36363 return;
36364 }
36365 break;
36366 }
36367 case X86ISD::AND: {
36368 if (Op.getResNo() == 0) {
36369 KnownBits Known2;
36370 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36371 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36372 Known &= Known2;
36373 }
36374 break;
36375 }
36376 case X86ISD::ANDNP: {
36377 KnownBits Known2;
36378 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36379 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36380
36381 // ANDNP = (~X & Y);
36382 Known.One &= Known2.Zero;
36383 Known.Zero |= Known2.One;
36384 break;
36385 }
36386 case X86ISD::FOR: {
36387 KnownBits Known2;
36388 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36389 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36390
36391 Known |= Known2;
36392 break;
36393 }
36394 case X86ISD::PSADBW: {
36395 assert(VT.getScalarType() == MVT::i64 &&
36396 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
36397 "Unexpected PSADBW types");
36398
36399 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
36400 Known.Zero.setBitsFrom(16);
36401 break;
36402 }
36403 case X86ISD::PMULUDQ: {
36404 KnownBits Known2;
36405 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36406 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36407
36408 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
36409 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
36410 Known = KnownBits::mul(Known, Known2);
36411 break;
36412 }
36413 case X86ISD::CMOV: {
36414 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
36415 // If we don't know any bits, early out.
36416 if (Known.isUnknown())
36417 break;
36418 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
36419
36420 // Only known if known in both the LHS and RHS.
36421 Known = KnownBits::commonBits(Known, Known2);
36422 break;
36423 }
36424 case X86ISD::BEXTR:
36425 case X86ISD::BEXTRI: {
36426 SDValue Op0 = Op.getOperand(0);
36427 SDValue Op1 = Op.getOperand(1);
36428
36429 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36430 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
36431 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
36432
36433 // If the length is 0, the result is 0.
36434 if (Length == 0) {
36435 Known.setAllZero();
36436 break;
36437 }
36438
36439 if ((Shift + Length) <= BitWidth) {
36440 Known = DAG.computeKnownBits(Op0, Depth + 1);
36441 Known = Known.extractBits(Length, Shift);
36442 Known = Known.zextOrTrunc(BitWidth);
36443 }
36444 }
36445 break;
36446 }
36447 case X86ISD::PDEP: {
36448 KnownBits Known2;
36449 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36450 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36451 // Zeros are retained from the mask operand. But not ones.
36452 Known.One.clearAllBits();
36453 // The result will have at least as many trailing zeros as the non-mask
36454 // operand since bits can only map to the same or higher bit position.
36455 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
36456 break;
36457 }
36458 case X86ISD::PEXT: {
36459 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36460 // The result has as many leading zeros as the number of zeroes in the mask.
36461 unsigned Count = Known.Zero.countPopulation();
36462 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
36463 Known.One.clearAllBits();
36464 break;
36465 }
36466 case X86ISD::VTRUNC:
36467 case X86ISD::VTRUNCS:
36468 case X86ISD::VTRUNCUS:
36469 case X86ISD::CVTSI2P:
36470 case X86ISD::CVTUI2P:
36471 case X86ISD::CVTP2SI:
36472 case X86ISD::CVTP2UI:
36473 case X86ISD::MCVTP2SI:
36474 case X86ISD::MCVTP2UI:
36475 case X86ISD::CVTTP2SI:
36476 case X86ISD::CVTTP2UI:
36477 case X86ISD::MCVTTP2SI:
36478 case X86ISD::MCVTTP2UI:
36479 case X86ISD::MCVTSI2P:
36480 case X86ISD::MCVTUI2P:
36481 case X86ISD::VFPROUND:
36482 case X86ISD::VMFPROUND:
36483 case X86ISD::CVTPS2PH:
36484 case X86ISD::MCVTPS2PH: {
36485 // Truncations/Conversions - upper elements are known zero.
36486 EVT SrcVT = Op.getOperand(0).getValueType();
36487 if (SrcVT.isVector()) {
36488 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36489 if (NumElts > NumSrcElts &&
36490 DemandedElts.countTrailingZeros() >= NumSrcElts)
36491 Known.setAllZero();
36492 }
36493 break;
36494 }
36495 case X86ISD::STRICT_CVTTP2SI:
36496 case X86ISD::STRICT_CVTTP2UI:
36497 case X86ISD::STRICT_CVTSI2P:
36498 case X86ISD::STRICT_CVTUI2P:
36499 case X86ISD::STRICT_VFPROUND:
36500 case X86ISD::STRICT_CVTPS2PH: {
36501 // Strict Conversions - upper elements are known zero.
36502 EVT SrcVT = Op.getOperand(1).getValueType();
36503 if (SrcVT.isVector()) {
36504 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36505 if (NumElts > NumSrcElts &&
36506 DemandedElts.countTrailingZeros() >= NumSrcElts)
36507 Known.setAllZero();
36508 }
36509 break;
36510 }
36511 case X86ISD::MOVQ2DQ: {
36512 // Move from MMX to XMM. Upper half of XMM should be 0.
36513 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
36514 Known.setAllZero();
36515 break;
36516 }
36517 }
36518
36519 // Handle target shuffles.
36520 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36521 if (isTargetShuffle(Opc)) {
36522 SmallVector<int, 64> Mask;
36523 SmallVector<SDValue, 2> Ops;
36524 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36525 unsigned NumOps = Ops.size();
36526 unsigned NumElts = VT.getVectorNumElements();
36527 if (Mask.size() == NumElts) {
36528 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36529 Known.Zero.setAllBits(); Known.One.setAllBits();
36530 for (unsigned i = 0; i != NumElts; ++i) {
36531 if (!DemandedElts[i])
36532 continue;
36533 int M = Mask[i];
36534 if (M == SM_SentinelUndef) {
36535 // For UNDEF elements, we don't know anything about the common state
36536 // of the shuffle result.
36537 Known.resetAll();
36538 break;
36539 }
36540 if (M == SM_SentinelZero) {
36541 Known.One.clearAllBits();
36542 continue;
36543 }
36544 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36545 "Shuffle index out of range");
36546
36547 unsigned OpIdx = (unsigned)M / NumElts;
36548 unsigned EltIdx = (unsigned)M % NumElts;
36549 if (Ops[OpIdx].getValueType() != VT) {
36550 // TODO - handle target shuffle ops with different value types.
36551 Known.resetAll();
36552 break;
36553 }
36554 DemandedOps[OpIdx].setBit(EltIdx);
36555 }
36556 // Known bits are the values that are shared by every demanded element.
36557 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
36558 if (!DemandedOps[i])
36559 continue;
36560 KnownBits Known2 =
36561 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
36562 Known = KnownBits::commonBits(Known, Known2);
36563 }
36564 }
36565 }
36566 }
36567}
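The Known.Zero/Known.One bookkeeping for the immediate-shift cases above (X86ISD::VSHLI in particular) is plain bit arithmetic. A minimal standalone sketch, assuming an 8-bit element and using an ad-hoc Known8 struct rather than the KnownBits class:

// Minimal sketch of the VSHLI handling above with 8-bit masks instead of the
// KnownBits class. Zero and One are the "known zero" / "known one" bit masks;
// Known8 and shiftLeftKnown are names invented for this example.
#include <cassert>
#include <cstdint>

struct Known8 {
  uint8_t Zero, One;
};

static Known8 shiftLeftKnown(Known8 K, unsigned ShAmt) {
  K.Zero = (uint8_t)(K.Zero << ShAmt);
  K.One = (uint8_t)(K.One << ShAmt);
  K.Zero |= (uint8_t)((1u << ShAmt) - 1); // shifted-in low bits are known zero
  return K;
}

int main() {
  // Operand known to be 0b0000'01?1: bits 3..7 known zero, bits 0 and 2 known one.
  Known8 K{/*Zero=*/0xF8, /*One=*/0x05};
  Known8 R = shiftLeftKnown(K, 2);
  // Result is 0b0001'?100: bit 3 is still unknown, the low two bits are zero.
  assert(R.Zero == 0xE3 && R.One == 0x14);
  return 0;
}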
36568
36569unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
36570 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
36571 unsigned Depth) const {
36572 EVT VT = Op.getValueType();
36573 unsigned VTBits = VT.getScalarSizeInBits();
36574 unsigned Opcode = Op.getOpcode();
36575 switch (Opcode) {
36576 case X86ISD::SETCC_CARRY:
36577 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
36578 return VTBits;
36579
36580 case X86ISD::VTRUNC: {
36581 SDValue Src = Op.getOperand(0);
36582 MVT SrcVT = Src.getSimpleValueType();
36583 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
36584 assert(VTBits < NumSrcBits && "Illegal truncation input type");
36585 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36586 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
36587 if (Tmp > (NumSrcBits - VTBits))
36588 return Tmp - (NumSrcBits - VTBits);
36589 return 1;
36590 }
36591
36592 case X86ISD::PACKSS: {
36593 // PACKSS is just a truncation if the sign bits extend to the packed size.
36594 APInt DemandedLHS, DemandedRHS;
36595 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
36596 DemandedRHS);
36597
36598 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
36599 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
36600 if (!!DemandedLHS)
36601 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36602 if (!!DemandedRHS)
36603 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36604 unsigned Tmp = std::min(Tmp0, Tmp1);
36605 if (Tmp > (SrcBits - VTBits))
36606 return Tmp - (SrcBits - VTBits);
36607 return 1;
36608 }
36609
36610 case X86ISD::VBROADCAST: {
36611 SDValue Src = Op.getOperand(0);
36612 if (!Src.getSimpleValueType().isVector())
36613 return DAG.ComputeNumSignBits(Src, Depth + 1);
36614 break;
36615 }
36616
36617 case X86ISD::VSHLI: {
36618 SDValue Src = Op.getOperand(0);
36619 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
36620 if (ShiftVal.uge(VTBits))
36621 return VTBits; // Shifted all bits out --> zero.
36622 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36623 if (ShiftVal.uge(Tmp))
36624 return 1; // Shifted all sign bits out --> unknown.
36625 return Tmp - ShiftVal.getZExtValue();
36626 }
36627
36628 case X86ISD::VSRAI: {
36629 SDValue Src = Op.getOperand(0);
36630 APInt ShiftVal = Op.getConstantOperandAPInt(1);
36631 if (ShiftVal.uge(VTBits - 1))
36632 return VTBits; // Sign splat.
36633 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36634 ShiftVal += Tmp;
36635 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
36636 }
36637
36638 case X86ISD::FSETCC:
36639 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
36640 if (VT == MVT::f32 || VT == MVT::f64 ||
36641 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
36642 return VTBits;
36643 break;
36644
36645 case X86ISD::PCMPGT:
36646 case X86ISD::PCMPEQ:
36647 case X86ISD::CMPP:
36648 case X86ISD::VPCOM:
36649 case X86ISD::VPCOMU:
36650 // Vector compares return zero/all-bits result values.
36651 return VTBits;
36652
36653 case X86ISD::ANDNP: {
36654 unsigned Tmp0 =
36655 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
36656 if (Tmp0 == 1) return 1; // Early out.
36657 unsigned Tmp1 =
36658 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
36659 return std::min(Tmp0, Tmp1);
36660 }
36661
36662 case X86ISD::CMOV: {
36663 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
36664 if (Tmp0 == 1) return 1; // Early out.
36665 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
36666 return std::min(Tmp0, Tmp1);
36667 }
36668 }
36669
36670 // Handle target shuffles.
36671 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36672 if (isTargetShuffle(Opcode)) {
36673 SmallVector<int, 64> Mask;
36674 SmallVector<SDValue, 2> Ops;
36675 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36676 unsigned NumOps = Ops.size();
36677 unsigned NumElts = VT.getVectorNumElements();
36678 if (Mask.size() == NumElts) {
36679 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36680 for (unsigned i = 0; i != NumElts; ++i) {
36681 if (!DemandedElts[i])
36682 continue;
36683 int M = Mask[i];
36684 if (M == SM_SentinelUndef) {
36685 // For UNDEF elements, we don't know anything about the common state
36686 // of the shuffle result.
36687 return 1;
36688 } else if (M == SM_SentinelZero) {
36689 // Zero = all sign bits.
36690 continue;
36691 }
36692 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36693 "Shuffle index out of range");
36694
36695 unsigned OpIdx = (unsigned)M / NumElts;
36696 unsigned EltIdx = (unsigned)M % NumElts;
36697 if (Ops[OpIdx].getValueType() != VT) {
36698 // TODO - handle target shuffle ops with different value types.
36699 return 1;
36700 }
36701 DemandedOps[OpIdx].setBit(EltIdx);
36702 }
36703 unsigned Tmp0 = VTBits;
36704 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
36705 if (!DemandedOps[i])
36706 continue;
36707 unsigned Tmp1 =
36708 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
36709 Tmp0 = std::min(Tmp0, Tmp1);
36710 }
36711 return Tmp0;
36712 }
36713 }
36714 }
36715
36716 // Fallback case.
36717 return 1;
36718}
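The VSRAI rule above, where the sign-bit count grows by the shift amount and saturates at the element width, can be sanity-checked on a scalar. A small sketch under the assumption of a 16-bit element; numSignBits and ashr16 are reference helpers written for this example, not DAG queries:

// Sketch of the VSRAI sign-bit arithmetic above for a 16-bit scalar.
// numSignBits() and ashr16() are reference helpers written for this example.
#include <cassert>
#include <cstdint>

// Number of high bits that are copies of the sign bit (including the sign bit).
static unsigned numSignBits(uint16_t V) {
  unsigned Sign = (V >> 15) & 1;
  unsigned N = 1;
  while (N < 16 && (((V >> (15 - N)) & 1) == Sign))
    ++N;
  return N;
}

// Arithmetic shift right on a 16-bit pattern, without relying on signed shifts.
static uint16_t ashr16(uint16_t V, unsigned ShAmt) {
  uint16_t R = (uint16_t)(V >> ShAmt);
  if (V & 0x8000)
    R |= (uint16_t)(0xFFFFu << (16 - ShAmt)); // replicate the sign bit
  return R;
}

int main() {
  uint16_t V = 0xFF80; // 9 sign bits
  assert(numSignBits(V) == 9);
  // Shifting right arithmetically by 4 adds 4 more sign-bit copies: 9 + 4 = 13.
  assert(numSignBits(ashr16(V, 4)) == 13);
  // Shifting by VTBits - 1 or more leaves nothing but the sign splat.
  assert(numSignBits(ashr16(V, 15)) == 16);
  return 0;
}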
36719
36720SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
36721 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
36722 return N->getOperand(0);
36723 return N;
36724}
36725
36726// Helper to look for a normal load that can be narrowed into a vzload with the
36727// specified VT and memory VT. Returns SDValue() on failure.
36728static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
36729 SelectionDAG &DAG) {
36730 // Can't if the load is volatile or atomic.
36731 if (!LN->isSimple())
36732 return SDValue();
36733
36734 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36735 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36736 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
36737 LN->getPointerInfo(), LN->getOriginalAlign(),
36738 LN->getMemOperand()->getFlags());
36739}
36740
36741// Attempt to match a combined shuffle mask against supported unary shuffle
36742// instructions.
36743// TODO: Investigate sharing more of this with shuffle lowering.
36744static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36745 bool AllowFloatDomain, bool AllowIntDomain,
36746 SDValue V1, const X86Subtarget &Subtarget,
36747 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
36748 unsigned NumMaskElts = Mask.size();
36749 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
36750
36751 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
36752 if (Mask[0] == 0 &&
36753 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
36754 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
36755 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36756 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
36757 Shuffle = X86ISD::VZEXT_MOVL;
36758 if (MaskEltSize == 16)
36759 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36760 else
36761 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36762 return true;
36763 }
36764 }
36765
36766 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
36767 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
36768 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
36769 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
36770 unsigned MaxScale = 64 / MaskEltSize;
36771 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
36772 bool MatchAny = true;
36773 bool MatchZero = true;
36774 unsigned NumDstElts = NumMaskElts / Scale;
36775 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
36776 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
36777 MatchAny = MatchZero = false;
36778 break;
36779 }
36780 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
36781 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
36782 }
36783 if (MatchAny || MatchZero) {
36784 assert(MatchZero && "Failed to match zext but matched aext?");
36785 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
36786 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
36787 MVT::getIntegerVT(MaskEltSize);
36788 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
36789
36790 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
36791 if (SrcVT.getVectorNumElements() != NumDstElts)
36792 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
36793
36794 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
36795 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
36796 return true;
36797 }
36798 }
36799 }
36800
36801 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
36802 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
36803 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
36804 isUndefOrEqual(Mask[0], 0) &&
36805 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
36806 Shuffle = X86ISD::VZEXT_MOVL;
36807 if (MaskEltSize == 16)
36808 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36809 else
36810 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36811 return true;
36812 }
36813
36814 // Check if we have SSE3, which lets us use MOVDDUP etc. These
36815 // instructions are no slower than UNPCKLPD but have the option to
36816 // fold the input operand into even an unaligned memory load.
36817 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
36818 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
36819 Shuffle = X86ISD::MOVDDUP;
36820 SrcVT = DstVT = MVT::v2f64;
36821 return true;
36822 }
36823 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36824 Shuffle = X86ISD::MOVSLDUP;
36825 SrcVT = DstVT = MVT::v4f32;
36826 return true;
36827 }
36828 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
36829 Shuffle = X86ISD::MOVSHDUP;
36830 SrcVT = DstVT = MVT::v4f32;
36831 return true;
36832 }
36833 }
36834
36835 if (MaskVT.is256BitVector() && AllowFloatDomain) {
36836 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
36837 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36838 Shuffle = X86ISD::MOVDDUP;
36839 SrcVT = DstVT = MVT::v4f64;
36840 return true;
36841 }
36842 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36843 Shuffle = X86ISD::MOVSLDUP;
36844 SrcVT = DstVT = MVT::v8f32;
36845 return true;
36846 }
36847 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
36848 Shuffle = X86ISD::MOVSHDUP;
36849 SrcVT = DstVT = MVT::v8f32;
36850 return true;
36851 }
36852 }
36853
36854 if (MaskVT.is512BitVector() && AllowFloatDomain) {
36855 assert(Subtarget.hasAVX512() &&
36856 "AVX512 required for 512-bit vector shuffles");
36857 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36858 Shuffle = X86ISD::MOVDDUP;
36859 SrcVT = DstVT = MVT::v8f64;
36860 return true;
36861 }
36862 if (isTargetShuffleEquivalent(
36863 MaskVT, Mask,
36864 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
36865 Shuffle = X86ISD::MOVSLDUP;
36866 SrcVT = DstVT = MVT::v16f32;
36867 return true;
36868 }
36869 if (isTargetShuffleEquivalent(
36870 MaskVT, Mask,
36871 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
36872 Shuffle = X86ISD::MOVSHDUP;
36873 SrcVT = DstVT = MVT::v16f32;
36874 return true;
36875 }
36876 }
36877
36878 return false;
36879}
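The Scale loop in the extension matching above reduces to a simple test: every Scale-th mask slot must select element i (or be undef), and the slots in between must be undef or zero. A compact sketch over a plain integer mask, where -1 stands in for SM_SentinelUndef and -2 for SM_SentinelZero; matchesZExt is a hypothetical helper covering only the zero-extend variant:

// Sketch of the ZERO_EXTEND_VECTOR_INREG mask test above. In this standalone
// example -1 plays the role of SM_SentinelUndef and -2 of SM_SentinelZero.
#include <cassert>
#include <vector>

static bool matchesZExt(const std::vector<int> &Mask, unsigned Scale) {
  unsigned NumDstElts = (unsigned)Mask.size() / Scale;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    if (Mask[i * Scale] != (int)i && Mask[i * Scale] != -1)
      return false; // slot must select element i (or be undef)
    for (unsigned j = 1; j != Scale; ++j)
      if (Mask[i * Scale + j] >= 0)
        return false; // the filler slots must be undef or zero
  }
  return true;
}

int main() {
  // v8i16 mask {0,Z,1,Z,2,Z,3,Z} zero-extends the low four elements (Scale = 2).
  assert(matchesZExt({0, -2, 1, -2, 2, -2, 3, -2}, 2));
  // {0,Z,2,Z,4,Z,6,Z} is not a zero-extension: slot 2 reads element 2, not 1.
  assert(!matchesZExt({0, -2, 2, -2, 4, -2, 6, -2}, 2));
  return 0;
}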
36880
36881// Attempt to match a combined shuffle mask against supported unary immediate
36882// permute instructions.
36883// TODO: Investigate sharing more of this with shuffle lowering.
36884static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
36885 const APInt &Zeroable,
36886 bool AllowFloatDomain, bool AllowIntDomain,
36887 const X86Subtarget &Subtarget,
36888 unsigned &Shuffle, MVT &ShuffleVT,
36889 unsigned &PermuteImm) {
36890 unsigned NumMaskElts = Mask.size();
36891 unsigned InputSizeInBits = MaskVT.getSizeInBits();
36892 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
36893 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
36894 bool ContainsZeros = isAnyZero(Mask);
36895
36896 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
36897 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
36898 // Check for lane crossing permutes.
36899 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
36900 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
36901 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
36902 Shuffle = X86ISD::VPERMI;
36903 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
36904 PermuteImm = getV4X86ShuffleImm(Mask);
36905 return true;
36906 }
36907 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
36908 SmallVector<int, 4> RepeatedMask;
36909 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
36910 Shuffle = X86ISD::VPERMI;
36911 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
36912 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
36913 return true;
36914 }
36915 }
36916 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
36917 // VPERMILPD can permute with a non-repeating shuffle.
36918 Shuffle = X86ISD::VPERMILPI;
36919 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
36920 PermuteImm = 0;
36921 for (int i = 0, e = Mask.size(); i != e; ++i) {
36922 int M = Mask[i];
36923 if (M == SM_SentinelUndef)
36924 continue;
36925 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
36926 PermuteImm |= (M & 1) << i;
36927 }
36928 return true;
36929 }
36930 }
36931
36932 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
36933 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
36934 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
36935 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
36936 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
36937 SmallVector<int, 4> RepeatedMask;
36938 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36939 // Narrow the repeated mask to create 32-bit element permutes.
36940 SmallVector<int, 4> WordMask = RepeatedMask;
36941 if (MaskScalarSizeInBits == 64)
36942 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
36943
36944 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
36945 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
36946 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
36947 PermuteImm = getV4X86ShuffleImm(WordMask);
36948 return true;
36949 }
36950 }
36951
36952 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
36953 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
36954 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36955 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36956 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36957 SmallVector<int, 4> RepeatedMask;
36958 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36959 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
36960 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
36961
36962 // PSHUFLW: permute lower 4 elements only.
36963 if (isUndefOrInRange(LoMask, 0, 4) &&
36964 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
36965 Shuffle = X86ISD::PSHUFLW;
36966 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36967 PermuteImm = getV4X86ShuffleImm(LoMask);
36968 return true;
36969 }
36970
36971 // PSHUFHW: permute upper 4 elements only.
36972 if (isUndefOrInRange(HiMask, 4, 8) &&
36973 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
36974 // Offset the HiMask so that we can create the shuffle immediate.
36975 int OffsetHiMask[4];
36976 for (int i = 0; i != 4; ++i)
36977 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
36978
36979 Shuffle = X86ISD::PSHUFHW;
36980 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36981 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
36982 return true;
36983 }
36984 }
36985 }
36986
36987 // Attempt to match against byte/bit shifts.
36988 if (AllowIntDomain &&
36989 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36990 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36991 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36992 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
36993 Mask, 0, Zeroable, Subtarget);
36994 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
36995 32 <= ShuffleVT.getScalarSizeInBits())) {
36996 PermuteImm = (unsigned)ShiftAmt;
36997 return true;
36998 }
36999 }
37000
37001 // Attempt to match against bit rotates.
37002 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37003 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37004 Subtarget.hasAVX512())) {
37005 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37006 Subtarget, Mask);
37007 if (0 < RotateAmt) {
37008 Shuffle = X86ISD::VROTLI;
37009 PermuteImm = (unsigned)RotateAmt;
37010 return true;
37011 }
37012 }
37013
37014 return false;
37015}
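To make the PSHUFLW/PSHUFHW path above concrete: the high-half mask is rebased to the 0..3 range and then packed two bits per lane, the same encoding getV4X86ShuffleImm produces for the low half. A small sketch in which packImm4 is a made-up stand-in for that helper:

// Sketch of the PSHUFHW immediate construction above. packImm4() mimics
// getV4X86ShuffleImm: four 2-bit lane selectors, undef (-1) treated as 0.
#include <cassert>

static unsigned packImm4(const int M[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)(M[i] < 0 ? 0 : M[i]) << (2 * i);
  return Imm;
}

int main() {
  // v8i16 mask {0,1,2,3, 5,5,7,7}: the low half is the identity, so this is a
  // PSHUFHW. Rebase the high half {5,5,7,7} to {1,1,3,3} and pack it.
  int HiMask[4] = {5, 5, 7, 7};
  int OffsetHiMask[4];
  for (int i = 0; i != 4; ++i)
    OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
  assert(packImm4(OffsetHiMask) == 0xF5); // pshufhw $0xf5, %xmm0, %xmm0
  return 0;
}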
37016
37017// Attempt to match a combined unary shuffle mask against supported binary
37018// shuffle instructions.
37019// TODO: Investigate sharing more of this with shuffle lowering.
37020static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37021 bool AllowFloatDomain, bool AllowIntDomain,
37022 SDValue &V1, SDValue &V2, const SDLoc &DL,
37023 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37024 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37025 bool IsUnary) {
37026 unsigned NumMaskElts = Mask.size();
37027 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37028
37029 if (MaskVT.is128BitVector()) {
37030 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
37031 V2 = V1;
37032 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37033 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37034 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37035 return true;
37036 }
37037 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
37038 V2 = V1;
37039 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37040 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37041 return true;
37042 }
37043 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
37044 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37045 std::swap(V1, V2);
37046 Shuffle = X86ISD::MOVSD;
37047 SrcVT = DstVT = MVT::v2f64;
37048 return true;
37049 }
37050 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
37051 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37052 Shuffle = X86ISD::MOVSS;
37053 SrcVT = DstVT = MVT::v4f32;
37054 return true;
37055 }
37056 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
37057 Subtarget.hasFP16()) {
37058 Shuffle = X86ISD::MOVSH;
37059 SrcVT = DstVT = MVT::v8f16;
37060 return true;
37061 }
37062 }
37063
37064 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
37065 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37066 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37067 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37068 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37069 Subtarget)) {
37070 DstVT = MaskVT;
37071 return true;
37072 }
37073 }
37074
37075 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37076 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37077 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37078 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37079 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37080 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
37081 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37082 Subtarget)) {
37083 SrcVT = DstVT = MaskVT;
37084 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37085 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37086 return true;
37087 }
37088 }
37089
37090 // Attempt to match against an OR if we're performing a blend shuffle and the
37091 // non-blended source element is zero in each case.
37092 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37093 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37094 bool IsBlend = true;
37095 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37096 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37097 unsigned Scale1 = NumV1Elts / NumMaskElts;
37098 unsigned Scale2 = NumV2Elts / NumMaskElts;
37099 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37100 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37101 for (unsigned i = 0; i != NumMaskElts; ++i) {
37102 int M = Mask[i];
37103 if (M == SM_SentinelUndef)
37104 continue;
37105 if (M == SM_SentinelZero) {
37106 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37107 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37108 continue;
37109 }
37110 if (M == (int)i) {
37111 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37112 continue;
37113 }
37114 if (M == (int)(i + NumMaskElts)) {
37115 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37116 continue;
37117 }
37118 IsBlend = false;
37119 break;
37120 }
37121 if (IsBlend) {
37122 if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
37123 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
37124 Shuffle = ISD::OR;
37125 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37126 return true;
37127 }
37128 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37129 // FIXME: handle mismatched sizes?
37130 // TODO: investigate if `ISD::OR` handling in
37131 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37132 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37133 unsigned NumElts = V.getValueType().getVectorNumElements();
37134 KnownBits Known(NumElts);
37135 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37136 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37137 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37138 if (PeepholeKnown.isZero())
37139 Known.Zero.setBit(EltIdx);
37140 if (PeepholeKnown.isAllOnes())
37141 Known.One.setBit(EltIdx);
37142 }
37143 return Known;
37144 };
37145
37146 KnownBits V1Known = computeKnownBitsElementWise(V1);
37147 KnownBits V2Known = computeKnownBitsElementWise(V2);
37148
37149 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37150 int M = Mask[i];
37151 if (M == SM_SentinelUndef)
37152 continue;
37153 if (M == SM_SentinelZero) {
37154 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37155 continue;
37156 }
37157 if (M == (int)i) {
37158 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37159 continue;
37160 }
37161 if (M == (int)(i + NumMaskElts)) {
37162 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37163 continue;
37164 }
37165 llvm_unreachable("will not get here.");
37166 }
37167 if (IsBlend) {
37168 Shuffle = ISD::OR;
37169 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37170 return true;
37171 }
37172 }
37173 }
37174 }
37175
37176 return false;
37177}
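The OR fallback above rests on one observation: if every lane either takes V1 while the matching V2 lane is known zero, or takes V2 while the matching V1 lane is known zero, the blend is just a bitwise OR of the two sources. A lane-level sketch with plain 4-element integer vectors; blendIsOr is a hypothetical helper that ignores the undef/zero sentinels handled in the real code:

// Sketch of the OR-blend test above on 4-lane integer vectors. A mask entry i
// selects V1[i]; an entry NumElts + i (here 4 + i) selects V2[i].
#include <array>
#include <cassert>

using V4 = std::array<unsigned, 4>;

static bool blendIsOr(const V4 &V1, const V4 &V2, const std::array<int, 4> &Mask) {
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] == i && V2[i] != 0)
      return false; // lane comes from V1, so the V2 lane must be zero
    if (Mask[i] == 4 + i && V1[i] != 0)
      return false; // lane comes from V2, so the V1 lane must be zero
  }
  return true;
}

int main() {
  V4 A = {7, 0, 9, 0}, B = {0, 5, 0, 6};
  std::array<int, 4> Mask = {0, 5, 2, 7}; // A[0], B[1], A[2], B[3]
  assert(blendIsOr(A, B, Mask));
  // Once the property holds, the blend result is just A | B, lane by lane.
  for (int i = 0; i != 4; ++i)
    assert((A[i] | B[i]) == (Mask[i] < 4 ? A[Mask[i]] : B[Mask[i] - 4]));
  return 0;
}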
37178
37179static bool matchBinaryPermuteShuffle(
37180 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37181 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37182 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37183 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37184 unsigned NumMaskElts = Mask.size();
37185 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37186
37187 // Attempt to match against VALIGND/VALIGNQ rotate.
37188 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37189 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37190 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37191 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37192 if (!isAnyZero(Mask)) {
37193 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37194 if (0 < Rotation) {
37195 Shuffle = X86ISD::VALIGN;
37196 if (EltSizeInBits == 64)
37197 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37198 else
37199 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37200 PermuteImm = Rotation;
37201 return true;
37202 }
37203 }
37204 }
37205
37206 // Attempt to match against PALIGNR byte rotate.
37207 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37208 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37209 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37210 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
37211 if (0 < ByteRotation) {
37212 Shuffle = X86ISD::PALIGNR;
37213 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
37214 PermuteImm = ByteRotation;
37215 return true;
37216 }
37217 }
37218
37219 // Attempt to combine to X86ISD::BLENDI.
37220 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
37221 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
37222 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
37223 uint64_t BlendMask = 0;
37224 bool ForceV1Zero = false, ForceV2Zero = false;
37225 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
37226 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
37227 ForceV2Zero, BlendMask)) {
37228 if (MaskVT == MVT::v16i16) {
37229 // We can only use v16i16 PBLENDW if the lanes are repeated.
37230 SmallVector<int, 8> RepeatedMask;
37231 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
37232 RepeatedMask)) {
37233          assert(RepeatedMask.size() == 8 &&
37234                 "Repeated mask size doesn't match!");
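          // Build the PBLENDW immediate: bit i selects the second source for
          // word i, repeated across both 128-bit lanes.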
37235 PermuteImm = 0;
37236 for (int i = 0; i < 8; ++i)
37237 if (RepeatedMask[i] >= 8)
37238 PermuteImm |= 1 << i;
37239 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37240 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37241 Shuffle = X86ISD::BLENDI;
37242 ShuffleVT = MaskVT;
37243 return true;
37244 }
37245 } else {
37246 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37247 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37248 PermuteImm = (unsigned)BlendMask;
37249 Shuffle = X86ISD::BLENDI;
37250 ShuffleVT = MaskVT;
37251 return true;
37252 }
37253 }
37254 }
37255
37256 // Attempt to combine to INSERTPS, but only if it has elements that need to
37257 // be set to zero.
37258 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37259 MaskVT.is128BitVector() && isAnyZero(Mask) &&
37260 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37261 Shuffle = X86ISD::INSERTPS;
37262 ShuffleVT = MVT::v4f32;
37263 return true;
37264 }
37265
37266 // Attempt to combine to SHUFPD.
37267 if (AllowFloatDomain && EltSizeInBits == 64 &&
37268 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37269 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37270 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37271 bool ForceV1Zero = false, ForceV2Zero = false;
37272 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
37273 PermuteImm, Mask, Zeroable)) {
37274 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37275 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37276 Shuffle = X86ISD::SHUFP;
37277 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
37278 return true;
37279 }
37280 }
37281
37282 // Attempt to combine to SHUFPS.
37283 if (AllowFloatDomain && EltSizeInBits == 32 &&
37284 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
37285 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37286 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37287 SmallVector<int, 4> RepeatedMask;
37288 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
37289       // Match each half of the repeated mask to determine whether it just
37290       // references one of the vectors, is zeroable, or is entirely undef.
37291 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
37292 int M0 = RepeatedMask[Offset];
37293 int M1 = RepeatedMask[Offset + 1];
37294
37295 if (isUndefInRange(RepeatedMask, Offset, 2)) {
37296 return DAG.getUNDEF(MaskVT);
37297 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
37298 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
37299 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
37300 return getZeroVector(MaskVT, Subtarget, DAG, DL);
37301 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
37302 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37303 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37304 return V1;
37305 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
37306 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37307 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37308 return V2;
37309 }
37310
37311 return SDValue();
37312 };
37313
37314 int ShufMask[4] = {-1, -1, -1, -1};
37315 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
37316 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
37317
37318 if (Lo && Hi) {
37319 V1 = Lo;
37320 V2 = Hi;
37321 Shuffle = X86ISD::SHUFP;
37322 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
37323 PermuteImm = getV4X86ShuffleImm(ShufMask);
37324 return true;
37325 }
37326 }
37327 }
37328
37329 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
37330 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37331 MaskVT.is128BitVector() &&
37332 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37333 Shuffle = X86ISD::INSERTPS;
37334 ShuffleVT = MVT::v4f32;
37335 return true;
37336 }
37337
37338 return false;
37339}
37340
37341static SDValue combineX86ShuffleChainWithExtract(
37342 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37343 bool HasVariableMask, bool AllowVariableCrossLaneMask,
37344 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37345 const X86Subtarget &Subtarget);
37346
37347/// Combine an arbitrary chain of shuffles into a single instruction if
37348/// possible.
37349///
37350/// This is the leaf of the recursive combine below. When we have found some
37351/// chain of single-use x86 shuffle instructions and accumulated the combined
37352/// shuffle mask represented by them, this will try to pattern match that mask
37353/// into either a single instruction if there is a special purpose instruction
37354/// for this operation, or into a PSHUFB instruction which is a fully general
37355/// instruction but should only be used to replace chains over a certain depth.
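/// Matching proceeds roughly from cheapest to most general: fixed unary and
/// binary shuffles, then immediate-controlled permutes, and finally variable
/// mask shuffles (PSHUFB, VPERMV, VPERMV3, etc.), which are only attempted once
/// the combine depth threshold is reached.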
37356static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
37357 ArrayRef<int> BaseMask, int Depth,
37358 bool HasVariableMask,
37359 bool AllowVariableCrossLaneMask,
37360 bool AllowVariablePerLaneMask,
37361 SelectionDAG &DAG,
37362 const X86Subtarget &Subtarget) {
37363   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
37364   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
37365          "Unexpected number of shuffle inputs!");
37366
37367 SDLoc DL(Root);
37368 MVT RootVT = Root.getSimpleValueType();
37369 unsigned RootSizeInBits = RootVT.getSizeInBits();
37370 unsigned NumRootElts = RootVT.getVectorNumElements();
37371
37372 // Canonicalize shuffle input op to the requested type.
37373 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
37374 if (VT.getSizeInBits() > Op.getValueSizeInBits())
37375 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
37376 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
37377 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
37378 return DAG.getBitcast(VT, Op);
37379 };
37380
37381 // Find the inputs that enter the chain. Note that multiple uses are OK
37382 // here, we're not going to remove the operands we find.
37383 bool UnaryShuffle = (Inputs.size() == 1);
37384 SDValue V1 = peekThroughBitcasts(Inputs[0]);
37385 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
37386 : peekThroughBitcasts(Inputs[1]));
37387
37388 MVT VT1 = V1.getSimpleValueType();
37389 MVT VT2 = V2.getSimpleValueType();
37390   assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
37391          (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
37392
37393 SDValue Res;
37394
37395 unsigned NumBaseMaskElts = BaseMask.size();
37396 if (NumBaseMaskElts == 1) {
37397     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
37398 return CanonicalizeShuffleInput(RootVT, V1);
37399 }
37400
37401 bool OptForSize = DAG.shouldOptForSize();
37402 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
37403 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
37404 (RootVT.isFloatingPoint() && Depth >= 1) ||
37405 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
37406
37407   // Don't combine if we are an AVX512/EVEX target and the mask element size
37408 // is different from the root element size - this would prevent writemasks
37409 // from being reused.
37410 bool IsMaskedShuffle = false;
37411 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
37412 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
37413 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
37414 IsMaskedShuffle = true;
37415 }
37416 }
37417
37418   // If we are shuffling a broadcast (and not introducing zeros) then
37419   // we can just use the broadcast directly. This works for smaller broadcast
37420   // elements as well, as they already repeat across each mask element.
37421 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
37422 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37423 V1.getValueSizeInBits() >= RootSizeInBits) {
37424 return CanonicalizeShuffleInput(RootVT, V1);
37425 }
37426
37427 SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
37428
37429 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
37430 // etc. can be simplified.
37431 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
37432 SmallVector<int> ScaledMask, IdentityMask;
37433 unsigned NumElts = VT1.getVectorNumElements();
37434 if (Mask.size() <= NumElts &&
37435 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
37436 for (unsigned i = 0; i != NumElts; ++i)
37437 IdentityMask.push_back(i);
37438 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
37439 return CanonicalizeShuffleInput(RootVT, V1);
37440 }
37441 }
37442
37443 // Handle 128/256-bit lane shuffles of 512-bit vectors.
37444 if (RootVT.is512BitVector() &&
37445 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
37446 // If the upper subvectors are zeroable, then an extract+insert is more
37447 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
37448 // to zero the upper subvectors.
37449 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
37450 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37451 return SDValue(); // Nothing to do!
37452       assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
37453              "Unexpected lane shuffle");
37454 Res = CanonicalizeShuffleInput(RootVT, V1);
37455 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
37456 bool UseZero = isAnyZero(Mask);
37457 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
37458 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
37459 }
37460
37461 // Narrow shuffle mask to v4x128.
37462 SmallVector<int, 4> ScaledMask;
37463     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
37464 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
37465
37466 // Try to lower to vshuf64x2/vshuf32x4.
37467 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
37468 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
37469 SelectionDAG &DAG) {
37470 unsigned PermMask = 0;
37471       // Ensure elements came from the same Op.
37472 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
37473 for (int i = 0; i < 4; ++i) {
37474         assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
37475 if (ScaledMask[i] < 0)
37476 continue;
37477
37478 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
37479 unsigned OpIndex = i / 2;
37480 if (Ops[OpIndex].isUndef())
37481 Ops[OpIndex] = Op;
37482 else if (Ops[OpIndex] != Op)
37483 return SDValue();
37484
37485 // Convert the 128-bit shuffle mask selection values into 128-bit
37486 // selection bits defined by a vshuf64x2 instruction's immediate control
37487 // byte.
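        // Two bits are used per 128-bit result lane; e.g. the identity selection
        // {0, 1, 2, 3} produces PermMask == 0xE4.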
37488 PermMask |= (ScaledMask[i] % 4) << (i * 2);
37489 }
37490
37491 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
37492 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
37493 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
37494 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37495 };
37496
37497 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
37498 // doesn't work because our mask is for 128 bits and we don't have an MVT
37499 // to match that.
37500 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
37501 isUndefOrInRange(ScaledMask[1], 0, 2) &&
37502 isUndefOrInRange(ScaledMask[2], 2, 4) &&
37503 isUndefOrInRange(ScaledMask[3], 2, 4) &&
37504 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
37505 ScaledMask[0] == (ScaledMask[2] % 2)) &&
37506 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
37507 ScaledMask[1] == (ScaledMask[3] % 2));
37508
37509 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
37510 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37511 return SDValue(); // Nothing to do!
37512 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
37513 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
37514 return DAG.getBitcast(RootVT, V);
37515 }
37516 }
37517
37518 // Handle 128-bit lane shuffles of 256-bit vectors.
37519 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
37520 // If the upper half is zeroable, then an extract+insert is more optimal
37521 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
37522 // zero the upper half.
37523 if (isUndefOrZero(Mask[1])) {
37524 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37525 return SDValue(); // Nothing to do!
37526       assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
37527 Res = CanonicalizeShuffleInput(RootVT, V1);
37528 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
37529 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
37530 256);
37531 }
37532
37533 // If we're inserting the low subvector, an insert-subvector 'concat'
37534 // pattern is quicker than VPERM2X128.
37535 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
37536 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
37537 !Subtarget.hasAVX2()) {
37538 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37539 return SDValue(); // Nothing to do!
37540 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
37541 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
37542 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
37543 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
37544 }
37545
37546 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
37547 return SDValue(); // Nothing to do!
37548
37549 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
37550 // we need to use the zeroing feature.
37551 // Prefer blends for sequential shuffles unless we are optimizing for size.
37552 if (UnaryShuffle &&
37553 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
37554 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
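      // VPERM2X128 immediate: bits [1:0] pick the 128-bit source half for the
      // low result half and bits [5:4] for the high half; 0x8 in either nibble
      // (bit 3 / bit 7) zeroes that half, used here for undef/zero elements.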
37555 unsigned PermMask = 0;
37556 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
37557 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
37558 return DAG.getNode(
37559 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
37560 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
37561 }
37562
37563 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37564 return SDValue(); // Nothing to do!
37565
37566 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
37567 if (!UnaryShuffle && !IsMaskedShuffle) {
37568       assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
37569              "Unexpected shuffle sentinel value");
37570 // Prefer blends to X86ISD::VPERM2X128.
37571 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
37572 unsigned PermMask = 0;
37573 PermMask |= ((Mask[0] & 3) << 0);
37574 PermMask |= ((Mask[1] & 3) << 4);
37575 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
37576 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
37577 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
37578 CanonicalizeShuffleInput(RootVT, LHS),
37579 CanonicalizeShuffleInput(RootVT, RHS),
37580 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37581 }
37582 }
37583 }
37584
37585 // For masks that have been widened to 128-bit elements or more,
37586 // narrow back down to 64-bit elements.
37587 if (BaseMaskEltSizeInBits > 64) {
37588     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
37589 int MaskScale = BaseMaskEltSizeInBits / 64;
37590 SmallVector<int, 64> ScaledMask;
37591 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37592 Mask = std::move(ScaledMask);
37593 }
37594
37595 // For masked shuffles, we're trying to match the root width for better
37596   // writemask folding; attempt to scale the mask.
37597 // TODO - variable shuffles might need this to be widened again.
37598 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
37599     assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
37600 int MaskScale = NumRootElts / Mask.size();
37601 SmallVector<int, 64> ScaledMask;
37602 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37603 Mask = std::move(ScaledMask);
37604 }
37605
37606 unsigned NumMaskElts = Mask.size();
37607 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
37608
37609 // Determine the effective mask value type.
37610 FloatDomain &= (32 <= MaskEltSizeInBits);
37611 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
37612 : MVT::getIntegerVT(MaskEltSizeInBits);
37613 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
37614
37615 // Only allow legal mask types.
37616 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37617 return SDValue();
37618
37619 // Attempt to match the mask against known shuffle patterns.
37620 MVT ShuffleSrcVT, ShuffleVT;
37621 unsigned Shuffle, PermuteImm;
37622
37623 // Which shuffle domains are permitted?
37624 // Permit domain crossing at higher combine depths.
37625 // TODO: Should we indicate which domain is preferred if both are allowed?
37626 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
37627 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
37628 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
37629
37630 // Determine zeroable mask elements.
37631 APInt KnownUndef, KnownZero;
37632 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
37633 APInt Zeroable = KnownUndef | KnownZero;
37634
37635 if (UnaryShuffle) {
37636 // Attempt to match against broadcast-from-vector.
37637 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
37638 if ((Subtarget.hasAVX2() ||
37639 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
37640 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
37641 if (isUndefOrEqual(Mask, 0)) {
37642 if (V1.getValueType() == MaskVT &&
37643 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37644 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
37645 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37646 return SDValue(); // Nothing to do!
37647 Res = V1.getOperand(0);
37648 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37649 return DAG.getBitcast(RootVT, Res);
37650 }
37651 if (Subtarget.hasAVX2()) {
37652 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37653 return SDValue(); // Nothing to do!
37654 Res = CanonicalizeShuffleInput(MaskVT, V1);
37655 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37656 return DAG.getBitcast(RootVT, Res);
37657 }
37658 }
37659 }
37660
37661 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
37662 Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
37663 (!IsMaskedShuffle ||
37664 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37665 if (Depth == 0 && Root.getOpcode() == Shuffle)
37666 return SDValue(); // Nothing to do!
37667 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37668 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
37669 return DAG.getBitcast(RootVT, Res);
37670 }
37671
37672 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37673 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
37674 PermuteImm) &&
37675 (!IsMaskedShuffle ||
37676 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37677 if (Depth == 0 && Root.getOpcode() == Shuffle)
37678 return SDValue(); // Nothing to do!
37679 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
37680 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
37681 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37682 return DAG.getBitcast(RootVT, Res);
37683 }
37684 }
37685
37686 // Attempt to combine to INSERTPS, but only if the inserted element has come
37687 // from a scalar.
37688 // TODO: Handle other insertions here as well?
37689 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
37690 Subtarget.hasSSE41() &&
37691 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
37692 if (MaskEltSizeInBits == 32) {
37693 SDValue SrcV1 = V1, SrcV2 = V2;
37694 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
37695 DAG) &&
37696 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37697 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37698 return SDValue(); // Nothing to do!
37699 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37700 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
37701 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
37702 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37703 return DAG.getBitcast(RootVT, Res);
37704 }
37705 }
37706 if (MaskEltSizeInBits == 64 &&
37707 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
37708 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37709 V2.getScalarValueSizeInBits() <= 32) {
37710 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37711 return SDValue(); // Nothing to do!
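      // INSERTPS immediate layout: bits [7:6] select the source element,
      // bits [5:4] the destination element, and bits [3:0] form the zero mask.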
37712 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
37713 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37714 CanonicalizeShuffleInput(MVT::v4f32, V1),
37715 CanonicalizeShuffleInput(MVT::v4f32, V2),
37716 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37717 return DAG.getBitcast(RootVT, Res);
37718 }
37719 }
37720
37721 SDValue NewV1 = V1; // Save operands in case early exit happens.
37722 SDValue NewV2 = V2;
37723 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
37724 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
37725 ShuffleVT, UnaryShuffle) &&
37726 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37727 if (Depth == 0 && Root.getOpcode() == Shuffle)
37728 return SDValue(); // Nothing to do!
37729 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
37730 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
37731 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
37732 return DAG.getBitcast(RootVT, Res);
37733 }
37734
37735 NewV1 = V1; // Save operands in case early exit happens.
37736 NewV2 = V2;
37737 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37738 AllowIntDomain, NewV1, NewV2, DL, DAG,
37739 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
37740 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37741 if (Depth == 0 && Root.getOpcode() == Shuffle)
37742 return SDValue(); // Nothing to do!
37743 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
37744 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
37745 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
37746 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37747 return DAG.getBitcast(RootVT, Res);
37748 }
37749
37750 // Typically from here on, we need an integer version of MaskVT.
37751 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
37752 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
37753
37754 // Annoyingly, SSE4A instructions don't map into the above match helpers.
37755 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
37756 uint64_t BitLen, BitIdx;
37757 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
37758 Zeroable)) {
37759 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
37760 return SDValue(); // Nothing to do!
37761 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37762 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
37763 DAG.getTargetConstant(BitLen, DL, MVT::i8),
37764 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37765 return DAG.getBitcast(RootVT, Res);
37766 }
37767
37768 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
37769 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
37770 return SDValue(); // Nothing to do!
37771 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37772 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
37773 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
37774 DAG.getTargetConstant(BitLen, DL, MVT::i8),
37775 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37776 return DAG.getBitcast(RootVT, Res);
37777 }
37778 }
37779
37780 // Match shuffle against TRUNCATE patterns.
37781 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
37782 // Match against a VTRUNC instruction, accounting for src/dst sizes.
37783 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
37784 Subtarget)) {
37785 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
37786 ShuffleSrcVT.getVectorNumElements();
37787 unsigned Opc =
37788 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
37789 if (Depth == 0 && Root.getOpcode() == Opc)
37790 return SDValue(); // Nothing to do!
37791 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37792 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
37793 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
37794 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
37795 return DAG.getBitcast(RootVT, Res);
37796 }
37797
37798 // Do we need a more general binary truncation pattern?
37799 if (RootSizeInBits < 512 &&
37800 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
37801 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
37802 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
37803 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
37804 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
37805 return SDValue(); // Nothing to do!
37806 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37807 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
37808 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37809 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
37810 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37811 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
37812 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
37813 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
37814 return DAG.getBitcast(RootVT, Res);
37815 }
37816 }
37817
37818 // Don't try to re-form single instruction chains under any circumstances now
37819 // that we've done encoding canonicalization for them.
37820 if (Depth < 1)
37821 return SDValue();
37822
37823 // Depth threshold above which we can efficiently use variable mask shuffles.
37824 int VariableCrossLaneShuffleDepth =
37825 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
37826 int VariablePerLaneShuffleDepth =
37827 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
37828 AllowVariableCrossLaneMask &=
37829 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
37830 AllowVariablePerLaneMask &=
37831 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
37832 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
37833 // higher depth before combining them.
37834 bool AllowBWIVPERMV3 =
37835 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
37836
37837 bool MaskContainsZeros = isAnyZero(Mask);
37838
37839 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
37840 // If we have a single input lane-crossing shuffle then lower to VPERMV.
37841 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
37842 if (Subtarget.hasAVX2() &&
37843 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
37844 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
37845 Res = CanonicalizeShuffleInput(MaskVT, V1);
37846 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
37847 return DAG.getBitcast(RootVT, Res);
37848 }
37849 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
37850 if ((Subtarget.hasAVX512() &&
37851 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37852 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37853 (Subtarget.hasBWI() &&
37854 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37855 (Subtarget.hasVBMI() &&
37856 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
37857 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37858 V2 = DAG.getUNDEF(MaskVT);
37859 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37860 return DAG.getBitcast(RootVT, Res);
37861 }
37862 }
37863
37864 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
37865 // vector as the second source (non-VLX will pad to 512-bit shuffles).
37866 if (UnaryShuffle && AllowVariableCrossLaneMask &&
37867 ((Subtarget.hasAVX512() &&
37868 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37869 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37870 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
37871 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37872 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37873 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37874 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37875 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37876 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
37877 for (unsigned i = 0; i != NumMaskElts; ++i)
37878 if (Mask[i] == SM_SentinelZero)
37879 Mask[i] = NumMaskElts + i;
37880 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37881 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
37882 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37883 return DAG.getBitcast(RootVT, Res);
37884 }
37885
37886 // If that failed and either input is extracted then try to combine as a
37887 // shuffle with the larger type.
37888 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
37889 Inputs, Root, BaseMask, Depth, HasVariableMask,
37890 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
37891 Subtarget))
37892 return WideShuffle;
37893
37894 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
37895 // (non-VLX will pad to 512-bit shuffles).
37896 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
37897 ((Subtarget.hasAVX512() &&
37898 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37899 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37900 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
37901 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
37902 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37903 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37904 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37905 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37906 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37907 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37908 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37909 return DAG.getBitcast(RootVT, Res);
37910 }
37911 return SDValue();
37912 }
37913
37914 // See if we can combine a single input shuffle with zeros to a bit-mask,
37915 // which is much simpler than any shuffle.
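  // e.g. a mask of {0, Zero, 2, Zero} becomes AND(V1, <AllOnes, 0, AllOnes, 0>).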
37916 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
37917 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
37918 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
37919 APInt Zero = APInt::getZero(MaskEltSizeInBits);
37920 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
37921 APInt UndefElts(NumMaskElts, 0);
37922 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
37923 for (unsigned i = 0; i != NumMaskElts; ++i) {
37924 int M = Mask[i];
37925 if (M == SM_SentinelUndef) {
37926 UndefElts.setBit(i);
37927 continue;
37928 }
37929 if (M == SM_SentinelZero)
37930 continue;
37931 EltBits[i] = AllOnes;
37932 }
37933 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
37934 Res = CanonicalizeShuffleInput(MaskVT, V1);
37935 unsigned AndOpcode =
37936 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
37937 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
37938 return DAG.getBitcast(RootVT, Res);
37939 }
37940
37941   // If we have a single input shuffle with different shuffle patterns in the
37942   // 128-bit lanes, use the variable mask to VPERMILPS.
37943   // TODO: Combine other mask types at higher depths.
37944 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37945 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
37946 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
37947 SmallVector<SDValue, 16> VPermIdx;
37948 for (int M : Mask) {
37949 SDValue Idx =
37950 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
37951 VPermIdx.push_back(Idx);
37952 }
37953 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
37954 Res = CanonicalizeShuffleInput(MaskVT, V1);
37955 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
37956 return DAG.getBitcast(RootVT, Res);
37957 }
37958
37959 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
37960 // to VPERMIL2PD/VPERMIL2PS.
37961 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
37962 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
37963 MaskVT == MVT::v8f32)) {
37964 // VPERMIL2 Operation.
37965 // Bits[3] - Match Bit.
37966 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
37967 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
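    // Zeroable elements are encoded below with selector index 8 (match bit set),
    // so the M2ZImm value of 2 makes the instruction write zero for them.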
37968 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
37969 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
37970 SmallVector<int, 8> VPerm2Idx;
37971 unsigned M2ZImm = 0;
37972 for (int M : Mask) {
37973 if (M == SM_SentinelUndef) {
37974 VPerm2Idx.push_back(-1);
37975 continue;
37976 }
37977 if (M == SM_SentinelZero) {
37978 M2ZImm = 2;
37979 VPerm2Idx.push_back(8);
37980 continue;
37981 }
37982 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
37983 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
37984 VPerm2Idx.push_back(Index);
37985 }
37986 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37987 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37988 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
37989 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
37990 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
37991 return DAG.getBitcast(RootVT, Res);
37992 }
37993
37994 // If we have 3 or more shuffle instructions or a chain involving a variable
37995 // mask, we can replace them with a single PSHUFB instruction profitably.
37996   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
37997 // instructions, but in practice PSHUFB tends to be *very* fast so we're
37998 // more aggressive.
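  // Each PSHUFB mask byte indexes a byte within the same 128-bit lane of the
  // source, and setting its top bit (0x80) zeroes that destination byte.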
37999 if (UnaryShuffle && AllowVariablePerLaneMask &&
38000 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38001 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38002 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38003 SmallVector<SDValue, 16> PSHUFBMask;
38004 int NumBytes = RootVT.getSizeInBits() / 8;
38005 int Ratio = NumBytes / NumMaskElts;
38006 for (int i = 0; i < NumBytes; ++i) {
38007 int M = Mask[i / Ratio];
38008 if (M == SM_SentinelUndef) {
38009 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38010 continue;
38011 }
38012 if (M == SM_SentinelZero) {
38013 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38014 continue;
38015 }
38016 M = Ratio * M + i % Ratio;
38017       assert((M / 16) == (i / 16) && "Lane crossing detected");
38018 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38019 }
38020 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38021 Res = CanonicalizeShuffleInput(ByteVT, V1);
38022 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38023 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38024 return DAG.getBitcast(RootVT, Res);
38025 }
38026
38027 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38028 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38029 // slower than PSHUFB on targets that support both.
38030 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38031 Subtarget.hasXOP()) {
38032 // VPPERM Mask Operation
38033 // Bits[4:0] - Byte Index (0 - 31)
38034 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
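    // Indices 0-15 select bytes of the first source and 16-31 bytes of the
    // second; the 0x80 encoding used for zeroable elements selects the ZERO op.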
38035 SmallVector<SDValue, 16> VPPERMMask;
38036 int NumBytes = 16;
38037 int Ratio = NumBytes / NumMaskElts;
38038 for (int i = 0; i < NumBytes; ++i) {
38039 int M = Mask[i / Ratio];
38040 if (M == SM_SentinelUndef) {
38041 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38042 continue;
38043 }
38044 if (M == SM_SentinelZero) {
38045 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38046 continue;
38047 }
38048 M = Ratio * M + i % Ratio;
38049 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38050 }
38051 MVT ByteVT = MVT::v16i8;
38052 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38053 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38054 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38055 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38056 return DAG.getBitcast(RootVT, Res);
38057 }
38058
38059 // If that failed and either input is extracted then try to combine as a
38060 // shuffle with the larger type.
38061 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38062 Inputs, Root, BaseMask, Depth, HasVariableMask,
38063 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38064 return WideShuffle;
38065
38066 // If we have a dual input shuffle then lower to VPERMV3,
38067 // (non-VLX will pad to 512-bit shuffles)
38068 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38069 ((Subtarget.hasAVX512() &&
38070 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38071 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38072 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38073 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38074 MaskVT == MVT::v16i32)) ||
38075 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38076 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38077 MaskVT == MVT::v32i16)) ||
38078 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38079 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38080 MaskVT == MVT::v64i8)))) {
38081 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38082 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38083 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38084 return DAG.getBitcast(RootVT, Res);
38085 }
38086
38087 // Failed to find any combines.
38088 return SDValue();
38089}
38090
38091// Combine an arbitrary chain of shuffles + extract_subvectors into a single
38092// instruction if possible.
38093//
38094// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38095// type size to attempt to combine:
38096// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38097// -->
38098// extract_subvector(shuffle(x,y,m2),0)
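// e.g. a 128-bit shuffle of two subvectors extracted from wider sources is
// rewritten as a shuffle of the wide sources followed by an extract of
// subvector 0 of the result.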
38099static SDValue combineX86ShuffleChainWithExtract(
38100 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38101 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38102 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38103 const X86Subtarget &Subtarget) {
38104 unsigned NumMaskElts = BaseMask.size();
38105 unsigned NumInputs = Inputs.size();
38106 if (NumInputs == 0)
38107 return SDValue();
38108
38109 EVT RootVT = Root.getValueType();
38110 unsigned RootSizeInBits = RootVT.getSizeInBits();
38111   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38112
38113 // Bail if we have any smaller inputs.
38114 if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
38115 return Input.getValueSizeInBits() < RootSizeInBits;
38116 }))
38117 return SDValue();
38118
38119 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38120 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
38121
38122 // Peek through subvectors.
38123 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
38124 unsigned WideSizeInBits = RootSizeInBits;
38125 for (unsigned i = 0; i != NumInputs; ++i) {
38126 SDValue &Src = WideInputs[i];
38127 unsigned &Offset = Offsets[i];
38128 Src = peekThroughBitcasts(Src);
38129 EVT BaseVT = Src.getValueType();
38130 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
38131 Offset += Src.getConstantOperandVal(1);
38132 Src = Src.getOperand(0);
38133 }
38134 WideSizeInBits = std::max(WideSizeInBits,
38135 (unsigned)Src.getValueSizeInBits());
38136     assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
38137            "Unexpected subvector extraction");
38138 Offset /= BaseVT.getVectorNumElements();
38139 Offset *= NumMaskElts;
38140 }
38141
38142   // Bail if we're always extracting from the lowest subvectors;
38143 // combineX86ShuffleChain should match this for the current width.
38144 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
38145 return SDValue();
38146
38147 unsigned Scale = WideSizeInBits / RootSizeInBits;
38148   assert((WideSizeInBits % RootSizeInBits) == 0 &&
38149          "Unexpected subvector extraction");
38150
38151 // If the src vector types aren't the same, see if we can extend
38152 // them to match each other.
38153 // TODO: Support different scalar types?
38154 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
38155 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
38156 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
38157 Op.getValueType().getScalarType() != WideSVT;
38158 }))
38159 return SDValue();
38160
38161 // Create new mask for larger type.
38162 for (unsigned i = 1; i != NumInputs; ++i)
38163 Offsets[i] += i * Scale * NumMaskElts;
38164
38165 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
38166 for (int &M : WideMask) {
38167 if (M < 0)
38168 continue;
38169 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
38170 }
38171 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38172
38173 // Remove unused/repeated shuffle source ops.
38174 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38175   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38176
38177 if (WideInputs.size() > 2)
38178 return SDValue();
38179
38180 // Increase depth for every upper subvector we've peeked through.
38181 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
38182
38183 // Attempt to combine wider chain.
38184 // TODO: Can we use a better Root?
38185 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38186 WideInputs.back().getValueSizeInBits()
38187 ? WideInputs.front()
38188 : WideInputs.back();
38189 if (SDValue WideShuffle =
38190 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
38191 HasVariableMask, AllowVariableCrossLaneMask,
38192 AllowVariablePerLaneMask, DAG, Subtarget)) {
38193 WideShuffle =
38194 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
38195 return DAG.getBitcast(RootVT, WideShuffle);
38196 }
38197 return SDValue();
38198}
38199
38200// Canonicalize the combined shuffle mask chain with horizontal ops.
38201// NOTE: This may update the Ops and Mask.
38202static SDValue canonicalizeShuffleMaskWithHorizOp(
38203 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
38204 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
38205 const X86Subtarget &Subtarget) {
38206 if (Mask.empty() || Ops.empty())
38207 return SDValue();
38208
38209 SmallVector<SDValue> BC;
38210 for (SDValue Op : Ops)
38211 BC.push_back(peekThroughBitcasts(Op));
38212
38213 // All ops must be the same horizop + type.
38214 SDValue BC0 = BC[0];
38215 EVT VT0 = BC0.getValueType();
38216 unsigned Opcode0 = BC0.getOpcode();
38217 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
38218 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
38219 }))
38220 return SDValue();
38221
38222 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
38223 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
38224 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
38225 if (!isHoriz && !isPack)
38226 return SDValue();
38227
38228 // Do all ops have a single use?
38229 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
38230 return Op.hasOneUse() &&
38231 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
38232 });
38233
38234 int NumElts = VT0.getVectorNumElements();
38235 int NumLanes = VT0.getSizeInBits() / 128;
38236 int NumEltsPerLane = NumElts / NumLanes;
38237 int NumHalfEltsPerLane = NumEltsPerLane / 2;
38238 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
38239 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38240
38241 if (NumEltsPerLane >= 4 &&
38242 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
38243 SmallVector<int> LaneMask, ScaledMask;
38244 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
38245 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
38246       // See if we can remove the shuffle by re-sorting the HOP chain so that
38247 // the HOP args are pre-shuffled.
38248 // TODO: Generalize to any sized/depth chain.
38249 // TODO: Add support for PACKSS/PACKUS.
38250 if (isHoriz) {
38251 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
38252 auto GetHOpSrc = [&](int M) {
38253 if (M == SM_SentinelUndef)
38254 return DAG.getUNDEF(VT0);
38255 if (M == SM_SentinelZero)
38256 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
38257 SDValue Src0 = BC[M / 4];
38258 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
38259 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
38260 return Src1.getOperand(M % 2);
38261 return SDValue();
38262 };
38263 SDValue M0 = GetHOpSrc(ScaledMask[0]);
38264 SDValue M1 = GetHOpSrc(ScaledMask[1]);
38265 SDValue M2 = GetHOpSrc(ScaledMask[2]);
38266 SDValue M3 = GetHOpSrc(ScaledMask[3]);
38267 if (M0 && M1 && M2 && M3) {
38268 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
38269 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
38270 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38271 }
38272 }
38273 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
38274 if (Ops.size() >= 2) {
38275 SDValue LHS, RHS;
38276 auto GetHOpSrc = [&](int M, int &OutM) {
38277 // TODO: Support SM_SentinelZero
38278 if (M < 0)
38279 return M == SM_SentinelUndef;
38280 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
38281 if (!LHS || LHS == Src) {
38282 LHS = Src;
38283 OutM = (M % 2);
38284 return true;
38285 }
38286 if (!RHS || RHS == Src) {
38287 RHS = Src;
38288 OutM = (M % 2) + 2;
38289 return true;
38290 }
38291 return false;
38292 };
38293 int PostMask[4] = {-1, -1, -1, -1};
38294 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
38295 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
38296 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
38297 GetHOpSrc(ScaledMask[3], PostMask[3])) {
38298 LHS = DAG.getBitcast(SrcVT, LHS);
38299 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
38300 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38301           // Use SHUFPS for the permute so this will work on SSE3 targets;
38302 // shuffle combining and domain handling will simplify this later on.
38303 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
38304 Res = DAG.getBitcast(ShuffleVT, Res);
38305 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
38306 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
38307 }
38308 }
38309 }
38310 }
38311
38312 if (2 < Ops.size())
38313 return SDValue();
38314
38315 SDValue BC1 = BC[BC.size() - 1];
38316 if (Mask.size() == VT0.getVectorNumElements()) {
38317 // Canonicalize binary shuffles of horizontal ops that use the
38318     // same sources to a unary shuffle.
38319 // TODO: Try to perform this fold even if the shuffle remains.
38320 if (Ops.size() == 2) {
38321 auto ContainsOps = [](SDValue HOp, SDValue Op) {
38322 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
38323 };
38324 // Commute if all BC0's ops are contained in BC1.
38325 if (ContainsOps(BC1, BC0.getOperand(0)) &&
38326 ContainsOps(BC1, BC0.getOperand(1))) {
38327 ShuffleVectorSDNode::commuteMask(Mask);
38328 std::swap(Ops[0], Ops[1]);
38329 std::swap(BC0, BC1);
38330 }
38331
38332 // If BC1 can be represented by BC0, then convert to unary shuffle.
38333 if (ContainsOps(BC0, BC1.getOperand(0)) &&
38334 ContainsOps(BC0, BC1.getOperand(1))) {
38335 for (int &M : Mask) {
38336 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
38337 continue;
38338 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
38339 M -= NumElts + (SubLane * NumHalfEltsPerLane);
38340 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
38341 M += NumHalfEltsPerLane;
38342 }
38343 }
38344 }
38345
38346 // Canonicalize unary horizontal ops to only refer to lower halves.
38347 for (int i = 0; i != NumElts; ++i) {
38348 int &M = Mask[i];
38349 if (isUndefOrZero(M))
38350 continue;
38351 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
38352 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38353 M -= NumHalfEltsPerLane;
38354 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
38355 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38356 M -= NumHalfEltsPerLane;
38357 }
38358 }
38359
38360 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
38361 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
38362 // represents the LHS/RHS inputs for the lower/upper halves.
38363 SmallVector<int, 16> TargetMask128, WideMask128;
38364 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
38365 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
38366 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
38367 bool SingleOp = (Ops.size() == 1);
38368 if (isPack || OneUseOps ||
38369 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
38370 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
38371 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
38372 Lo = Lo.getOperand(WideMask128[0] & 1);
38373 Hi = Hi.getOperand(WideMask128[1] & 1);
38374 if (SingleOp) {
38375 SDValue Undef = DAG.getUNDEF(SrcVT);
38376 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
38377 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
38378 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
38379 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
38380 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
38381 }
38382 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
38383 }
38384 }
38385
38386 return SDValue();
38387}
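
To make the lane reasoning above easier to follow, here is a minimal standalone sketch of the 128-bit HADDPS semantics that the horizontal-op canonicalization relies on (assuming the documented SSE3 behaviour; this helper is an illustration, not code from this file): the pairwise sums of the first operand fill the low half of the result and the pairwise sums of the second operand fill the high half.

#include <array>
#include <cassert>

// Pairwise sums of X land in the low half, pairwise sums of Y in the high half.
static std::array<float, 4> haddps(const std::array<float, 4> &X,
                                   const std::array<float, 4> &Y) {
  return {X[0] + X[1], X[2] + X[3], Y[0] + Y[1], Y[2] + Y[3]};
}

int main() {
  std::array<float, 4> X{1, 2, 3, 4}, Y{10, 20, 30, 40};
  std::array<float, 4> R = haddps(X, Y);
  assert(R[0] == 3 && R[1] == 7 && R[2] == 30 && R[3] == 70);
  return 0;
}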
38388
38389// Attempt to constant fold all of the constant source ops.
38390// Returns true if the entire shuffle is folded to a constant.
38391// TODO: Extend this to merge multiple constant Ops and update the mask.
38392static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
38393 ArrayRef<int> Mask, SDValue Root,
38394 bool HasVariableMask,
38395 SelectionDAG &DAG,
38396 const X86Subtarget &Subtarget) {
38397 MVT VT = Root.getSimpleValueType();
38398
38399 unsigned SizeInBits = VT.getSizeInBits();
38400 unsigned NumMaskElts = Mask.size();
38401 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
38402 unsigned NumOps = Ops.size();
38403
38404 // Extract constant bits from each source op.
38405 bool OneUseConstantOp = false;
38406 SmallVector<APInt, 16> UndefEltsOps(NumOps);
38407 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
38408 for (unsigned i = 0; i != NumOps; ++i) {
38409 SDValue SrcOp = Ops[i];
38410 OneUseConstantOp |= SrcOp.hasOneUse();
38411 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
38412 RawBitsOps[i]))
38413 return SDValue();
38414 }
38415
38416 // If we're optimizing for size, only fold if at least one of the constants is
38417 // only used once or the combined shuffle has included a variable mask
38418 // shuffle; this is to avoid constant pool bloat.
38419 bool IsOptimizingSize = DAG.shouldOptForSize();
38420 if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
38421 return SDValue();
38422
38423 // Shuffle the constant bits according to the mask.
38424 SDLoc DL(Root);
38425 APInt UndefElts(NumMaskElts, 0);
38426 APInt ZeroElts(NumMaskElts, 0);
38427 APInt ConstantElts(NumMaskElts, 0);
38428 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
38429 APInt::getZero(MaskSizeInBits));
38430 for (unsigned i = 0; i != NumMaskElts; ++i) {
38431 int M = Mask[i];
38432 if (M == SM_SentinelUndef) {
38433 UndefElts.setBit(i);
38434 continue;
38435 } else if (M == SM_SentinelZero) {
38436 ZeroElts.setBit(i);
38437 continue;
38438 }
38439 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
38440
38441 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
38442 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
38443
38444 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
38445 if (SrcUndefElts[SrcMaskIdx]) {
38446 UndefElts.setBit(i);
38447 continue;
38448 }
38449
38450 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
38451 APInt &Bits = SrcEltBits[SrcMaskIdx];
38452 if (!Bits) {
38453 ZeroElts.setBit(i);
38454 continue;
38455 }
38456
38457 ConstantElts.setBit(i);
38458 ConstantBitData[i] = Bits;
38459 }
38460 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
38461
38462 // Attempt to create a zero vector.
38463 if ((UndefElts | ZeroElts).isAllOnes())
38464 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
38465
38466 // Create the constant data.
38467 MVT MaskSVT;
38468 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
38469 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
38470 else
38471 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
38472
38473 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
38474 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
38475 return SDValue();
38476
38477 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
38478 return DAG.getBitcast(VT, CstOp);
38479}
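
The loop above classifies each output lane as undef, zero, or a constant pulled from one of the source ops. The following standalone sketch mirrors that classification with plain containers; the sentinel values -1 (undef) and -2 (zero), the helper name, and the use of std::optional are illustrative assumptions, not the LLVM APInt-based implementation.

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

// Fold a shuffle of constant vectors: -1 marks an undef lane, -2 a zero lane,
// otherwise M selects element (M % NumElts) of source op (M / NumElts).
static std::vector<std::optional<uint64_t>>
foldShuffleOfConstants(const std::vector<std::vector<uint64_t>> &Ops,
                       const std::vector<int> &Mask) {
  const int NumElts = static_cast<int>(Mask.size());
  std::vector<std::optional<uint64_t>> Result(Mask.size());
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == -1)
      continue;            // undef lane stays unset
    if (M == -2) {
      Result[i] = 0;       // known-zero lane
      continue;
    }
    Result[i] = Ops[M / NumElts][M % NumElts];
  }
  return Result;
}

int main() {
  std::vector<std::vector<uint64_t>> Ops = {{1, 2, 3, 4}, {5, 6, 7, 8}};
  auto R = foldShuffleOfConstants(Ops, {4, -2, 0, -1});
  assert(R[0] == 5 && R[1] == 0 && R[2] == 1 && !R[3].has_value());
  return 0;
}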
38480
38481namespace llvm {
38482 namespace X86 {
38483 enum {
38484 MaxShuffleCombineDepth = 8
38485 };
38486 }
38487} // namespace llvm
38488
38489/// Fully generic combining of x86 shuffle instructions.
38490///
38491/// This should be the last combine run over the x86 shuffle instructions. Once
38492/// they have been fully optimized, this will recursively consider all chains
38493/// of single-use shuffle instructions, build a generic model of the cumulative
38494/// shuffle operation, and check for simpler instructions which implement this
38495/// operation. We use this primarily for two purposes:
38496///
38497/// 1) Collapse generic shuffles to specialized single instructions when
38498/// equivalent. In most cases, this is just an encoding size win, but
38499/// sometimes we will collapse multiple generic shuffles into a single
38500/// special-purpose shuffle.
38501/// 2) Look for sequences of shuffle instructions with 3 or more total
38502/// instructions, and replace them with the slightly more expensive SSSE3
38503/// PSHUFB instruction if available. We do this as the last combining step
38504/// to ensure we avoid using PSHUFB if we can implement the shuffle with
38505/// a suitable short sequence of other instructions. The PSHUFB will either
38506/// use a register or have to read from memory and so is slightly (but only
38507/// slightly) more expensive than the other shuffle instructions.
38508///
38509/// Because this is inherently a quadratic operation (for each shuffle in
38510/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
38511/// This should never be an issue in practice as the shuffle lowering doesn't
38512/// produce sequences of more than 8 instructions.
38513///
38514/// FIXME: We will currently miss some cases where the redundant shuffling
38515/// would simplify under the threshold for PSHUFB formation because of
38516/// combine-ordering. To fix this, we should do the redundant instruction
38517/// combining in this recursive walk.
38518static SDValue combineX86ShufflesRecursively(
38519 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
38520 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
38521 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
38522 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38523 const X86Subtarget &Subtarget) {
38524 assert(RootMask.size() > 0 &&
38525 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
38526 "Illegal shuffle root mask");
38527 MVT RootVT = Root.getSimpleValueType();
38528 assert(RootVT.isVector() && "Shuffles operate on vector types!");
38529 unsigned RootSizeInBits = RootVT.getSizeInBits();
38530
38531 // Bound the depth of our recursive combine because this is ultimately
38532 // quadratic in nature.
38533 if (Depth >= MaxDepth)
38534 return SDValue();
38535
38536 // Directly rip through bitcasts to find the underlying operand.
38537 SDValue Op = SrcOps[SrcOpIndex];
38538 Op = peekThroughOneUseBitcasts(Op);
38539
38540 EVT VT = Op.getValueType();
38541 if (!VT.isVector() || !VT.isSimple())
38542 return SDValue(); // Bail if we hit a non-simple non-vector.
38543
38544 // FIXME: Just bail on f16 for now.
38545 if (VT.getVectorElementType() == MVT::f16)
38546 return SDValue();
38547
38548 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
38549 "Can only combine shuffles upto size of the root op.");
38550
38551 // Extract target shuffle mask and resolve sentinels and inputs.
38552 // TODO - determine Op's demanded elts from RootMask.
38553 SmallVector<int, 64> OpMask;
38554 SmallVector<SDValue, 2> OpInputs;
38555 APInt OpUndef, OpZero;
38556 APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
38557 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
38558 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
38559 OpZero, DAG, Depth, false)) {
38560 // Shuffle inputs must not be larger than the shuffle result.
38561 // TODO: Relax this for single input faux shuffles (e.g. trunc).
38562 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
38563 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
38564 }))
38565 return SDValue();
38566 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38567 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
38568 !isNullConstant(Op.getOperand(1))) {
38569 SDValue SrcVec = Op.getOperand(0);
38570 int ExtractIdx = Op.getConstantOperandVal(1);
38571 unsigned NumElts = VT.getVectorNumElements();
38572 OpInputs.assign({SrcVec});
38573 OpMask.assign(NumElts, SM_SentinelUndef);
38574 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
38575 OpZero = OpUndef = APInt::getNullValue(NumElts);
38576 } else {
38577 return SDValue();
38578 }
38579
38580 // If the shuffle result was smaller than the root, we need to adjust the
38581 // mask indices and pad the mask with undefs.
38582 if (RootSizeInBits > VT.getSizeInBits()) {
38583 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
38584 unsigned OpMaskSize = OpMask.size();
38585 if (OpInputs.size() > 1) {
38586 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
38587 for (int &M : OpMask) {
38588 if (M < 0)
38589 continue;
38590 int EltIdx = M % OpMaskSize;
38591 int OpIdx = M / OpMaskSize;
38592 M = (PaddedMaskSize * OpIdx) + EltIdx;
38593 }
38594 }
38595 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
38596 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
38597 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
38598 }
38599
38600 SmallVector<int, 64> Mask;
38601 SmallVector<SDValue, 16> Ops;
38602
38603 // We don't need to merge masks if the root is empty.
38604 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
38605 if (EmptyRoot) {
38606 // Only resolve zeros if it will remove an input, otherwise we might end
38607 // up in an infinite loop.
38608 bool ResolveKnownZeros = true;
38609 if (!OpZero.isZero()) {
38610 APInt UsedInputs = APInt::getZero(OpInputs.size());
38611 for (int i = 0, e = OpMask.size(); i != e; ++i) {
38612 int M = OpMask[i];
38613 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
38614 continue;
38615 UsedInputs.setBit(M / OpMask.size());
38616 if (UsedInputs.isAllOnes()) {
38617 ResolveKnownZeros = false;
38618 break;
38619 }
38620 }
38621 }
38622 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
38623 ResolveKnownZeros);
38624
38625 Mask = OpMask;
38626 Ops.append(OpInputs.begin(), OpInputs.end());
38627 } else {
38628 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
38629
38630 // Add the inputs to the Ops list, avoiding duplicates.
38631 Ops.append(SrcOps.begin(), SrcOps.end());
38632
38633 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
38634 // Attempt to find an existing match.
38635 SDValue InputBC = peekThroughBitcasts(Input);
38636 for (int i = 0, e = Ops.size(); i < e; ++i)
38637 if (InputBC == peekThroughBitcasts(Ops[i]))
38638 return i;
38639 // Match failed - should we replace an existing Op?
38640 if (InsertionPoint >= 0) {
38641 Ops[InsertionPoint] = Input;
38642 return InsertionPoint;
38643 }
38644 // Add to the end of the Ops list.
38645 Ops.push_back(Input);
38646 return Ops.size() - 1;
38647 };
38648
38649 SmallVector<int, 2> OpInputIdx;
38650 for (SDValue OpInput : OpInputs)
38651 OpInputIdx.push_back(
38652 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
38653
38654 assert(((RootMask.size() > OpMask.size() &&
38655 RootMask.size() % OpMask.size() == 0) ||
38656 (OpMask.size() > RootMask.size() &&
38657 OpMask.size() % RootMask.size() == 0) ||
38658 OpMask.size() == RootMask.size()) &&
38659 "The smaller number of elements must divide the larger.");
38660
38661 // This function can be performance-critical, so we rely on the power-of-2
38662 // knowledge that we have about the mask sizes to replace div/rem ops with
38663 // bit-masks and shifts.
38664 assert(isPowerOf2_32(RootMask.size()) &&
38665 "Non-power-of-2 shuffle mask sizes");
38666 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
38667 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
38668 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
38669
38670 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
38671 unsigned RootRatio =
38672 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
38673 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
38674 assert((RootRatio == 1 || OpRatio == 1) &&
38675 "Must not have a ratio for both incoming and op masks!");
38676
38677 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
38678 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
38679 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
38680 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
38681 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
38682
38683 Mask.resize(MaskWidth, SM_SentinelUndef);
38684
38685 // Merge this shuffle operation's mask into our accumulated mask. Note that
38686 // this shuffle's mask will be the first applied to the input, followed by
38687 // the root mask to get us all the way to the root value arrangement. The
38688 // reason for this order is that we are recursing up the operation chain.
38689 for (unsigned i = 0; i < MaskWidth; ++i) {
38690 unsigned RootIdx = i >> RootRatioLog2;
38691 if (RootMask[RootIdx] < 0) {
38692 // This is a zero or undef lane, we're done.
38693 Mask[i] = RootMask[RootIdx];
38694 continue;
38695 }
38696
38697 unsigned RootMaskedIdx =
38698 RootRatio == 1
38699 ? RootMask[RootIdx]
38700 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
38701
38702 // Just insert the scaled root mask value if it references an input other
38703 // than the SrcOp we're currently inserting.
38704 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
38705 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
38706 Mask[i] = RootMaskedIdx;
38707 continue;
38708 }
38709
38710 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
38711 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
38712 if (OpMask[OpIdx] < 0) {
38713 // The incoming lanes are zero or undef, it doesn't matter which ones we
38714 // are using.
38715 Mask[i] = OpMask[OpIdx];
38716 continue;
38717 }
38718
38719 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
38720 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
38721 : (OpMask[OpIdx] << OpRatioLog2) +
38722 (RootMaskedIdx & (OpRatio - 1));
38723
38724 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
38725 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
38726 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
38727 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
38728
38729 Mask[i] = OpMaskedIdx;
38730 }
38731 }
38732
38733 // Remove unused/repeated shuffle source ops.
38734 resolveTargetShuffleInputsAndMask(Ops, Mask);
38735
38736 // Handle the all undef/zero/ones cases early.
38737 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
38738 return DAG.getUNDEF(RootVT);
38739 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
38740 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
38741 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
38742 none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
38743 return getOnesVector(RootVT, DAG, SDLoc(Root));
38744
38745 assert(!Ops.empty() && "Shuffle with no inputs detected");
38746 HasVariableMask |= IsOpVariableMask;
38747
38748 // Update the list of shuffle nodes that have been combined so far.
38749 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
38750 SrcNodes.end());
38751 CombinedNodes.push_back(Op.getNode());
38752
38753 // See if we can recurse into each shuffle source op (if it's a target
38754 // shuffle). The source op should only be generally combined if it either has
38755 // a single use (i.e. current Op) or all its users have already been combined;
38756 // if not, we can still combine but should prevent generation of variable
38757 // shuffles to avoid constant pool bloat.
38758 // Don't recurse if we already have more source ops than we can combine in
38759 // the remaining recursion depth.
38760 if (Ops.size() < (MaxDepth - Depth)) {
38761 for (int i = 0, e = Ops.size(); i < e; ++i) {
38762 // For empty roots, we need to resolve zeroable elements before combining
38763 // them with other shuffles.
38764 SmallVector<int, 64> ResolvedMask = Mask;
38765 if (EmptyRoot)
38766 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
38767 bool AllowCrossLaneVar = false;
38768 bool AllowPerLaneVar = false;
38769 if (Ops[i].getNode()->hasOneUse() ||
38770 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
38771 AllowCrossLaneVar = AllowVariableCrossLaneMask;
38772 AllowPerLaneVar = AllowVariablePerLaneMask;
38773 }
38774 if (SDValue Res = combineX86ShufflesRecursively(
38775 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
38776 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
38777 Subtarget))
38778 return Res;
38779 }
38780 }
38781
38782 // Attempt to constant fold all of the constant source ops.
38783 if (SDValue Cst = combineX86ShufflesConstants(
38784 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
38785 return Cst;
38786
38787 // If constant fold failed and we only have constants - then we have
38788 // multiple uses by a single non-variable shuffle - just bail.
38789 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
38790 APInt UndefElts;
38791 SmallVector<APInt> RawBits;
38792 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38793 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
38794 RawBits);
38795 })) {
38796 return SDValue();
38797 }
38798
38799 // Canonicalize the combined shuffle mask chain with horizontal ops.
38800 // NOTE: This will update the Ops and Mask.
38801 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
38802 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
38803 return DAG.getBitcast(RootVT, HOp);
38804
38805 // Try to refine our inputs given our knowledge of target shuffle mask.
38806 for (auto I : enumerate(Ops)) {
38807 int OpIdx = I.index();
38808 SDValue &Op = I.value();
38809
38810 // What range of shuffle mask element values results in picking from Op?
38811 int Lo = OpIdx * Mask.size();
38812 int Hi = Lo + Mask.size();
38813
38814 // Which elements of Op do we demand, given the mask's granularity?
38815 APInt OpDemandedElts(Mask.size(), 0);
38816 for (int MaskElt : Mask) {
38817 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
38818 int OpEltIdx = MaskElt - Lo;
38819 OpDemandedElts.setBit(OpEltIdx);
38820 }
38821 }
38822
38823 // Is the shuffle result smaller than the root?
38824 if (Op.getValueSizeInBits() < RootSizeInBits) {
38825 // We padded the mask with undefs. But we now need to undo that.
38826 unsigned NumExpectedVectorElts = Mask.size();
38827 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
38828 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
38829 assert(!OpDemandedElts.extractBits(
38830 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
38831 "Demanding the virtual undef widening padding?");
38832 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
38833 }
38834
38835 // The Op itself may be of different VT, so we need to scale the mask.
38836 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
38837 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
38838
38839 // Can this operand be simplified any further, given its demanded elements?
38840 if (SDValue NewOp =
38841 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
38842 Op, OpScaledDemandedElts, DAG))
38843 Op = NewOp;
38844 }
38845 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
38846
38847 // Widen any subvector shuffle inputs we've collected.
38848 // TODO: Remove this to avoid generating temporary nodes, we should only
38849 // widen once combineX86ShuffleChain has found a match.
38850 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
38851 return Op.getValueSizeInBits() < RootSizeInBits;
38852 })) {
38853 for (SDValue &Op : Ops)
38854 if (Op.getValueSizeInBits() < RootSizeInBits)
38855 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
38856 RootSizeInBits);
38857 // Reresolve - we might have repeated subvector sources.
38858 resolveTargetShuffleInputsAndMask(Ops, Mask);
38859 }
38860
38861 // We can only combine unary and binary shuffle mask cases.
38862 if (Ops.size() <= 2) {
38863 // Minor canonicalization of the accumulated shuffle mask to make it easier
38864 // to match below. All this does is detect masks with sequential pairs of
38865 // elements, and shrink them to the half-width mask. It does this in a loop
38866 // so it will reduce the size of the mask to the minimal width mask which
38867 // performs an equivalent shuffle.
38868 while (Mask.size() > 1) {
38869 SmallVector<int, 64> WidenedMask;
38870 if (!canWidenShuffleElements(Mask, WidenedMask))
38871 break;
38872 Mask = std::move(WidenedMask);
38873 }
38874
38875 // Canonicalization of binary shuffle masks to improve pattern matching by
38876 // commuting the inputs.
38877 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
38878 ShuffleVectorSDNode::commuteMask(Mask);
38879 std::swap(Ops[0], Ops[1]);
38880 }
38881
38882 // Finally, try to combine into a single shuffle instruction.
38883 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
38884 AllowVariableCrossLaneMask,
38885 AllowVariablePerLaneMask, DAG, Subtarget);
38886 }
38887
38888 // If that failed and any input is extracted then try to combine as a
38889 // shuffle with the larger type.
38890 return combineX86ShuffleChainWithExtract(
38891 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
38892 AllowVariablePerLaneMask, DAG, Subtarget);
38893}
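
For the simplest case handled above, a single shuffle input and equal mask widths, merging the op mask into the root mask reduces to composing the two index vectors while propagating the negative undef/zero sentinels. The standalone sketch below shows just that composition (an illustration, not the LLVM code); the real loop additionally rescales between different mask widths using the power-of-two shift-and-mask tricks noted in the comments.

#include <cassert>
#include <vector>

// Compose two masks: the op mask is applied to the input first, then the root
// mask on top of it, so Combined[i] = OpMask[RootMask[i]] for in-range entries.
static std::vector<int> composeMasks(const std::vector<int> &RootMask,
                                     const std::vector<int> &OpMask) {
  std::vector<int> Combined(RootMask.size());
  for (size_t i = 0; i != RootMask.size(); ++i) {
    int R = RootMask[i];
    Combined[i] = (R < 0) ? R : OpMask[R]; // negative values are sentinels
  }
  return Combined;
}

int main() {
  // The root picks {2,3,undef,0}; the op below it picks {1,0,3,2}.
  std::vector<int> Combined = composeMasks({2, 3, -1, 0}, {1, 0, 3, 2});
  assert((Combined == std::vector<int>{3, 2, -1, 1}));
  return 0;
}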
38894
38895/// Helper entry wrapper to combineX86ShufflesRecursively.
38896static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
38897 const X86Subtarget &Subtarget) {
38898 return combineX86ShufflesRecursively(
38899 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
38900 /*HasVarMask*/ false,
38901 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
38902 Subtarget);
38903}
38904
38905/// Get the PSHUF-style mask from PSHUF node.
38906///
38907/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
38908/// PSHUF-style masks that can be reused with such instructions.
38909static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
38910 MVT VT = N.getSimpleValueType();
38911 SmallVector<int, 4> Mask;
38912 SmallVector<SDValue, 2> Ops;
38913 bool HaveMask =
38914 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
38915 (void)HaveMask;
38916 assert(HaveMask);
38917
38918 // If we have more than 128-bits, only the low 128-bits of shuffle mask
38919 // matter. Check that the upper masks are repeats and remove them.
38920 if (VT.getSizeInBits() > 128) {
38921 int LaneElts = 128 / VT.getScalarSizeInBits();
38922#ifndef NDEBUG
38923 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
38924 for (int j = 0; j < LaneElts; ++j)
38925 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
38926 "Mask doesn't repeat in high 128-bit lanes!");
38927#endif
38928 Mask.resize(LaneElts);
38929 }
38930
38931 switch (N.getOpcode()) {
38932 case X86ISD::PSHUFD:
38933 return Mask;
38934 case X86ISD::PSHUFLW:
38935 Mask.resize(4);
38936 return Mask;
38937 case X86ISD::PSHUFHW:
38938 Mask.erase(Mask.begin(), Mask.begin() + 4);
38939 for (int &M : Mask)
38940 M -= 4;
38941 return Mask;
38942 default:
38943 llvm_unreachable("No valid shuffle instruction found!");
38944 }
38945}
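
getPSHUFShuffleMask and getV4X86ShuffleImm8ForMask move between a four-element mask and an 8-bit immediate. Assuming the standard PSHUFD immediate encoding (two bits per destination element, lowest bits first), the round-trip looks like the standalone sketch below; the helper names here are illustrative, not LLVM APIs.

#include <array>
#include <cassert>
#include <cstdint>

static std::array<int, 4> decodePSHUFImm(uint8_t Imm) {
  // Element i of the result comes from source element (Imm >> (2*i)) & 3.
  return {Imm & 3, (Imm >> 2) & 3, (Imm >> 4) & 3, (Imm >> 6) & 3};
}

static uint8_t encodePSHUFImm(const std::array<int, 4> &Mask) {
  return static_cast<uint8_t>(Mask[0] | (Mask[1] << 2) | (Mask[2] << 4) |
                              (Mask[3] << 6));
}

int main() {
  std::array<int, 4> Mask = {3, 1, 2, 0};
  assert(encodePSHUFImm(Mask) == 0x27); // selects elements 3,1,2,0
  assert(decodePSHUFImm(encodePSHUFImm(Mask)) == Mask);
  return 0;
}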
38946
38947/// Search for a combinable shuffle across a chain ending in pshufd.
38948///
38949/// We walk up the chain and look for a combinable shuffle, skipping over
38950/// shuffles that we could hoist this shuffle's transformation past without
38951/// altering anything.
38952static SDValue
38953combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
38954 SelectionDAG &DAG) {
38955 assert(N.getOpcode() == X86ISD::PSHUFD &&
38956 "Called with something other than an x86 128-bit half shuffle!");
38957 SDLoc DL(N);
38958
38959 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
38960 // of the shuffles in the chain so that we can form a fresh chain to replace
38961 // this one.
38962 SmallVector<SDValue, 8> Chain;
38963 SDValue V = N.getOperand(0);
38964 for (; V.hasOneUse(); V = V.getOperand(0)) {
38965 switch (V.getOpcode()) {
38966 default:
38967 return SDValue(); // Nothing combined!
38968
38969 case ISD::BITCAST:
38970 // Skip bitcasts as we always know the type for the target specific
38971 // instructions.
38972 continue;
38973
38974 case X86ISD::PSHUFD:
38975 // Found another dword shuffle.
38976 break;
38977
38978 case X86ISD::PSHUFLW:
38979 // Check that the low words (being shuffled) are the identity in the
38980 // dword shuffle, and the high words are self-contained.
38981 if (Mask[0] != 0 || Mask[1] != 1 ||
38982 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
38983 return SDValue();
38984
38985 Chain.push_back(V);
38986 continue;
38987
38988 case X86ISD::PSHUFHW:
38989 // Check that the high words (being shuffled) are the identity in the
38990 // dword shuffle, and the low words are self-contained.
38991 if (Mask[2] != 2 || Mask[3] != 3 ||
38992 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
38993 return SDValue();
38994
38995 Chain.push_back(V);
38996 continue;
38997
38998 case X86ISD::UNPCKL:
38999 case X86ISD::UNPCKH:
39000 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39001 // shuffle into a preceding word shuffle.
39002 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39003 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39004 return SDValue();
39005
39006 // Search for a half-shuffle which we can combine with.
39007 unsigned CombineOp =
39008 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39009 if (V.getOperand(0) != V.getOperand(1) ||
39010 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39011 return SDValue();
39012 Chain.push_back(V);
39013 V = V.getOperand(0);
39014 do {
39015 switch (V.getOpcode()) {
39016 default:
39017 return SDValue(); // Nothing to combine.
39018
39019 case X86ISD::PSHUFLW:
39020 case X86ISD::PSHUFHW:
39021 if (V.getOpcode() == CombineOp)
39022 break;
39023
39024 Chain.push_back(V);
39025
39026 LLVM_FALLTHROUGH;
39027 case ISD::BITCAST:
39028 V = V.getOperand(0);
39029 continue;
39030 }
39031 break;
39032 } while (V.hasOneUse());
39033 break;
39034 }
39035 // Break out of the loop if we break out of the switch.
39036 break;
39037 }
39038
39039 if (!V.hasOneUse())
39040 // We fell out of the loop without finding a viable combining instruction.
39041 return SDValue();
39042
39043 // Merge this node's mask and our incoming mask.
39044 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39045 for (int &M : Mask)
39046 M = VMask[M];
39047 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39048 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39049
39050 // Rebuild the chain around this new shuffle.
39051 while (!Chain.empty()) {
39052 SDValue W = Chain.pop_back_val();
39053
39054 if (V.getValueType() != W.getOperand(0).getValueType())
39055 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39056
39057 switch (W.getOpcode()) {
39058 default:
39059 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39060
39061 case X86ISD::UNPCKL:
39062 case X86ISD::UNPCKH:
39063 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39064 break;
39065
39066 case X86ISD::PSHUFD:
39067 case X86ISD::PSHUFLW:
39068 case X86ISD::PSHUFHW:
39069 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39070 break;
39071 }
39072 }
39073 if (V.getValueType() != N.getValueType())
39074 V = DAG.getBitcast(N.getValueType(), V);
39075
39076 // Return the new chain to replace N.
39077 return V;
39078}
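
One of the checks in the walk above decides whether the dword shuffle can be hoisted past a PSHUFLW: the low dwords must stay in place and the high dwords must stay within the high half. A tiny standalone predicate (an illustration of that condition, not LLVM code) makes the rule explicit:

#include <array>
#include <cassert>

// A PSHUFD can be hoisted past a PSHUFLW only if it leaves the low dwords in
// place and keeps the high dwords within the high half (indices 2..3).
static bool canHoistPastPSHUFLW(const std::array<int, 4> &Mask) {
  return Mask[0] == 0 && Mask[1] == 1 &&
         Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4;
}

int main() {
  assert(canHoistPastPSHUFLW({0, 1, 3, 2}));
  assert(!canHoistPastPSHUFLW({1, 0, 3, 2}));
  return 0;
}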
39079
39080// Attempt to commute shufps LHS loads:
39081// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
39082static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39083 SelectionDAG &DAG) {
39084 // TODO: Add vXf64 support.
39085 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39086 return SDValue();
39087
39088 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39089 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39090 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39091 return SDValue();
39092 SDValue N0 = V.getOperand(0);
39093 SDValue N1 = V.getOperand(1);
39094 unsigned Imm = V.getConstantOperandVal(2);
39095 const X86Subtarget &Subtarget =
39096 static_cast<const X86Subtarget &>(DAG.getSubtarget());
39097 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39098 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39099 return SDValue();
39100 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39101 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39102 DAG.getTargetConstant(Imm, DL, MVT::i8));
39103 };
39104
39105 switch (N.getOpcode()) {
39106 case X86ISD::VPERMILPI:
39107 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39108 unsigned Imm = N.getConstantOperandVal(1);
39109 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39110 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39111 }
39112 break;
39113 case X86ISD::SHUFP: {
39114 SDValue N0 = N.getOperand(0);
39115 SDValue N1 = N.getOperand(1);
39116 unsigned Imm = N.getConstantOperandVal(2);
39117 if (N0 == N1) {
39118 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
39119 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
39120 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39121 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
39122 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
39123 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
39124 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
39125 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
39126 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
39127 }
39128 break;
39129 }
39130 }
39131
39132 return SDValue();
39133}
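
When commuteSHUFP above swaps the SHUFP operands, it also swaps the low and high nibbles of the 8-bit selector, since each nibble holds the two 2-bit element selects taken from one source operand. A minimal standalone check of that rewrite (illustrative helper name, not LLVM code):

#include <cassert>
#include <cstdint>

// Swapping the SHUFP operands exchanges the two 4-bit selector groups.
static uint8_t swapShufpSelectors(uint8_t Imm) {
  return static_cast<uint8_t>(((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4));
}

int main() {
  assert(swapShufpSelectors(0xB1) == 0x1B);
  assert(swapShufpSelectors(swapShufpSelectors(0x4E)) == 0x4E); // involution
  return 0;
}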
39134
39135// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
39136static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
39137 const SDLoc &DL) {
39138 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39139 EVT ShuffleVT = N.getValueType();
39140
39141 auto IsMergeableWithShuffle = [](SDValue Op) {
39142 // AllZeros/AllOnes constants are freely shuffled and will peek through
39143 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
39144 // merge with target shuffles if it has one use so shuffle combining is
39145 // likely to kick in.
39146 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
39147 ISD::isBuildVectorAllZeros(Op.getNode()) ||
39148 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
39149 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
39150 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
39151 };
39152 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
39153 // Ensure we only shuffle whole vector src elements, unless it's a logical
39154 // binop where we can more aggressively move shuffles from dst to src.
39155 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
39156 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
39157 };
39158
39159 unsigned Opc = N.getOpcode();
39160 switch (Opc) {
39161 // Unary and Unary+Permute Shuffles.
39162 case X86ISD::PSHUFB: {
39163 // Don't merge PSHUFB if it contains zero'd elements.
39164 SmallVector<int> Mask;
39165 SmallVector<SDValue> Ops;
39166 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
39167 Mask))
39168 break;
39169 LLVM_FALLTHROUGH;
39170 }
39171 case X86ISD::VBROADCAST:
39172 case X86ISD::MOVDDUP:
39173 case X86ISD::PSHUFD:
39174 case X86ISD::PSHUFHW:
39175 case X86ISD::PSHUFLW:
39176 case X86ISD::VPERMI:
39177 case X86ISD::VPERMILPI: {
39178 if (N.getOperand(0).getValueType() == ShuffleVT &&
39179 N->isOnlyUserOf(N.getOperand(0).getNode())) {
39180 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39181 unsigned SrcOpcode = N0.getOpcode();
39182 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
39183 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39184 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39185 if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
39186 SDValue LHS, RHS;
39187 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39188 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39189 if (N.getNumOperands() == 2) {
39190 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
39191 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
39192 } else {
39193 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
39194 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
39195 }
39196 EVT OpVT = N0.getValueType();
39197 return DAG.getBitcast(ShuffleVT,
39198 DAG.getNode(SrcOpcode, DL, OpVT,
39199 DAG.getBitcast(OpVT, LHS),
39200 DAG.getBitcast(OpVT, RHS)));
39201 }
39202 }
39203 }
39204 break;
39205 }
39206 // Binary and Binary+Permute Shuffles.
39207 case X86ISD::INSERTPS: {
39208 // Don't merge INSERTPS if it contains zero'd elements.
39209 unsigned InsertPSMask = N.getConstantOperandVal(2);
39210 unsigned ZeroMask = InsertPSMask & 0xF;
39211 if (ZeroMask != 0)
39212 break;
39213 LLVM_FALLTHROUGH;
39214 }
39215 case X86ISD::MOVSD:
39216 case X86ISD::MOVSS:
39217 case X86ISD::BLENDI:
39218 case X86ISD::SHUFP:
39219 case X86ISD::UNPCKH:
39220 case X86ISD::UNPCKL: {
39221 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
39222 N->isOnlyUserOf(N.getOperand(1).getNode())) {
39223 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39224 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
39225 unsigned SrcOpcode = N0.getOpcode();
39226 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
39227 IsSafeToMoveShuffle(N0, SrcOpcode) &&
39228 IsSafeToMoveShuffle(N1, SrcOpcode)) {
39229 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39230 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
39231 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39232 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
39233 // Ensure the total number of shuffles doesn't increase by folding this
39234 // shuffle through to the source ops.
39235 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
39236 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
39237 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
39238 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
39239 SDValue LHS, RHS;
39240 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39241 Op10 = DAG.getBitcast(ShuffleVT, Op10);
39242 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39243 Op11 = DAG.getBitcast(ShuffleVT, Op11);
39244 if (N.getNumOperands() == 3) {
39245 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
39246 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
39247 } else {
39248 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
39249 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
39250 }
39251 EVT OpVT = N0.getValueType();
39252 return DAG.getBitcast(ShuffleVT,
39253 DAG.getNode(SrcOpcode, DL, OpVT,
39254 DAG.getBitcast(OpVT, LHS),
39255 DAG.getBitcast(OpVT, RHS)));
39256 }
39257 }
39258 }
39259 break;
39260 }
39261 }
39262 return SDValue();
39263}
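
The canonicalization above relies on the identity that an elementwise binary op commutes with any permutation of its lanes, i.e. shuffle(binop(X, Y)) == binop(shuffle(X), shuffle(Y)). A scalar-level standalone sketch of that identity (illustrative helpers, not LLVM code):

#include <array>
#include <cassert>

template <typename F>
static std::array<int, 4> applyBinOp(F Op, const std::array<int, 4> &X,
                                     const std::array<int, 4> &Y) {
  return {Op(X[0], Y[0]), Op(X[1], Y[1]), Op(X[2], Y[2]), Op(X[3], Y[3])};
}

static std::array<int, 4> permute(const std::array<int, 4> &X,
                                  const std::array<int, 4> &Mask) {
  return {X[Mask[0]], X[Mask[1]], X[Mask[2]], X[Mask[3]]};
}

int main() {
  auto Add = [](int A, int B) { return A + B; };
  std::array<int, 4> X{1, 2, 3, 4}, Y{5, 6, 7, 8}, Mask{3, 0, 2, 1};
  // Permuting the result equals permuting both inputs first.
  assert(permute(applyBinOp(Add, X, Y), Mask) ==
         applyBinOp(Add, permute(X, Mask), permute(Y, Mask)));
  return 0;
}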
39264
39265/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
39266static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
39267 SelectionDAG &DAG,
39268 const SDLoc &DL) {
39269 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
39270
39271 MVT VT = V.getSimpleValueType();
39272 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
39273 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
39274 unsigned SrcOpc0 = Src0.getOpcode();
39275 unsigned SrcOpc1 = Src1.getOpcode();
39276 EVT SrcVT0 = Src0.getValueType();
39277 EVT SrcVT1 = Src1.getValueType();
39278
39279 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
39280 return SDValue();
39281
39282 switch (SrcOpc0) {
39283 case X86ISD::MOVDDUP: {
39284 SDValue LHS = Src0.getOperand(0);
39285 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39286 SDValue Res =
39287 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
39288 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
39289 return DAG.getBitcast(VT, Res);
39290 }
39291 case X86ISD::VPERMILPI:
39292 // TODO: Handle v4f64 permutes with different low/high lane masks.
39293 if (SrcVT0 == MVT::v4f64) {
39294 uint64_t Mask = Src0.getConstantOperandVal(1);
39295 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
39296 break;
39297 }
39298 LLVM_FALLTHROUGH;
39299 case X86ISD::VSHLI:
39300 case X86ISD::VSRLI:
39301 case X86ISD::VSRAI:
39302 case X86ISD::PSHUFD:
39303 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
39304 SDValue LHS = Src0.getOperand(0);
39305 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39306 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
39307 V.getOperand(2));
39308 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
39309 return DAG.getBitcast(VT, Res);
39310 }
39311 break;
39312 }
39313
39314 return SDValue();
39315}
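
The fold above moves a VPERM2X128 lane shuffle below a repeated unary op. As a reference for the lane selection being moved, here is a standalone sketch of the VPERM2F128/VPERM2I128 immediate semantics as documented: each 128-bit result half is picked from the four source lanes by a 2-bit field, and bits 3 and 7 force the corresponding half to zero. The types and helper name here are illustrative assumptions, not LLVM code.

#include <array>
#include <cassert>
#include <cstdint>

using Lane = std::array<uint32_t, 4>; // one 128-bit lane as four dwords
using Vec256 = std::array<Lane, 2>;   // {low lane, high lane}

static Vec256 vperm2x128(const Vec256 &A, const Vec256 &B, uint8_t Imm) {
  auto Select = [&](unsigned Field, bool Zero) -> Lane {
    if (Zero)
      return Lane{0, 0, 0, 0};
    const Lane Lanes[4] = {A[0], A[1], B[0], B[1]};
    return Lanes[Field & 3];
  };
  return {Select(Imm & 3, Imm & 0x08), Select((Imm >> 4) & 3, Imm & 0x80)};
}

int main() {
  Vec256 A = {Lane{1, 1, 1, 1}, Lane{2, 2, 2, 2}};
  Vec256 B = {Lane{3, 3, 3, 3}, Lane{4, 4, 4, 4}};
  // 0x31 puts A's high lane in the low half and B's high lane in the high half.
  Vec256 R = vperm2x128(A, B, 0x31);
  assert(R[0] == (Lane{2, 2, 2, 2}) && R[1] == (Lane{4, 4, 4, 4}));
  return 0;
}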
39316
39317/// Try to combine x86 target specific shuffles.
39318static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
39319 TargetLowering::DAGCombinerInfo &DCI,
39320 const X86Subtarget &Subtarget) {
39321 SDLoc DL(N);
39322 MVT VT = N.getSimpleValueType();
39323 SmallVector<int, 4> Mask;
39324 unsigned Opcode = N.getOpcode();
39325
39326 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
39327 return R;
39328
39329 // Handle specific target shuffles.
39330 switch (Opcode) {
39331 case X86ISD::MOVDDUP: {
39332 SDValue Src = N.getOperand(0);
39333 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
39334 if (VT == MVT::v2f64 && Src.hasOneUse() &&
39335 ISD::isNormalLoad(Src.getNode())) {
39336 LoadSDNode *LN = cast<LoadSDNode>(Src);
39337 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
39338 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
39339 DCI.CombineTo(N.getNode(), Movddup);
39340 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39341 DCI.recursivelyDeleteUnusedNodes(LN);
39342 return N; // Return N so it doesn't get rechecked!
39343 }
39344 }
39345
39346 return SDValue();
39347 }
39348 case X86ISD::VBROADCAST: {
39349 SDValue Src = N.getOperand(0);
39350 SDValue BC = peekThroughBitcasts(Src);
39351 EVT SrcVT = Src.getValueType();
39352 EVT BCVT = BC.getValueType();
39353
39354 // If broadcasting from another shuffle, attempt to simplify it.
39355 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
39356 if (isTargetShuffle(BC.getOpcode()) &&
39357 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
39358 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
39359 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
39360 SM_SentinelUndef);
39361 for (unsigned i = 0; i != Scale; ++i)
39362 DemandedMask[i] = i;
39363 if (SDValue Res = combineX86ShufflesRecursively(
39364 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
39365 X86::MaxShuffleCombineDepth,
39366 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
39367 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
39368 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39369 DAG.getBitcast(SrcVT, Res));
39370 }
39371
39372 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
39373 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
39374 if (Src.getOpcode() == ISD::BITCAST &&
39375 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
39376 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
39377 FixedVectorType::isValidElementType(
39378 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
39379 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
39380 VT.getVectorNumElements());
39381 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39382 }
39383
39384 // Reduce broadcast source vector to lowest 128-bits.
39385 if (SrcVT.getSizeInBits() > 128)
39386 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39387 extract128BitVector(Src, 0, DAG, DL));
39388
39389 // broadcast(scalar_to_vector(x)) -> broadcast(x).
39390 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
39391 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39392
39393 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
39394 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39395 isNullConstant(Src.getOperand(1)) &&
39396 DAG.getTargetLoweringInfo().isTypeLegal(
39397 Src.getOperand(0).getValueType()))
39398 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39399
39400 // Share broadcast with the longest vector and extract low subvector (free).
39401 // Ensure the same SDValue from the SDNode use is being used.
39402 for (SDNode *User : Src->uses())
39403 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
39404 Src == User->getOperand(0) &&
39405 User->getValueSizeInBits(0).getFixedSize() >
39406 VT.getFixedSizeInBits()) {
39407 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
39408 VT.getSizeInBits());
39409 }
39410
39411 // vbroadcast(scalarload X) -> vbroadcast_load X
39412 // For float loads, extract other uses of the scalar from the broadcast.
39413 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
39414 ISD::isNormalLoad(Src.getNode())) {
39415 LoadSDNode *LN = cast<LoadSDNode>(Src);
39416 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39417 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39418 SDValue BcastLd =
39419 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39420 LN->getMemoryVT(), LN->getMemOperand());
39421 // If the load value is used only by N, replace it via CombineTo N.
39422 bool NoReplaceExtract = Src.hasOneUse();
39423 DCI.CombineTo(N.getNode(), BcastLd);
39424 if (NoReplaceExtract) {
39425 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39426 DCI.recursivelyDeleteUnusedNodes(LN);
39427 } else {
39428 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
39429 DAG.getIntPtrConstant(0, DL));
39430 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
39431 }
39432 return N; // Return N so it doesn't get rechecked!
39433 }
39434
39435 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
39436 // i16. So shrink it ourselves if we can make a broadcast_load.
39437 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
39438 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
39439 assert(Subtarget.hasAVX2() && "Expected AVX2");
39440 SDValue TruncIn = Src.getOperand(0);
39441
39442 // If this is a truncate of a non-extending load, we can just narrow it to
39443 // use a broadcast_load.
39444 if (ISD::isNormalLoad(TruncIn.getNode())) {
39445 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
39446 // Unless it's volatile or atomic.
39447 if (LN->isSimple()) {
39448 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39449 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39450 SDValue BcastLd = DAG.getMemIntrinsicNode(
39451 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39452 LN->getPointerInfo(), LN->getOriginalAlign(),
39453 LN->getMemOperand()->getFlags());
39454 DCI.CombineTo(N.getNode(), BcastLd);
39455 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39456 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39457 return N; // Return N so it doesn't get rechecked!
39458 }
39459 }
39460
39461 // If this is a truncate of an i16 extload, we can directly replace it.
39462 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
39463 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
39464 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
39465 if (LN->getMemoryVT().getSizeInBits() == 16) {
39466 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39467 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39468 SDValue BcastLd =
39469 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39470 LN->getMemoryVT(), LN->getMemOperand());
39471 DCI.CombineTo(N.getNode(), BcastLd);
39472 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39473 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39474 return N; // Return N so it doesn't get rechecked!
39475 }
39476 }
39477
39478 // If this is a truncate of a load that has been shifted right, we can
39479 // offset the pointer and use a narrower load.
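// e.g. for (i16 (trunc (srl (i64 (load p)), 16))) the shift discards the
// low 16 bits, so we can instead broadcast_load an i16 from p+2.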
39480 if (TruncIn.getOpcode() == ISD::SRL &&
39481 TruncIn.getOperand(0).hasOneUse() &&
39482 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
39483 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
39484 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
39485 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
39486 // Make sure the shift amount and the load size are divisible by 16.
39487 // Don't do this if the load is volatile or atomic.
39488 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
39489 LN->isSimple()) {
39490 unsigned Offset = ShiftAmt / 8;
39491 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39492 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
39493 TypeSize::Fixed(Offset), DL);
39494 SDValue Ops[] = { LN->getChain(), Ptr };
39495 SDValue BcastLd = DAG.getMemIntrinsicNode(
39496 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39497 LN->getPointerInfo().getWithOffset(Offset),
39498 LN->getOriginalAlign(),
39499 LN->getMemOperand()->getFlags());
39500 DCI.CombineTo(N.getNode(), BcastLd);
39501 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39502 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39503 return N; // Return N so it doesn't get rechecked!
39504 }
39505 }
39506 }
39507
39508 // vbroadcast(vzload X) -> vbroadcast_load X
39509 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
39510 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
39511 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
39512 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39513 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39514 SDValue BcastLd =
39515 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39516 LN->getMemoryVT(), LN->getMemOperand());
39517 DCI.CombineTo(N.getNode(), BcastLd);
39518 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39519 DCI.recursivelyDeleteUnusedNodes(LN);
39520 return N; // Return N so it doesn't get rechecked!
39521 }
39522 }
39523
39524 // vbroadcast(vector load X) -> vbroadcast_load
39525 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
39526 SrcVT == MVT::v4i32) &&
39527 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
39528 LoadSDNode *LN = cast<LoadSDNode>(Src);
39529 // Unless the load is volatile or atomic.
39530 if (LN->isSimple()) {
39531 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39532 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39533 SDValue BcastLd = DAG.getMemIntrinsicNode(
39534 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
39535 LN->getPointerInfo(), LN->getOriginalAlign(),
39536 LN->getMemOperand()->getFlags());
39537 DCI.CombineTo(N.getNode(), BcastLd);
39538 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39539 DCI.recursivelyDeleteUnusedNodes(LN);
39540 return N; // Return N so it doesn't get rechecked!
39541 }
39542 }
39543
39544 return SDValue();
39545 }
39546 case X86ISD::VZEXT_MOVL: {
39547 SDValue N0 = N.getOperand(0);
39548
39549 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
39550 // the load is volatile.
39551 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
39552 auto *LN = cast<LoadSDNode>(N0);
39553 if (SDValue VZLoad =
39554 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
39555 DCI.CombineTo(N.getNode(), VZLoad);
39556 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39557 DCI.recursivelyDeleteUnusedNodes(LN);
39558 return N;
39559 }
39560 }
39561
39562 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
39563 // and can just use a VZEXT_LOAD.
39564 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
39565 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
39566 auto *LN = cast<MemSDNode>(N0);
39567 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
39568 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39569 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39570 SDValue VZLoad =
39571 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
39572 LN->getMemoryVT(), LN->getMemOperand());
39573 DCI.CombineTo(N.getNode(), VZLoad);
39574 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39575 DCI.recursivelyDeleteUnusedNodes(LN);
39576 return N;
39577 }
39578 }
39579
39580 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
39581 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
39582 // if the upper bits of the i64 are zero.
39583 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39584 N0.getOperand(0).hasOneUse() &&
39585 N0.getOperand(0).getValueType() == MVT::i64) {
39586 SDValue In = N0.getOperand(0);
39587 APInt Mask = APInt::getHighBitsSet(64, 32);
39588 if (DAG.MaskedValueIsZero(In, Mask)) {
39589 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
39590 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
39591 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
39592 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
39593 return DAG.getBitcast(VT, Movl);
39594 }
39595 }
39596
39597 // Load a scalar integer constant directly to XMM instead of transferring an
39598 // immediate value from GPR.
39599 // vzext_movl (scalar_to_vector C) --> load [C,0...]
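// e.g. (v4i32 (vzext_movl (scalar_to_vector (i32 42)))) becomes a load of
// the constant-pool vector <i32 42, i32 0, i32 0, i32 0>.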
39600 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39601 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
39602 // Create a vector constant - scalar constant followed by zeros.
39603 EVT ScalarVT = N0.getOperand(0).getValueType();
39604 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
39605 unsigned NumElts = VT.getVectorNumElements();
39606 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
39607 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
39608 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
39609
39610 // Load the vector constant from constant pool.
39611 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
39612 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
39613 MachinePointerInfo MPI =
39614 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
39615 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
39616 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
39617 MachineMemOperand::MOLoad);
39618 }
39619 }
39620
39621 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
39622 // insert into a zero vector. This helps get VZEXT_MOVL closer to
39623 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
39624 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
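// e.g. (vzext_movl (insert_subvector undef, X, 0)) becomes
// (insert_subvector (zero vector), (vzext_movl X), 0).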
39625 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
39626 SDValue V = peekThroughOneUseBitcasts(N0);
39627
39628 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
39629 isNullConstant(V.getOperand(2))) {
39630 SDValue In = V.getOperand(1);
39631 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
39632 In.getValueSizeInBits() /
39633 VT.getScalarSizeInBits());
39634 In = DAG.getBitcast(SubVT, In);
39635 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
39636 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
39637 getZeroVector(VT, Subtarget, DAG, DL), Movl,
39638 V.getOperand(2));
39639 }
39640 }
39641
39642 return SDValue();
39643 }
39644 case X86ISD::BLENDI: {
39645 SDValue N0 = N.getOperand(0);
39646 SDValue N1 = N.getOperand(1);
39647
39648 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
39649 // TODO: Handle MVT::v16i16 repeated blend mask.
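// e.g. a v4i64 blend with mask 0b0101 becomes a v8i32 blend with the mask
// scaled to 0b00110011, bitcast back to v4i64 afterwards.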
39650 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
39651 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
39652 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
39653 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
39654 SrcVT.getScalarSizeInBits() >= 32) {
39655 unsigned BlendMask = N.getConstantOperandVal(2);
39656 unsigned Size = VT.getVectorNumElements();
39657 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
39658 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
39659 return DAG.getBitcast(
39660 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
39661 N1.getOperand(0),
39662 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
39663 }
39664 }
39665 return SDValue();
39666 }
39667 case X86ISD::SHUFP: {
39668 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
39669 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
39670 // TODO: Support types other than v4f32.
39671 if (VT == MVT::v4f32) {
39672 bool Updated = false;
39673 SmallVector<int> Mask;
39674 SmallVector<SDValue> Ops;
39675 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
39676 Ops.size() == 2) {
39677 for (int i = 0; i != 2; ++i) {
39678 SmallVector<SDValue> SubOps;
39679 SmallVector<int> SubMask, SubScaledMask;
39680 SDValue Sub = peekThroughBitcasts(Ops[i]);
39681 // TODO: Scaling might be easier if we specify the demanded elts.
39682 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
39683 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
39684 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
39685 int Ofs = i * 2;
39686 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
39687 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
39688 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
39689 Updated = true;
39690 }
39691 }
39692 }
39693 if (Updated) {
39694 for (int &M : Mask)
39695 M %= 4;
39696 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39697 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
39698 }
39699 }
39700 return SDValue();
39701 }
39702 case X86ISD::VPERMI: {
39703 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
39704 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
39705 SDValue N0 = N.getOperand(0);
39706 SDValue N1 = N.getOperand(1);
39707 unsigned EltSizeInBits = VT.getScalarSizeInBits();
39708 if (N0.getOpcode() == ISD::BITCAST &&
39709 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
39710 SDValue Src = N0.getOperand(0);
39711 EVT SrcVT = Src.getValueType();
39712 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
39713 return DAG.getBitcast(VT, Res);
39714 }
39715 return SDValue();
39716 }
39717 case X86ISD::VPERM2X128: {
39718 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
39719 SDValue LHS = N->getOperand(0);
39720 SDValue RHS = N->getOperand(1);
39721 if (LHS.getOpcode() == ISD::BITCAST &&
39722 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
39723 EVT SrcVT = LHS.getOperand(0).getValueType();
39724 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
39725 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
39726 DAG.getBitcast(SrcVT, LHS),
39727 DAG.getBitcast(SrcVT, RHS),
39728 N->getOperand(2)));
39729 }
39730 }
39731
39732 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
39733 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
39734 return Res;
39735
39736 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
39737 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
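// The low nibble of the immediate picks the 128-bit half for the low lane
// (0=X, 1=Y, 2=Z, 3=W) and the high nibble picks the high lane, e.g.
// vperm2x128(concat(X,Y),concat(Z,W),0x20) --> concat(X,Z).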
39738 auto FindSubVector128 = [&](unsigned Idx) {
39739 if (Idx > 3)
39740 return SDValue();
39741 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
39742 SmallVector<SDValue> SubOps;
39743 if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
39744 return SubOps[Idx & 1];
39745 unsigned NumElts = Src.getValueType().getVectorNumElements();
39746 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
39747 Src.getOperand(1).getValueSizeInBits() == 128 &&
39748 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
39749 return Src.getOperand(1);
39750 }
39751 return SDValue();
39752 };
39753 unsigned Imm = N.getConstantOperandVal(2);
39754 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
39755 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
39756 MVT SubVT = VT.getHalfNumVectorElementsVT();
39757 SubLo = DAG.getBitcast(SubVT, SubLo);
39758 SubHi = DAG.getBitcast(SubVT, SubHi);
39759 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
39760 }
39761 }
39762 return SDValue();
39763 }
39764 case X86ISD::PSHUFD:
39765 case X86ISD::PSHUFLW:
39766 case X86ISD::PSHUFHW:
39767 Mask = getPSHUFShuffleMask(N);
39768 assert(Mask.size() == 4);
39769 break;
39770 case X86ISD::MOVSD:
39771 case X86ISD::MOVSH:
39772 case X86ISD::MOVSS: {
39773 SDValue N0 = N.getOperand(0);
39774 SDValue N1 = N.getOperand(1);
39775
39776 // Canonicalize scalar FPOps:
39777 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
39778 // If commutable, allow OP(N1[0], N0[0]).
39779 unsigned Opcode1 = N1.getOpcode();
39780 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
39781 Opcode1 == ISD::FDIV) {
39782 SDValue N10 = N1.getOperand(0);
39783 SDValue N11 = N1.getOperand(1);
39784 if (N10 == N0 ||
39785 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
39786 if (N10 != N0)
39787 std::swap(N10, N11);
39788 MVT SVT = VT.getVectorElementType();
39789 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
39790 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
39791 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
39792 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
39793 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
39794 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
39795 }
39796 }
39797
39798 return SDValue();
39799 }
39800 case X86ISD::INSERTPS: {
39801 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
39802 SDValue Op0 = N.getOperand(0);
39803 SDValue Op1 = N.getOperand(1);
39804 unsigned InsertPSMask = N.getConstantOperandVal(2);
39805 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
39806 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
39807 unsigned ZeroMask = InsertPSMask & 0xF;
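// The 8-bit INSERTPS control encodes the Op1 source lane in bits [7:6], the
// destination lane in bits [5:4] and a 4-bit zero mask in bits [3:0].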
39808
39809 // If we zero out all elements from Op0 then we don't need to reference it.
39810 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
39811 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
39812 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39813
39814 // If we zero out the element from Op1 then we don't need to reference it.
39815 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
39816 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39817 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39818
39819 // Attempt to merge insertps Op1 with an inner target shuffle node.
39820 SmallVector<int, 8> TargetMask1;
39821 SmallVector<SDValue, 2> Ops1;
39822 APInt KnownUndef1, KnownZero1;
39823 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
39824 KnownZero1)) {
39825 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
39826 // Zero/UNDEF insertion - zero out element and remove dependency.
39827 InsertPSMask |= (1u << DstIdx);
39828 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39829 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39830 }
39831 // Update insertps mask srcidx and reference the source input directly.
39832 int M = TargetMask1[SrcIdx];
39833 assert(0 <= M && M < 8 && "Shuffle index out of range");
39834 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
39835 Op1 = Ops1[M < 4 ? 0 : 1];
39836 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39837 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39838 }
39839
39840 // Attempt to merge insertps Op0 with an inner target shuffle node.
39841 SmallVector<int, 8> TargetMask0;
39842 SmallVector<SDValue, 2> Ops0;
39843 APInt KnownUndef0, KnownZero0;
39844 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
39845 KnownZero0)) {
39846 bool Updated = false;
39847 bool UseInput00 = false;
39848 bool UseInput01 = false;
39849 for (int i = 0; i != 4; ++i) {
39850 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
39851 // No change if element is already zero or the inserted element.
39852 continue;
39853 } else if (KnownUndef0[i] || KnownZero0[i]) {
39854 // If the target mask is undef/zero then we must zero the element.
39855 InsertPSMask |= (1u << i);
39856 Updated = true;
39857 continue;
39858 }
39859
39860 // The input vector element must be inline.
39861 int M = TargetMask0[i];
39862 if (M != i && M != (i + 4))
39863 return SDValue();
39864
39865 // Determine which inputs of the target shuffle we're using.
39866 UseInput00 |= (0 <= M && M < 4);
39867 UseInput01 |= (4 <= M);
39868 }
39869
39870 // If we're not using both inputs of the target shuffle then use the
39871 // referenced input directly.
39872 if (UseInput00 && !UseInput01) {
39873 Updated = true;
39874 Op0 = Ops0[0];
39875 } else if (!UseInput00 && UseInput01) {
39876 Updated = true;
39877 Op0 = Ops0[1];
39878 }
39879
39880 if (Updated)
39881 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39882 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39883 }
39884
39885 // If we're inserting an element from a vbroadcast load, fold the
39886 // load into the X86insertps instruction. We need to convert the scalar
39887 // load to a vector and clear the source lane of the INSERTPS control.
39888 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
39889 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
39890 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
39891 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
39892 MemIntr->getBasePtr(),
39893 MemIntr->getMemOperand());
39894 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
39895 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
39896 Load),
39897 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
39898 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
39899 return Insert;
39900 }
39901 }
39902
39903 return SDValue();
39904 }
39905 default:
39906 return SDValue();
39907 }
39908
39909 // Nuke no-op shuffles that show up after combining.
39910 if (isNoopShuffleMask(Mask))
39911 return N.getOperand(0);
39912
39913 // Look for simplifications involving one or two shuffle instructions.
39914 SDValue V = N.getOperand(0);
39915 switch (N.getOpcode()) {
39916 default:
39917 break;
39918 case X86ISD::PSHUFLW:
39919 case X86ISD::PSHUFHW:
39920 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
39921
39922 // See if this reduces to a PSHUFD which is no more expensive and can
39923 // combine with more operations. Note that it has to at least flip the
39924 // dwords as otherwise it would have been removed as a no-op.
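// e.g. a PSHUFLW with mask <2,3,0,1> swaps the two low dwords, which a
// PSHUFD with mask <1,0,2,3> on the bitcast i32 vector does just as well.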
39925 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
39926 int DMask[] = {0, 1, 2, 3};
39927 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
39928 DMask[DOffset + 0] = DOffset + 1;
39929 DMask[DOffset + 1] = DOffset + 0;
39930 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
39931 V = DAG.getBitcast(DVT, V);
39932 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
39933 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
39934 return DAG.getBitcast(VT, V);
39935 }
39936
39937 // Look for shuffle patterns which can be implemented as a single unpack.
39938 // FIXME: This doesn't handle the location of the PSHUFD generically, and
39939 // only works when we have a PSHUFD followed by two half-shuffles.
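// e.g. if mapping the combined word mask through the dword shuffle yields
// <0,0,1,1,2,2,3,3>, the whole chain is simply UNPCKL of the original
// vector with itself.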
39940 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
39941 (V.getOpcode() == X86ISD::PSHUFLW ||
39942 V.getOpcode() == X86ISD::PSHUFHW) &&
39943 V.getOpcode() != N.getOpcode() &&
39944 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
39945 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
39946 if (D.getOpcode() == X86ISD::PSHUFD) {
39947 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39948 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
39949 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39950 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39951 int WordMask[8];
39952 for (int i = 0; i < 4; ++i) {
39953 WordMask[i + NOffset] = Mask[i] + NOffset;
39954 WordMask[i + VOffset] = VMask[i] + VOffset;
39955 }
39956 // Map the word mask through the DWord mask.
39957 int MappedMask[8];
39958 for (int i = 0; i < 8; ++i)
39959 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
39960 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
39961 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
39962 // We can replace all three shuffles with an unpack.
39963 V = DAG.getBitcast(VT, D.getOperand(0));
39964 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
39965 : X86ISD::UNPCKH,
39966 DL, VT, V, V);
39967 }
39968 }
39969 }
39970
39971 break;
39972
39973 case X86ISD::PSHUFD:
39974 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
39975 return NewN;
39976
39977 break;
39978 }
39979
39980 return SDValue();
39981}
39982
39983/// Checks if the shuffle mask takes subsequent elements
39984/// alternately from two vectors.
39985/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
39986static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
39987
39988 int ParitySrc[2] = {-1, -1};
39989 unsigned Size = Mask.size();
39990 for (unsigned i = 0; i != Size; ++i) {
39991 int M = Mask[i];
39992 if (M < 0)
39993 continue;
39994
39995 // Make sure we are using the matching element from the input.
39996 if ((M % Size) != i)
39997 return false;
39998
39999 // Make sure we use the same input for all elements of the same parity.
40000 int Src = M / Size;
40001 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
40002 return false;
40003 ParitySrc[i % 2] = Src;
40004 }
40005
40006 // Make sure each input is used.
40007 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
40008 return false;
40009
40010 Op0Even = ParitySrc[0] == 0;
40011 return true;
40012}
40013
40014 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
40015 /// operation. If true is returned then the operands of the ADDSUB(SUBADD)
40016 /// operation are written to the parameters \p Opnd0 and \p Opnd1.
40017 ///
40018 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
40019 /// nodes so they are easier to match generically. We also insert dummy vector
40020 /// shuffle nodes for the operands which explicitly discard the lanes that are
40021 /// unused by this operation, so that the fact that they're unused can flow
40022 /// through the rest of the combiner.
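/// For example, shuffle(fsub(a,b), fadd(a,b), <0,5,2,7>) takes the even lanes
/// from the FSUB and the odd lanes from the FADD, which is exactly ADDSUB(a,b).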
40023static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
40024 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
40025 bool &IsSubAdd) {
40026
40027 EVT VT = N->getValueType(0);
40028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40029 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
40030 !VT.getSimpleVT().isFloatingPoint())
40031 return false;
40032
40033 // We only handle target-independent shuffles.
40034 // FIXME: It would be easy and harmless to use the target shuffle mask
40035 // extraction tool to support more.
40036 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40037 return false;
40038
40039 SDValue V1 = N->getOperand(0);
40040 SDValue V2 = N->getOperand(1);
40041
40042 // Make sure we have an FADD and an FSUB.
40043 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
40044 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
40045 V1.getOpcode() == V2.getOpcode())
40046 return false;
40047
40048 // If there are other uses of these operations we can't fold them.
40049 if (!V1->hasOneUse() || !V2->hasOneUse())
40050 return false;
40051
40052 // Ensure that both operations have the same operands. Note that we can
40053 // commute the FADD operands.
40054 SDValue LHS, RHS;
40055 if (V1.getOpcode() == ISD::FSUB) {
40056 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
40057 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
40058 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
40059 return false;
40060 } else {
40061 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
40062 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
40063 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
40064 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
40065 return false;
40066 }
40067
40068 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40069 bool Op0Even;
40070 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40071 return false;
40072
40073 // It's a subadd if the vector in the even parity is an FADD.
40074 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
40075 : V2->getOpcode() == ISD::FADD;
40076
40077 Opnd0 = LHS;
40078 Opnd1 = RHS;
40079 return true;
40080}
40081
40082/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
40083static SDValue combineShuffleToFMAddSub(SDNode *N,
40084 const X86Subtarget &Subtarget,
40085 SelectionDAG &DAG) {
40086 // We only handle target-independent shuffles.
40087 // FIXME: It would be easy and harmless to use the target shuffle mask
40088 // extraction tool to support more.
40089 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40090 return SDValue();
40091
40092 MVT VT = N->getSimpleValueType(0);
40093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40094 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
40095 return SDValue();
40096
40097 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
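// e.g. shuffle(fma(a,b,c), fmsub(a,b,c), <0,5,2,7>) yields a*b+c in the even
// lanes and a*b-c in the odd lanes, i.e. X86ISD::FMSUBADD(a,b,c).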
40098 SDValue Op0 = N->getOperand(0);
40099 SDValue Op1 = N->getOperand(1);
40100 SDValue FMAdd = Op0, FMSub = Op1;
40101 if (FMSub.getOpcode() != X86ISD::FMSUB)
40102 std::swap(FMAdd, FMSub);
40103
40104 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
40105 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
40106 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
40107 FMAdd.getOperand(2) != FMSub.getOperand(2))
40108 return SDValue();
40109
40110 // Check for correct shuffle mask.
40111 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40112 bool Op0Even;
40113 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40114 return SDValue();
40115
40116 // FMAddSub takes zeroth operand from FMSub node.
40117 SDLoc DL(N);
40118 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
40119 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40120 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
40121 FMAdd.getOperand(2));
40122}
40123
40124/// Try to combine a shuffle into a target-specific add-sub or
40125/// mul-add-sub node.
40126static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
40127 const X86Subtarget &Subtarget,
40128 SelectionDAG &DAG) {
40129 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
40130 return V;
40131
40132 SDValue Opnd0, Opnd1;
40133 bool IsSubAdd;
40134 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
40135 return SDValue();
40136
40137 MVT VT = N->getSimpleValueType(0);
40138 SDLoc DL(N);
40139
40140 // Try to generate X86ISD::FMADDSUB node here.
40141 SDValue Opnd2;
40142 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
40143 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40144 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
40145 }
40146
40147 if (IsSubAdd)
40148 return SDValue();
40149
40150 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
40151 // the ADDSUB idiom has been successfully recognized. There are no known
40152 // X86 targets with 512-bit ADDSUB instructions!
40153 if (VT.is512BitVector())
40154 return SDValue();
40155
40156 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
40157 // the ADDSUB idiom has been successfully recognized. There are no known
40158 // X86 targets with FP16 ADDSUB instructions!
40159 if (VT.getVectorElementType() == MVT::f16)
40160 return SDValue();
40161
40162 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
40163}
40164
40165// We are looking for a shuffle where both sources are concatenated with undef
40166// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
40167// if we can express this as a single-source shuffle, that's preferable.
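// e.g. for v8i32, shuffle(concat(t1,undef), concat(t2,undef),
// <0,8,1,9,2,10,3,11>) becomes shuffle(concat(t1,t2), undef,
// <0,4,1,5,2,6,3,7>).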
40168static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
40169 const X86Subtarget &Subtarget) {
40170 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
40171 return SDValue();
40172
40173 EVT VT = N->getValueType(0);
40174
40175 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
40176 if (!VT.is128BitVector() && !VT.is256BitVector())
40177 return SDValue();
40178
40179 if (VT.getVectorElementType() != MVT::i32 &&
40180 VT.getVectorElementType() != MVT::i64 &&
40181 VT.getVectorElementType() != MVT::f32 &&
40182 VT.getVectorElementType() != MVT::f64)
40183 return SDValue();
40184
40185 SDValue N0 = N->getOperand(0);
40186 SDValue N1 = N->getOperand(1);
40187
40188 // Check that both sources are concats with undef.
40189 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
40190 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
40191 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
40192 !N1.getOperand(1).isUndef())
40193 return SDValue();
40194
40195 // Construct the new shuffle mask. Elements from the first source retain their
40196 // index, but elements from the second source no longer need to skip an undef.
40197 SmallVector<int, 8> Mask;
40198 int NumElts = VT.getVectorNumElements();
40199
40200 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
40201 for (int Elt : SVOp->getMask())
40202 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
40203
40204 SDLoc DL(N);
40205 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
40206 N1.getOperand(0));
40207 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
40208}
40209
40210/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
40211/// low half of each source vector and does not set any high half elements in
40212/// the destination vector, narrow the shuffle to half its original size.
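/// For example, a v8f32 shuffle with mask <0,1,8,9,u,u,u,u> only reads the low
/// v4f32 half of each source and leaves the high half of the result undef, so
/// it can be done as a v4f32 shuffle whose result is then widened with undef.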
40213static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
40214 if (!Shuf->getValueType(0).isSimple())
40215 return SDValue();
40216 MVT VT = Shuf->getSimpleValueType(0);
40217 if (!VT.is256BitVector() && !VT.is512BitVector())
40218 return SDValue();
40219
40220 // See if we can ignore all of the high elements of the shuffle.
40221 ArrayRef<int> Mask = Shuf->getMask();
40222 if (!isUndefUpperHalf(Mask))
40223 return SDValue();
40224
40225 // Check if the shuffle mask accesses only the low half of each input vector
40226 // (half-index output is 0 or 2).
40227 int HalfIdx1, HalfIdx2;
40228 SmallVector<int, 8> HalfMask(Mask.size() / 2);
40229 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
40230 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
40231 return SDValue();
40232
40233 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
40234 // The trick is knowing that all of the insert/extract are actually free
40235 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
40236 // of narrow inputs into a narrow output, and that is always cheaper than
40237 // the wide shuffle that we started with.
40238 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
40239 Shuf->getOperand(1), HalfMask, HalfIdx1,
40240 HalfIdx2, false, DAG, /*UseConcat*/true);
40241}
40242
40243static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
40244 TargetLowering::DAGCombinerInfo &DCI,
40245 const X86Subtarget &Subtarget) {
40246 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
40247 if (SDValue V = narrowShuffle(Shuf, DAG))
40248 return V;
40249
40250 // If we have legalized the vector types, look for blends of FADD and FSUB
40251 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
40252 SDLoc dl(N);
40253 EVT VT = N->getValueType(0);
40254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40255 if (TLI.isTypeLegal(VT))
40256 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
40257 return AddSub;
40258
40259 // Attempt to combine into a vector load/broadcast.
40260 if (SDValue LD = combineToConsecutiveLoads(
40261 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
40262 return LD;
40263
40264 // For AVX2, we sometimes want to combine
40265 // (vector_shuffle <mask> (concat_vectors t1, undef)
40266 // (concat_vectors t2, undef))
40267 // Into:
40268 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
40269 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
40270 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
40271 return ShufConcat;
40272
40273 if (isTargetShuffle(N->getOpcode())) {
40274 SDValue Op(N, 0);
40275 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
40276 return Shuffle;
40277
40278 // Try recursively combining arbitrary sequences of x86 shuffle
40279 // instructions into higher-order shuffles. We do this after combining
40280 // specific PSHUF instruction sequences into their minimal form so that we
40281 // can evaluate how many specialized shuffle instructions are involved in
40282 // a particular chain.
40283 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40284 return Res;
40285
40286 // Simplify source operands based on shuffle mask.
40287 // TODO - merge this into combineX86ShufflesRecursively.
40288 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
40289 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
40290 return SDValue(N, 0);
40291
40292 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40293 // Perform this after other shuffle combines to allow inner shuffles to be
40294 // combined away first.
40295 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N)))
40296 return BinOp;
40297 }
40298
40299 return SDValue();
40300}
40301
40302// Simplify variable target shuffle masks based on the demanded elements.
40303// TODO: Handle DemandedBits in mask indices as well?
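// e.g. if only the low half of a PSHUFB result is demanded, the unused upper
// entries of its constant-pool mask can be replaced with undef.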
40304bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
40305 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
40306 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
40307 // If we're demanding all elements don't bother trying to simplify the mask.
40308 unsigned NumElts = DemandedElts.getBitWidth();
40309 if (DemandedElts.isAllOnes())
40310 return false;
40311
40312 SDValue Mask = Op.getOperand(MaskIndex);
40313 if (!Mask.hasOneUse())
40314 return false;
40315
40316 // Attempt to generically simplify the variable shuffle mask.
40317 APInt MaskUndef, MaskZero;
40318 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
40319 Depth + 1))
40320 return true;
40321
40322 // Attempt to extract+simplify a (constant pool load) shuffle mask.
40323 // TODO: Support other types from getTargetShuffleMaskIndices?
40324 SDValue BC = peekThroughOneUseBitcasts(Mask);
40325 EVT BCVT = BC.getValueType();
40326 auto *Load = dyn_cast<LoadSDNode>(BC);
40327 if (!Load)
40328 return false;
40329
40330 const Constant *C = getTargetConstantFromNode(Load);
40331 if (!C)
40332 return false;
40333
40334 Type *CTy = C->getType();
40335 if (!CTy->isVectorTy() ||
40336 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
40337 return false;
40338
40339 // Handle scaling for i64 elements on 32-bit targets.
40340 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
40341 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
40342 return false;
40343 unsigned Scale = NumCstElts / NumElts;
40344
40345 // Simplify mask if we have an undemanded element that is not undef.
40346 bool Simplified = false;
40347 SmallVector<Constant *, 32> ConstVecOps;
40348 for (unsigned i = 0; i != NumCstElts; ++i) {
40349 Constant *Elt = C->getAggregateElement(i);
40350 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
40351 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
40352 Simplified = true;
40353 continue;
40354 }
40355 ConstVecOps.push_back(Elt);
40356 }
40357 if (!Simplified)
40358 return false;
40359
40360 // Generate new constant pool entry + legalize immediately for the load.
40361 SDLoc DL(Op);
40362 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
40363 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
40364 SDValue NewMask = TLO.DAG.getLoad(
40365 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
40366 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
40367 Load->getAlign());
40368 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
40369}
40370
40371bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
40372 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
40373 TargetLoweringOpt &TLO, unsigned Depth) const {
40374 int NumElts = DemandedElts.getBitWidth();
40375 unsigned Opc = Op.getOpcode();
40376 EVT VT = Op.getValueType();
40377
40378 // Handle special case opcodes.
40379 switch (Opc) {
40380 case X86ISD::PMULDQ:
40381 case X86ISD::PMULUDQ: {
40382 APInt LHSUndef, LHSZero;
40383 APInt RHSUndef, RHSZero;
40384 SDValue LHS = Op.getOperand(0);
40385 SDValue RHS = Op.getOperand(1);
40386 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40387 Depth + 1))
40388 return true;
40389 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40390 Depth + 1))
40391 return true;
40392 // Multiply by zero.
40393 KnownZero = LHSZero | RHSZero;
40394 break;
40395 }
40396 case X86ISD::VPMADDWD: {
40397 APInt LHSUndef, LHSZero;
40398 APInt RHSUndef, RHSZero;
40399 SDValue LHS = Op.getOperand(0);
40400 SDValue RHS = Op.getOperand(1);
40401 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
40402
40403 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
40404 Depth + 1))
40405 return true;
40406 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
40407 Depth + 1))
40408 return true;
40409
40410 // TODO: Multiply by zero.
40411
40412 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
40413 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
40414 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
40415 Depth + 1))
40416 return true;
40417 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
40418 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
40419 Depth + 1))
40420 return true;
40421 break;
40422 }
40423 case X86ISD::PSADBW: {
40424 SDValue LHS = Op.getOperand(0);
40425 SDValue RHS = Op.getOperand(1);
40426 assert(VT.getScalarType() == MVT::i64 &&
40427 LHS.getValueType() == RHS.getValueType() &&
40428 LHS.getValueType().getScalarType() == MVT::i8 &&
40429 "Unexpected PSADBW types");
40430
40431 // Aggressively peek through ops to get at the demanded elts.
40432 if (!DemandedElts.isAllOnes()) {
40433 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
40434 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
40435 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
40436 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40437 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
40438 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40439 if (NewLHS || NewRHS) {
40440 NewLHS = NewLHS ? NewLHS : LHS;
40441 NewRHS = NewRHS ? NewRHS : RHS;
40442 return TLO.CombineTo(
40443 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40444 }
40445 }
40446 break;
40447 }
40448 case X86ISD::VSHL:
40449 case X86ISD::VSRL:
40450 case X86ISD::VSRA: {
40451 // We only need the bottom 64-bits of the (128-bit) shift amount.
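// (These opcodes take the scalar shift count from the low 64 bits of a
// 128-bit vector operand, so the upper elements of Amt are never read.)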
40452 SDValue Amt = Op.getOperand(1);
40453 MVT AmtVT = Amt.getSimpleValueType();
40454 assert(AmtVT.is128BitVector() && "Unexpected value type");
40455
40456 // If the shift amount is only ever reused as an SSE shift amount then we know
40457 // that only the bottom 64-bits are ever used.
40458 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
40459 unsigned UseOpc = Use->getOpcode();
40460 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
40461 UseOpc == X86ISD::VSRA) &&
40462 Use->getOperand(0) != Amt;
40463 });
40464
40465 APInt AmtUndef, AmtZero;
40466 unsigned NumAmtElts = AmtVT.getVectorNumElements();
40467 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
40468 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
40469 Depth + 1, AssumeSingleUse))
40470 return true;
40471 LLVM_FALLTHROUGH;
40472 }
40473 case X86ISD::VSHLI:
40474 case X86ISD::VSRLI:
40475 case X86ISD::VSRAI: {
40476 SDValue Src = Op.getOperand(0);
40477 APInt SrcUndef;
40478 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
40479 Depth + 1))
40480 return true;
40481
40482 // Aggressively peek through ops to get at the demanded elts.
40483 if (!DemandedElts.isAllOnes())
40484 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40485 Src, DemandedElts, TLO.DAG, Depth + 1))
40486 return TLO.CombineTo(
40487 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
40488 break;
40489 }
40490 case X86ISD::VPSHA:
40491 case X86ISD::VPSHL:
40492 case X86ISD::VSHLV:
40493 case X86ISD::VSRLV:
40494 case X86ISD::VSRAV: {
40495 APInt LHSUndef, LHSZero;
40496 APInt RHSUndef, RHSZero;
40497 SDValue LHS = Op.getOperand(0);
40498 SDValue RHS = Op.getOperand(1);
40499 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40500 Depth + 1))
40501 return true;
40502 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40503 Depth + 1))
40504 return true;
40505 KnownZero = LHSZero;
40506 break;
40507 }
40508 case X86ISD::KSHIFTL: {
40509 SDValue Src = Op.getOperand(0);
40510 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40511 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40512 unsigned ShiftAmt = Amt->getZExtValue();
40513
40514 if (ShiftAmt == 0)
40515 return TLO.CombineTo(Op, Src);
40516
40517 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
40518 // single shift. We can do this if the bottom bits (which are shifted
40519 // out) are never demanded.
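// e.g. if none of the low 6 result elements are demanded,
// kshiftl(kshiftr(x, 4), 6) becomes kshiftl(x, 6 - 4) = kshiftl(x, 2).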
40520 if (Src.getOpcode() == X86ISD::KSHIFTR) {
40521 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
40522 unsigned C1 = Src.getConstantOperandVal(1);
40523 unsigned NewOpc = X86ISD::KSHIFTL;
40524 int Diff = ShiftAmt - C1;
40525 if (Diff < 0) {
40526 Diff = -Diff;
40527 NewOpc = X86ISD::KSHIFTR;
40528 }
40529
40530 SDLoc dl(Op);
40531 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40532 return TLO.CombineTo(
40533 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40534 }
40535 }
40536
40537 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
40538 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40539 Depth + 1))
40540 return true;
40541
40542 KnownUndef <<= ShiftAmt;
40543 KnownZero <<= ShiftAmt;
40544 KnownZero.setLowBits(ShiftAmt);
40545 break;
40546 }
40547 case X86ISD::KSHIFTR: {
40548 SDValue Src = Op.getOperand(0);
40549 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40550 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40551 unsigned ShiftAmt = Amt->getZExtValue();
40552
40553 if (ShiftAmt == 0)
40554 return TLO.CombineTo(Op, Src);
40555
40556 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
40557 // single shift. We can do this if the top bits (which are shifted
40558 // out) are never demanded.
40559 if (Src.getOpcode() == X86ISD::KSHIFTL) {
40560 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
40561 unsigned C1 = Src.getConstantOperandVal(1);
40562 unsigned NewOpc = X86ISD::KSHIFTR;
40563 int Diff = ShiftAmt - C1;
40564 if (Diff < 0) {
40565 Diff = -Diff;
40566 NewOpc = X86ISD::KSHIFTL;
40567 }
40568
40569 SDLoc dl(Op);
40570 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40571 return TLO.CombineTo(
40572 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40573 }
40574 }
40575
40576 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
40577 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40578 Depth + 1))
40579 return true;
40580
40581 KnownUndef.lshrInPlace(ShiftAmt);
40582 KnownZero.lshrInPlace(ShiftAmt);
40583 KnownZero.setHighBits(ShiftAmt);
40584 break;
40585 }
40586 case X86ISD::ANDNP: {
40587 // ANDNP = (~LHS & RHS);
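// A result lane is known zero wherever the RHS lane is zero or the LHS lane
// is all-ones, so the corresponding lane of the other operand need not be
// demanded.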
40588 SDValue LHS = Op.getOperand(0);
40589 SDValue RHS = Op.getOperand(1);
40590
40591 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
40592 APInt UndefElts;
40593 SmallVector<APInt> EltBits;
40594 int NumElts = VT.getVectorNumElements();
40595 int EltSizeInBits = VT.getScalarSizeInBits();
40596 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
40597 APInt OpElts = DemandedElts;
40598 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40599 EltBits)) {
40600 OpBits.clearAllBits();
40601 OpElts.clearAllBits();
40602 for (int I = 0; I != NumElts; ++I)
40603 if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) ||
40604 (!Invert && !EltBits[I].isZero()))) {
40605 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
40606 OpElts.setBit(I);
40607 }
40608 }
40609 return std::make_pair(OpBits, OpElts);
40610 };
40611 std::pair<APInt, APInt> DemandLHS = GetDemandedMasks(RHS);
40612 std::pair<APInt, APInt> DemandRHS = GetDemandedMasks(LHS, true);
40613
40614 APInt LHSUndef, LHSZero;
40615 APInt RHSUndef, RHSZero;
40616 if (SimplifyDemandedVectorElts(LHS, DemandLHS.second, LHSUndef, LHSZero,
40617 TLO, Depth + 1))
40618 return true;
40619 if (SimplifyDemandedVectorElts(RHS, DemandRHS.second, RHSUndef, RHSZero,
40620 TLO, Depth + 1))
40621 return true;
40622
40623 if (!DemandedElts.isAllOnes()) {
40624 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
40625 LHS, DemandLHS.first, DemandLHS.second, TLO.DAG, Depth + 1);
40626 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
40627 RHS, DemandRHS.first, DemandRHS.second, TLO.DAG, Depth + 1);
40628 if (NewLHS || NewRHS) {
40629 NewLHS = NewLHS ? NewLHS : LHS;
40630 NewRHS = NewRHS ? NewRHS : RHS;
40631 return TLO.CombineTo(
40632 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40633 }
40634 }
40635 break;
40636 }
40637 case X86ISD::CVTSI2P:
40638 case X86ISD::CVTUI2P: {
40639 SDValue Src = Op.getOperand(0);
40640 MVT SrcVT = Src.getSimpleValueType();
40641 APInt SrcUndef, SrcZero;
40642 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40643 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40644 Depth + 1))
40645 return true;
40646 break;
40647 }
40648 case X86ISD::PACKSS:
40649 case X86ISD::PACKUS: {
40650 SDValue N0 = Op.getOperand(0);
40651 SDValue N1 = Op.getOperand(1);
40652
40653 APInt DemandedLHS, DemandedRHS;
40654 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40655
40656 APInt LHSUndef, LHSZero;
40657 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40658 Depth + 1))
40659 return true;
40660 APInt RHSUndef, RHSZero;
40661 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40662 Depth + 1))
40663 return true;
40664
40665 // TODO - pass on known zero/undef.
40666
40667 // Aggressively peek through ops to get at the demanded elts.
40668 // TODO - we should do this for all target/faux shuffles ops.
40669 if (!DemandedElts.isAllOnes()) {
40670 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40671 TLO.DAG, Depth + 1);
40672 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40673 TLO.DAG, Depth + 1);
40674 if (NewN0 || NewN1) {
40675 NewN0 = NewN0 ? NewN0 : N0;
40676 NewN1 = NewN1 ? NewN1 : N1;
40677 return TLO.CombineTo(Op,
40678 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40679 }
40680 }
40681 break;
40682 }
40683 case X86ISD::HADD:
40684 case X86ISD::HSUB:
40685 case X86ISD::FHADD:
40686 case X86ISD::FHSUB: {
40687 SDValue N0 = Op.getOperand(0);
40688 SDValue N1 = Op.getOperand(1);
40689
40690 APInt DemandedLHS, DemandedRHS;
40691 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40692
40693 APInt LHSUndef, LHSZero;
40694 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40695 Depth + 1))
40696 return true;
40697 APInt RHSUndef, RHSZero;
40698 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40699 Depth + 1))
40700 return true;
40701
40702 // TODO - pass on known zero/undef.
40703
40704 // Aggressively peek through ops to get at the demanded elts.
40705 // TODO: Handle repeated operands.
40706 if (N0 != N1 && !DemandedElts.isAllOnes()) {
40707 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40708 TLO.DAG, Depth + 1);
40709 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40710 TLO.DAG, Depth + 1);
40711 if (NewN0 || NewN1) {
40712 NewN0 = NewN0 ? NewN0 : N0;
40713 NewN1 = NewN1 ? NewN1 : N1;
40714 return TLO.CombineTo(Op,
40715 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40716 }
40717 }
40718 break;
40719 }
40720 case X86ISD::VTRUNC:
40721 case X86ISD::VTRUNCS:
40722 case X86ISD::VTRUNCUS: {
40723 SDValue Src = Op.getOperand(0);
40724 MVT SrcVT = Src.getSimpleValueType();
40725 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40726 APInt SrcUndef, SrcZero;
40727 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
40728 Depth + 1))
40729 return true;
40730 KnownZero = SrcZero.zextOrTrunc(NumElts);
40731 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
40732 break;
40733 }
40734 case X86ISD::BLENDV: {
40735 APInt SelUndef, SelZero;
40736 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
40737 SelZero, TLO, Depth + 1))
40738 return true;
40739
40740 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
40741 APInt LHSUndef, LHSZero;
40742 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
40743 LHSZero, TLO, Depth + 1))
40744 return true;
40745
40746 APInt RHSUndef, RHSZero;
40747 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
40748 RHSZero, TLO, Depth + 1))
40749 return true;
40750
40751 KnownZero = LHSZero & RHSZero;
40752 KnownUndef = LHSUndef & RHSUndef;
40753 break;
40754 }
40755 case X86ISD::VZEXT_MOVL: {
40756 // If upper demanded elements are already zero then we have nothing to do.
40757 SDValue Src = Op.getOperand(0);
40758 APInt DemandedUpperElts = DemandedElts;
40759 DemandedUpperElts.clearLowBits(1);
40760 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
40761 return TLO.CombineTo(Op, Src);
40762 break;
40763 }
40764 case X86ISD::VBROADCAST: {
40765 SDValue Src = Op.getOperand(0);
40766 MVT SrcVT = Src.getSimpleValueType();
40767 if (!SrcVT.isVector())
40768 break;
40769 // Don't bother broadcasting if we just need the 0'th element.
40770 if (DemandedElts == 1) {
40771 if (Src.getValueType() != VT)
40772 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
40773 SDLoc(Op));
40774 return TLO.CombineTo(Op, Src);
40775 }
40776 APInt SrcUndef, SrcZero;
40777 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
40778 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40779 Depth + 1))
40780 return true;
40781 // Aggressively peek through src to get at the demanded elt.
40782 // TODO - we should do this for all target/faux shuffles ops.
40783 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40784 Src, SrcElts, TLO.DAG, Depth + 1))
40785 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
40786 break;
40787 }
40788 case X86ISD::VPERMV:
40789 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
40790 Depth))
40791 return true;
40792 break;
40793 case X86ISD::PSHUFB:
40794 case X86ISD::VPERMV3:
40795 case X86ISD::VPERMILPV:
40796 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
40797 Depth))
40798 return true;
40799 break;
40800 case X86ISD::VPPERM:
40801 case X86ISD::VPERMIL2:
40802 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
40803 Depth))
40804 return true;
40805 break;
40806 }
40807
40808 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
40809 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
40810 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
40811 if ((VT.is256BitVector() || VT.is512BitVector()) &&
40812 DemandedElts.lshr(NumElts / 2) == 0) {
40813 unsigned SizeInBits = VT.getSizeInBits();
40814 unsigned ExtSizeInBits = SizeInBits / 2;
40815
40816 // See if 512-bit ops only use the bottom 128-bits.
40817 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
40818 ExtSizeInBits = SizeInBits / 4;
40819
40820 switch (Opc) {
40821 // Scalar broadcast.
40822 case X86ISD::VBROADCAST: {
40823 SDLoc DL(Op);
40824 SDValue Src = Op.getOperand(0);
40825 if (Src.getValueSizeInBits() > ExtSizeInBits)
40826 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
40827 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40828 ExtSizeInBits / VT.getScalarSizeInBits());
40829 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
40830 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40831 TLO.DAG, DL, ExtSizeInBits));
40832 }
40833 case X86ISD::VBROADCAST_LOAD: {
40834 SDLoc DL(Op);
40835 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40836 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40837 ExtSizeInBits / VT.getScalarSizeInBits());
40838 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
40839 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
40840 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
40841 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
40842 MemIntr->getMemOperand());
40843 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40844 Bcst.getValue(1));
40845 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40846 TLO.DAG, DL, ExtSizeInBits));
40847 }
40848 // Subvector broadcast.
40849 case X86ISD::SUBV_BROADCAST_LOAD: {
40850 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40851 EVT MemVT = MemIntr->getMemoryVT();
40852 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
40853 SDLoc DL(Op);
40854 SDValue Ld =
40855 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
40856 MemIntr->getBasePtr(), MemIntr->getMemOperand());
40857 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40858 Ld.getValue(1));
40859 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
40860 TLO.DAG, DL, ExtSizeInBits));
40861 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
40862 SDLoc DL(Op);
40863 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40864 ExtSizeInBits / VT.getScalarSizeInBits());
40865 if (SDValue BcstLd =
40866 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
40867 return TLO.CombineTo(Op,
40868 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
40869 TLO.DAG, DL, ExtSizeInBits));
40870 }
40871 break;
40872 }
40873 // Byte shifts by immediate.
40874 case X86ISD::VSHLDQ:
40875 case X86ISD::VSRLDQ:
40876 // Shift by uniform.
40877 case X86ISD::VSHL:
40878 case X86ISD::VSRL:
40879 case X86ISD::VSRA:
40880 // Shift by immediate.
40881 case X86ISD::VSHLI:
40882 case X86ISD::VSRLI:
40883 case X86ISD::VSRAI: {
40884 SDLoc DL(Op);
40885 SDValue Ext0 =
40886 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
40887 SDValue ExtOp =
40888 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
40889 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40890 SDValue Insert =
40891 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40892 return TLO.CombineTo(Op, Insert);
40893 }
40894 case X86ISD::VPERMI: {
40895 // Simplify PERMPD/PERMQ to extract_subvector.
40896 // TODO: This should be done in shuffle combining.
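// e.g. if only the lower 128 bits are demanded, (v4i64 vpermq X, <2,3,u,u>) is
// just the upper 128-bit half of X.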
40897 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
40898 SmallVector<int, 4> Mask;
40899 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
40900 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
40901 SDLoc DL(Op);
40902 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
40903 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40904 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
40905 return TLO.CombineTo(Op, Insert);
40906 }
40907 }
40908 break;
40909 }
40910 case X86ISD::VPERM2X128: {
40911 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
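// Bits [3:0] of the immediate control the low result lane: bit 3 zeroes it,
// bit 1 picks the source operand and bit 0 picks which 128-bit half of it.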
40912 SDLoc DL(Op);
40913 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
40914 if (LoMask & 0x8)
40915 return TLO.CombineTo(
40916 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
40917 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
40918 unsigned SrcIdx = (LoMask & 0x2) >> 1;
40919 SDValue ExtOp =
40920 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
40921 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40922 SDValue Insert =
40923 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40924 return TLO.CombineTo(Op, Insert);
40925 }
40926 // Zero upper elements.
40927 case X86ISD::VZEXT_MOVL:
40928 // Target unary shuffles by immediate:
40929 case X86ISD::PSHUFD:
40930 case X86ISD::PSHUFLW:
40931 case X86ISD::PSHUFHW:
40932 case X86ISD::VPERMILPI:
40933 // (Non-Lane Crossing) Target Shuffles.
40934 case X86ISD::VPERMILPV:
40935 case X86ISD::VPERMIL2:
40936 case X86ISD::PSHUFB:
40937 case X86ISD::UNPCKL:
40938 case X86ISD::UNPCKH:
40939 case X86ISD::BLENDI:
40940 // Integer ops.
40941 case X86ISD::PACKSS:
40942 case X86ISD::PACKUS:
40943 // Horizontal Ops.
40944 case X86ISD::HADD:
40945 case X86ISD::HSUB:
40946 case X86ISD::FHADD:
40947 case X86ISD::FHSUB: {
40948 SDLoc DL(Op);
40949 SmallVector<SDValue, 4> Ops;
40950 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
40951 SDValue SrcOp = Op.getOperand(i);
40952 EVT SrcVT = SrcOp.getValueType();
40953 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
40954 "Unsupported vector size");
40955 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
40956 ExtSizeInBits)
40957 : SrcOp);
40958 }
40959 MVT ExtVT = VT.getSimpleVT();
40960 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
40961 ExtSizeInBits / ExtVT.getScalarSizeInBits());
40962 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
40963 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40964 SDValue Insert =
40965 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40966 return TLO.CombineTo(Op, Insert);
40967 }
40968 }
40969 }
40970
40971 // For broadcasts, unless we *only* demand the 0'th element,
40972 // stop attempts at simplification here: we aren't going to improve things,
40973 // and this is better than any potential shuffle.
40974 if (isTargetShuffleSplat(Op) && !DemandedElts.isOne())
40975 return false;
40976
40977 // Get target/faux shuffle mask.
40978 APInt OpUndef, OpZero;
40979 SmallVector<int, 64> OpMask;
40980 SmallVector<SDValue, 2> OpInputs;
40981 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
40982 OpZero, TLO.DAG, Depth, false))
40983 return false;
40984
40985 // Shuffle inputs must be the same size as the result.
40986 if (OpMask.size() != (unsigned)NumElts ||
40987 llvm::any_of(OpInputs, [VT](SDValue V) {
40988 return VT.getSizeInBits() != V.getValueSizeInBits() ||
40989 !V.getValueType().isVector();
40990 }))
40991 return false;
40992
40993 KnownZero = OpZero;
40994 KnownUndef = OpUndef;
40995
40996 // Check if shuffle mask can be simplified to undef/zero/identity.
40997 int NumSrcs = OpInputs.size();
40998 for (int i = 0; i != NumElts; ++i)
40999 if (!DemandedElts[i])
41000 OpMask[i] = SM_SentinelUndef;
41001
41002 if (isUndefInRange(OpMask, 0, NumElts)) {
41003 KnownUndef.setAllBits();
41004 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
41005 }
41006 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
41007 KnownZero.setAllBits();
41008 return TLO.CombineTo(
41009 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41010 }
41011 for (int Src = 0; Src != NumSrcs; ++Src)
41012 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
41013 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
41014
41015 // Attempt to simplify inputs.
41016 for (int Src = 0; Src != NumSrcs; ++Src) {
41017 // TODO: Support inputs of different types.
41018 if (OpInputs[Src].getValueType() != VT)
41019 continue;
41020
41021 int Lo = Src * NumElts;
41022 APInt SrcElts = APInt::getZero(NumElts);
41023 for (int i = 0; i != NumElts; ++i)
41024 if (DemandedElts[i]) {
41025 int M = OpMask[i] - Lo;
41026 if (0 <= M && M < NumElts)
41027 SrcElts.setBit(M);
41028 }
41029
41030 // TODO - Propagate input undef/zero elts.
41031 APInt SrcUndef, SrcZero;
41032 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
41033 TLO, Depth + 1))
41034 return true;
41035 }
41036
41037 // If we don't demand all elements, then attempt to combine to a simpler
41038 // shuffle.
41039 // We need to convert the depth to something combineX86ShufflesRecursively
41040 // can handle - so pretend its Depth == 0 again, and reduce the max depth
41041 // to match. This prevents combineX86ShuffleChain from returning a
41042 // combined shuffle that's the same as the original root, causing an
41043 // infinite loop.
41044 if (!DemandedElts.isAllOnes()) {
41045 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
41046
41047 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
41048 for (int i = 0; i != NumElts; ++i)
41049 if (DemandedElts[i])
41050 DemandedMask[i] = i;
41051
41052 SDValue NewShuffle = combineX86ShufflesRecursively(
41053 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
41054 /*HasVarMask*/ false,
41055 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
41056 Subtarget);
41057 if (NewShuffle)
41058 return TLO.CombineTo(Op, NewShuffle);
41059 }
41060
41061 return false;
41062}
41063
41064bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
41065 SDValue Op, const APInt &OriginalDemandedBits,
41066 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
41067 unsigned Depth) const {
41068 EVT VT = Op.getValueType();
41069 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
41070 unsigned Opc = Op.getOpcode();
41071 switch(Opc) {
41072 case X86ISD::VTRUNC: {
41073 KnownBits KnownOp;
41074 SDValue Src = Op.getOperand(0);
41075 MVT SrcVT = Src.getSimpleValueType();
41076
41077 // Simplify the input, using demanded bit information.
41078 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
41079 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
41080 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
41081 return true;
41082 break;
41083 }
41084 case X86ISD::PMULDQ:
41085 case X86ISD::PMULUDQ: {
41086 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
41087 KnownBits KnownOp;
41088 SDValue LHS = Op.getOperand(0);
41089 SDValue RHS = Op.getOperand(1);
41090 // FIXME: Can we bound this better?
41091 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
41092 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
41093 TLO, Depth + 1))
41094 return true;
41095 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
41096 TLO, Depth + 1))
41097 return true;
41098
41099 // Aggressively peek through ops to get at the demanded low bits.
41100 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
41101 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41102 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
41103 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41104 if (DemandedLHS || DemandedRHS) {
41105 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
41106 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
41107 return TLO.CombineTo(
41108 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
41109 }
41110 break;
41111 }
41112 case X86ISD::VSHLI: {
41113 SDValue Op0 = Op.getOperand(0);
41114
41115 unsigned ShAmt = Op.getConstantOperandVal(1);
41116 if (ShAmt >= BitWidth)
41117 break;
41118
41119 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
41120
41121 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41122 // single shift. We can do this if the bottom bits (which are shifted
41123 // out) are never demanded.
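// e.g. (vshli (vsrli X, 5), 3) --> (vsrli X, 2), and
// (vshli (vsrli X, 3), 5) --> (vshli X, 2).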
41124 if (Op0.getOpcode() == X86ISD::VSRLI &&
41125 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
41126 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
41127 if (Shift2Amt < BitWidth) {
41128 int Diff = ShAmt - Shift2Amt;
41129 if (Diff == 0)
41130 return TLO.CombineTo(Op, Op0.getOperand(0));
41131
41132 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
41133 SDValue NewShift = TLO.DAG.getNode(
41134 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
41135 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
41136 return TLO.CombineTo(Op, NewShift);
41137 }
41138 }
41139
41140 // If we are only demanding sign bits then we can use the shift source directly.
41141 unsigned NumSignBits =
41142 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
41143 unsigned UpperDemandedBits =
41144 BitWidth - OriginalDemandedBits.countTrailingZeros();
41145 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41146 return TLO.CombineTo(Op, Op0);
41147
41148 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41149 TLO, Depth + 1))
41150 return true;
41151
41152 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41153 Known.Zero <<= ShAmt;
41154 Known.One <<= ShAmt;
41155
41156 // Low bits known zero.
41157 Known.Zero.setLowBits(ShAmt);
41158 return false;
41159 }
41160 case X86ISD::VSRLI: {
41161 unsigned ShAmt = Op.getConstantOperandVal(1);
41162 if (ShAmt >= BitWidth)
41163 break;
41164
41165 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41166
41167 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
41168 OriginalDemandedElts, Known, TLO, Depth + 1))
41169 return true;
41170
41171 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41172 Known.Zero.lshrInPlace(ShAmt);
41173 Known.One.lshrInPlace(ShAmt);
41174
41175 // High bits known zero.
41176 Known.Zero.setHighBits(ShAmt);
41177 return false;
41178 }
41179 case X86ISD::VSRAI: {
41180 SDValue Op0 = Op.getOperand(0);
41181 SDValue Op1 = Op.getOperand(1);
41182
41183 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
41184 if (ShAmt >= BitWidth)
41185 break;
41186
41187 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41188
41189 // If we just want the sign bit then we don't need to shift it.
41190 if (OriginalDemandedBits.isSignMask())
41191 return TLO.CombineTo(Op, Op0);
41192
41193 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
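// (the left shift only discards redundant copies of X's sign bit, so the
// arithmetic shift right restores X exactly).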
41194 if (Op0.getOpcode() == X86ISD::VSHLI &&
41195 Op.getOperand(1) == Op0.getOperand(1)) {
41196 SDValue Op00 = Op0.getOperand(0);
41197 unsigned NumSignBits =
41198 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
41199 if (ShAmt < NumSignBits)
41200 return TLO.CombineTo(Op, Op00);
41201 }
41202
41203 // If any of the demanded bits are produced by the sign extension, we also
41204 // demand the input sign bit.
41205 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
41206 DemandedMask.setSignBit();
41207
41208 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41209 TLO, Depth + 1))
41210 return true;
41211
41212 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41213 Known.Zero.lshrInPlace(ShAmt);
41214 Known.One.lshrInPlace(ShAmt);
41215
41216 // If the input sign bit is known to be zero, or if none of the top bits
41217 // are demanded, turn this into an unsigned shift right.
41218 if (Known.Zero[BitWidth - ShAmt - 1] ||
41219 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
41220 return TLO.CombineTo(
41221 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
41222
41223 // High bits are known one.
41224 if (Known.One[BitWidth - ShAmt - 1])
41225 Known.One.setHighBits(ShAmt);
41226 return false;
41227 }
41228 case X86ISD::BLENDV: {
41229 SDValue Sel = Op.getOperand(0);
41230 SDValue LHS = Op.getOperand(1);
41231 SDValue RHS = Op.getOperand(2);
41232
41233 APInt SignMask = APInt::getSignMask(BitWidth);
41234 SDValue NewSel = SimplifyMultipleUseDemandedBits(
41235 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41236 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
41237 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41238 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
41239 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41240
41241 if (NewSel || NewLHS || NewRHS) {
41242 NewSel = NewSel ? NewSel : Sel;
41243 NewLHS = NewLHS ? NewLHS : LHS;
41244 NewRHS = NewRHS ? NewRHS : RHS;
41245 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
41246 NewSel, NewLHS, NewRHS));
41247 }
41248 break;
41249 }
41250 case X86ISD::PEXTRB:
41251 case X86ISD::PEXTRW: {
41252 SDValue Vec = Op.getOperand(0);
41253 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
41254 MVT VecVT = Vec.getSimpleValueType();
41255 unsigned NumVecElts = VecVT.getVectorNumElements();
41256
41257 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
41258 unsigned Idx = CIdx->getZExtValue();
41259 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
41260
41261 // If we demand no bits from the vector then we must have demanded
41262 // bits from the implicit zext - simplify to zero.
41263 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
41264 if (DemandedVecBits == 0)
41265 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41266
41267 APInt KnownUndef, KnownZero;
41268 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
41269 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
41270 KnownZero, TLO, Depth + 1))
41271 return true;
41272
41273 KnownBits KnownVec;
41274 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
41275 KnownVec, TLO, Depth + 1))
41276 return true;
41277
41278 if (SDValue V = SimplifyMultipleUseDemandedBits(
41279 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
41280 return TLO.CombineTo(
41281 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
41282
41283 Known = KnownVec.zext(BitWidth);
41284 return false;
41285 }
41286 break;
41287 }
41288 case X86ISD::PINSRB:
41289 case X86ISD::PINSRW: {
41290 SDValue Vec = Op.getOperand(0);
41291 SDValue Scl = Op.getOperand(1);
41292 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41293 MVT VecVT = Vec.getSimpleValueType();
41294
41295 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
41296 unsigned Idx = CIdx->getZExtValue();
41297 if (!OriginalDemandedElts[Idx])
41298 return TLO.CombineTo(Op, Vec);
41299
41300 KnownBits KnownVec;
41301 APInt DemandedVecElts(OriginalDemandedElts);
41302 DemandedVecElts.clearBit(Idx);
41303 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
41304 KnownVec, TLO, Depth + 1))
41305 return true;
41306
41307 KnownBits KnownScl;
41308 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
41309 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
41310 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
41311 return true;
41312
41313 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
41314 Known = KnownBits::commonBits(KnownVec, KnownScl);
41315 return false;
41316 }
41317 break;
41318 }
41319 case X86ISD::PACKSS:
41320 // PACKSS saturates to MIN/MAX integer values. So if we just want the
41321 // sign bit then we can just ask for the source operands' sign bits.
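// (signed saturation clamps to the narrower type's MIN/MAX, which preserves
// the sign, so each packed element's sign bit equals its source's sign bit).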
41322 // TODO - add known bits handling.
41323 if (OriginalDemandedBits.isSignMask()) {
41324 APInt DemandedLHS, DemandedRHS;
41325 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
41326
41327 KnownBits KnownLHS, KnownRHS;
41328 APInt SignMask = APInt::getSignMask(BitWidth * 2);
41329 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
41330 KnownLHS, TLO, Depth + 1))
41331 return true;
41332 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
41333 KnownRHS, TLO, Depth + 1))
41334 return true;
41335
41336 // Attempt to avoid multi-use ops if we don't need anything from them.
41337 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
41338 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
41339 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
41340 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
41341 if (DemandedOp0 || DemandedOp1) {
41342 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
41343 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
41344 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
41345 }
41346 }
41347 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
41348 break;
41349 case X86ISD::VBROADCAST: {
41350 SDValue Src = Op.getOperand(0);
41351 MVT SrcVT = Src.getSimpleValueType();
41352 APInt DemandedElts = APInt::getOneBitSet(
41353 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
41354 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
41355 TLO, Depth + 1))
41356 return true;
41357 // If we don't need the upper bits, attempt to narrow the broadcast source.
41358 // Don't attempt this on AVX512 as it might affect broadcast folding.
41359 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
41360 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
41361 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
41362 Src->hasOneUse()) {
41363 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
41364 SDValue NewSrc =
41365 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
41366 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
41367 SDValue NewBcst =
41368 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
41369 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
41370 }
41371 break;
41372 }
41373 case X86ISD::PCMPGT:
41374 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41375 // iff we only need the sign bit then we can use R directly.
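// Each lane of (pcmpgt 0, R) is all-ones exactly when R is negative, so its
// sign bit always equals R's sign bit.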
41376 if (OriginalDemandedBits.isSignMask() &&
41377 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41378 return TLO.CombineTo(Op, Op.getOperand(1));
41379 break;
41380 case X86ISD::MOVMSK: {
41381 SDValue Src = Op.getOperand(0);
41382 MVT SrcVT = Src.getSimpleValueType();
41383 unsigned SrcBits = SrcVT.getScalarSizeInBits();
41384 unsigned NumElts = SrcVT.getVectorNumElements();
41385
41386 // If we don't need the sign bits at all just return zero.
41387 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
41388 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41389
41390 // See if we only demand bits from the lower 128-bit vector.
41391 if (SrcVT.is256BitVector() &&
41392 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
41393 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
41394 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41395 }
41396
41397 // Only demand the vector elements of the sign bits we need.
41398 APInt KnownUndef, KnownZero;
41399 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
41400 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
41401 TLO, Depth + 1))
41402 return true;
41403
41404 Known.Zero = KnownZero.zextOrSelf(BitWidth);
41405 Known.Zero.setHighBits(BitWidth - NumElts);
41406
41407 // MOVMSK only uses the MSB from each vector element.
41408 KnownBits KnownSrc;
41409 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
41410 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
41411 Depth + 1))
41412 return true;
41413
41414 if (KnownSrc.One[SrcBits - 1])
41415 Known.One.setLowBits(NumElts);
41416 else if (KnownSrc.Zero[SrcBits - 1])
41417 Known.Zero.setLowBits(NumElts);
41418
41419 // Attempt to avoid multi-use ops if we don't need anything from it.
41420 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
41421 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
41422 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41423 return false;
41424 }
41425 case X86ISD::BEXTR:
41426 case X86ISD::BEXTRI: {
41427 SDValue Op0 = Op.getOperand(0);
41428 SDValue Op1 = Op.getOperand(1);
41429
41430 // Only bottom 16-bits of the control bits are required.
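// The control operand encodes the extraction start bit in bits [7:0] and the
// extraction length in bits [15:8].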
41431 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
41432 // NOTE: SimplifyDemandedBits won't do this for constants.
41433 uint64_t Val1 = Cst1->getZExtValue();
41434 uint64_t MaskedVal1 = Val1 & 0xFFFF;
41435 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
41436 SDLoc DL(Op);
41437 return TLO.CombineTo(
41438 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
41439 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
41440 }
41441
41442 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
41443 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
41444
41445 // If the length is 0, the result is 0.
41446 if (Length == 0) {
41447 Known.setAllZero();
41448 return false;
41449 }
41450
41451 if ((Shift + Length) <= BitWidth) {
41452 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
41453 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
41454 return true;
41455
41456 Known = Known.extractBits(Length, Shift);
41457 Known = Known.zextOrTrunc(BitWidth);
41458 return false;
41459 }
41460 } else {
41461 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
41462 KnownBits Known1;
41463 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
41464 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
41465 return true;
41466
41467 // If the length is 0, replace with 0.
41468 KnownBits LengthBits = Known1.extractBits(8, 8);
41469 if (LengthBits.isZero())
41470 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41471 }
41472
41473 break;
41474 }
41475 case X86ISD::PDEP: {
41476 SDValue Op0 = Op.getOperand(0);
41477 SDValue Op1 = Op.getOperand(1);
41478
41479 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
41480 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
41481
41482 // If the demanded bits has leading zeroes, we don't demand those from the
41483 // mask.
41484 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
41485 return true;
41486
41487 // The number of possible 1s in the mask determines the number of LSBs of
41488 // operand 0 used. Undemanded bits from the mask don't matter so filter
41489 // them before counting.
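// e.g. if the mask can have at most 3 set bits, PDEP can only deposit the low
// 3 bits of operand 0, so only those bits are demanded.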
41490 KnownBits Known2;
41491 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
41492 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
41493 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
41494 return true;
41495
41496 // Zeroes are retained from the mask, but not ones.
41497 Known.One.clearAllBits();
41498 // The result will have at least as many trailing zeros as the non-mask
41499 // operand since bits can only map to the same or higher bit position.
41500 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
41501 return false;
41502 }
41503 }
41504
41505 return TargetLowering::SimplifyDemandedBitsForTargetNode(
41506 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
41507}
41508
41509SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41510 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
41511 SelectionDAG &DAG, unsigned Depth) const {
41512 int NumElts = DemandedElts.getBitWidth();
41513 unsigned Opc = Op.getOpcode();
41514 EVT VT = Op.getValueType();
41515
41516 switch (Opc) {
41517 case X86ISD::PINSRB:
41518 case X86ISD::PINSRW: {
41519 // If we don't demand the inserted element, return the base vector.
41520 SDValue Vec = Op.getOperand(0);
41521 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41522 MVT VecVT = Vec.getSimpleValueType();
41523 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
41524 !DemandedElts[CIdx->getZExtValue()])
41525 return Vec;
41526 break;
41527 }
41528 case X86ISD::VSHLI: {
41529 // If we are only demanding sign bits then we can use the shift source
41530 // directly.
41531 SDValue Op0 = Op.getOperand(0);
41532 unsigned ShAmt = Op.getConstantOperandVal(1);
41533 unsigned BitWidth = DemandedBits.getBitWidth();
41534 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
41535 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
41536 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41537 return Op0;
41538 break;
41539 }
41540 case X86ISD::VSRAI:
41541 // iff we only need the sign bit then we can use the source directly.
41542 // TODO: generalize where we only demand extended signbits.
41543 if (DemandedBits.isSignMask())
41544 return Op.getOperand(0);
41545 break;
41546 case X86ISD::PCMPGT:
41547 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41548 // iff we only need the sign bit then we can use R directly.
41549 if (DemandedBits.isSignMask() &&
41550 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41551 return Op.getOperand(1);
41552 break;
41553 }
41554
41555 APInt ShuffleUndef, ShuffleZero;
41556 SmallVector<int, 16> ShuffleMask;
41557 SmallVector<SDValue, 2> ShuffleOps;
41558 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
41559 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
41560 // If all the demanded elts are from one operand and are inline,
41561 // then we can use the operand directly.
41562 int NumOps = ShuffleOps.size();
41563 if (ShuffleMask.size() == (unsigned)NumElts &&
41564 llvm::all_of(ShuffleOps, [VT](SDValue V) {
41565 return VT.getSizeInBits() == V.getValueSizeInBits();
41566 })) {
41567
41568 if (DemandedElts.isSubsetOf(ShuffleUndef))
41569 return DAG.getUNDEF(VT);
41570 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
41571 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
41572
41573 // Bitmask that indicates which ops have only been accessed 'inline'.
41574 APInt IdentityOp = APInt::getAllOnes(NumOps);
41575 for (int i = 0; i != NumElts; ++i) {
41576 int M = ShuffleMask[i];
41577 if (!DemandedElts[i] || ShuffleUndef[i])
41578 continue;
41579 int OpIdx = M / NumElts;
41580 int EltIdx = M % NumElts;
41581 if (M < 0 || EltIdx != i) {
41582 IdentityOp.clearAllBits();
41583 break;
41584 }
41585 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
41586 if (IdentityOp == 0)
41587 break;
41588 }
41589 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
41590 "Multiple identity shuffles detected");
41591
41592 if (IdentityOp != 0)
41593 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
41594 }
41595 }
41596
41597 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41598 Op, DemandedBits, DemandedElts, DAG, Depth);
41599}
41600
41601bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
41602 const APInt &DemandedElts,
41603 APInt &UndefElts,
41604 unsigned Depth) const {
41605 unsigned NumElts = DemandedElts.getBitWidth();
41606 unsigned Opc = Op.getOpcode();
41607
41608 switch (Opc) {
41609 case X86ISD::VBROADCAST:
41610 case X86ISD::VBROADCAST_LOAD:
41611 UndefElts = APInt::getNullValue(NumElts);
41612 return true;
41613 }
41614
41615 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
41616 Depth);
41617}
41618
41619// Helper to peek through bitops/trunc/setcc to determine size of source vector.
41620// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
41621static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
41622 bool AllowTruncate) {
41623 switch (Src.getOpcode()) {
41624 case ISD::TRUNCATE:
41625 if (!AllowTruncate)
41626 return false;
41627 LLVM_FALLTHROUGH;
41628 case ISD::SETCC:
41629 return Src.getOperand(0).getValueSizeInBits() == Size;
41630 case ISD::AND:
41631 case ISD::XOR:
41632 case ISD::OR:
41633 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
41634 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
41635 }
41636 return false;
41637}
41638
41639// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
41640static unsigned getAltBitOpcode(unsigned Opcode) {
41641 switch(Opcode) {
41642 case ISD::AND: return X86ISD::FAND;
41643 case ISD::OR: return X86ISD::FOR;
41644 case ISD::XOR: return X86ISD::FXOR;
41645 case X86ISD::ANDNP: return X86ISD::FANDN;
41646 }
41647 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41647)
;
41648}
41649
41650// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
41651static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
41652 const SDLoc &DL) {
41653 EVT SrcVT = Src.getValueType();
41654 if (SrcVT != MVT::v4i1)
41655 return SDValue();
41656
41657 switch (Src.getOpcode()) {
41658 case ISD::SETCC:
41659 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
41660 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
41661 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
41662 SDValue Op0 = Src.getOperand(0);
41663 if (ISD::isNormalLoad(Op0.getNode()))
41664 return DAG.getBitcast(MVT::v4f32, Op0);
41665 if (Op0.getOpcode() == ISD::BITCAST &&
41666 Op0.getOperand(0).getValueType() == MVT::v4f32)
41667 return Op0.getOperand(0);
41668 }
41669 break;
41670 case ISD::AND:
41671 case ISD::XOR:
41672 case ISD::OR: {
41673 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
41674 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
41675 if (Op0 && Op1)
41676 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
41677 Op1);
41678 break;
41679 }
41680 }
41681 return SDValue();
41682}
41683
41684// Helper to push sign extension of vXi1 SETCC result through bitops.
41685static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
41686 SDValue Src, const SDLoc &DL) {
41687 switch (Src.getOpcode()) {
41688 case ISD::SETCC:
41689 case ISD::TRUNCATE:
41690 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41691 case ISD::AND:
41692 case ISD::XOR:
41693 case ISD::OR:
41694 return DAG.getNode(
41695 Src.getOpcode(), DL, SExtVT,
41696 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
41697 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
41698 }
41699 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41699)
;
41700}
41701
41702// Try to match patterns such as
41703// (i16 bitcast (v16i1 x))
41704// ->
41705// (i16 movmsk (16i8 sext (v16i1 x)))
41706// before the illegal vector is scalarized on subtargets that don't have legal
41707// vxi1 types.
41708static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
41709 const SDLoc &DL,
41710 const X86Subtarget &Subtarget) {
41711 EVT SrcVT = Src.getValueType();
41712 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
41713 return SDValue();
41714
41715 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
41716 // legalization destroys the v4i32 type.
41717 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
41718 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
41719 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
41720 DAG.getBitcast(MVT::v4f32, V));
41721 return DAG.getZExtOrTrunc(V, DL, VT);
41722 }
41723 }
41724
41725 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
41726 // movmskb even with avx512. This will be better than truncating to vXi1 and
41727 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
41728 // vpcmpeqb/vpcmpgtb.
41729 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
41730 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
41731 Src.getOperand(0).getValueType() == MVT::v32i8 ||
41732 Src.getOperand(0).getValueType() == MVT::v64i8);
41733
41734 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
41735 // directly with vpmovmskb/vmovmskps/vmovmskpd.
41736 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
41737 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
41738 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
41739 EVT CmpVT = Src.getOperand(0).getValueType();
41740 EVT EltVT = CmpVT.getVectorElementType();
41741 if (CmpVT.getSizeInBits() <= 256 &&
41742 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
41743 PreferMovMsk = true;
41744 }
41745
41746 // With AVX512 vxi1 types are legal and we prefer using k-regs.
41747 // MOVMSK is supported in SSE2 or later.
41748 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
41749 return SDValue();
41750
41751 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
41752 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
41753 // v8i16 and v16i16.
41754 // For these two cases, we can shuffle the upper element bytes to a
41755 // consecutive sequence at the start of the vector and treat the results as
41756 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
41757 // for v16i16 this is not the case, because the shuffle is expensive, so we
41758 // avoid sign-extending to this type entirely.
41759 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
41760 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
41761 MVT SExtVT;
41762 bool PropagateSExt = false;
41763 switch (SrcVT.getSimpleVT().SimpleTy) {
41764 default:
41765 return SDValue();
41766 case MVT::v2i1:
41767 SExtVT = MVT::v2i64;
41768 break;
41769 case MVT::v4i1:
41770 SExtVT = MVT::v4i32;
41771 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
41772 // sign-extend to a 256-bit operation to avoid truncation.
41773 if (Subtarget.hasAVX() &&
41774 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
41775 SExtVT = MVT::v4i64;
41776 PropagateSExt = true;
41777 }
41778 break;
41779 case MVT::v8i1:
41780 SExtVT = MVT::v8i16;
41781 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
41782 // sign-extend to a 256-bit operation to match the compare.
41783 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
41784 // 256-bit because the shuffle is cheaper than sign extending the result of
41785 // the compare.
41786 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
41787 checkBitcastSrcVectorSize(Src, 512, true))) {
41788 SExtVT = MVT::v8i32;
41789 PropagateSExt = true;
41790 }
41791 break;
41792 case MVT::v16i1:
41793 SExtVT = MVT::v16i8;
41794 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
41795 // it is not profitable to sign-extend to 256-bit because this will
41796 // require an extra cross-lane shuffle which is more expensive than
41797 // truncating the result of the compare to 128-bits.
41798 break;
41799 case MVT::v32i1:
41800 SExtVT = MVT::v32i8;
41801 break;
41802 case MVT::v64i1:
41803 // If we have AVX512F, but not AVX512BW, and the input is truncated from
41804 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
41805 if (Subtarget.hasAVX512()) {
41806 if (Subtarget.hasBWI())
41807 return SDValue();
41808 SExtVT = MVT::v64i8;
41809 break;
41810 }
41811 // Split if this is a <64 x i8> comparison result.
41812 if (checkBitcastSrcVectorSize(Src, 512, false)) {
41813 SExtVT = MVT::v64i8;
41814 break;
41815 }
41816 return SDValue();
41817 };
41818
41819 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
41820 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41821
41822 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
41823 V = getPMOVMSKB(DL, V, DAG, Subtarget);
41824 } else {
41825 if (SExtVT == MVT::v8i16)
41826 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
41827 DAG.getUNDEF(MVT::v8i16));
41828 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
41829 }
41830
41831 EVT IntVT =
41832 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
41833 V = DAG.getZExtOrTrunc(V, DL, IntVT);
41834 return DAG.getBitcast(VT, V);
41835}
41836
41837// Convert a vXi1 constant build vector to the same width scalar integer.
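// e.g. (v4i1 <1,0,1,1>) becomes the i4 constant 0b1101.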
41838static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
41839 EVT SrcVT = Op.getValueType();
41840 assert(SrcVT.getVectorElementType() == MVT::i1 &&
41841 "Expected a vXi1 vector");
41842 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
41843 "Expected a constant build vector");
41844
41845 APInt Imm(SrcVT.getVectorNumElements(), 0);
41846 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
41847 SDValue In = Op.getOperand(Idx);
41848 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
41849 Imm.setBit(Idx);
41850 }
41851 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
41852 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
41853}
41854
41855static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
41856 TargetLowering::DAGCombinerInfo &DCI,
41857 const X86Subtarget &Subtarget) {
41858 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
41859
41860 if (!DCI.isBeforeLegalizeOps())
41861 return SDValue();
41862
41863 // Only do this if we have k-registers.
41864 if (!Subtarget.hasAVX512())
41865 return SDValue();
41866
41867 EVT DstVT = N->getValueType(0);
41868 SDValue Op = N->getOperand(0);
41869 EVT SrcVT = Op.getValueType();
41870
41871 if (!Op.hasOneUse())
41872 return SDValue();
41873
41874 // Look for logic ops.
41875 if (Op.getOpcode() != ISD::AND &&
41876 Op.getOpcode() != ISD::OR &&
41877 Op.getOpcode() != ISD::XOR)
41878 return SDValue();
41879
41880 // Make sure we have a bitcast between mask registers and a scalar type.
41881 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
41882 DstVT.isScalarInteger()) &&
41883 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
41884 SrcVT.isScalarInteger()))
41885 return SDValue();
41886
41887 SDValue LHS = Op.getOperand(0);
41888 SDValue RHS = Op.getOperand(1);
41889
41890 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
41891 LHS.getOperand(0).getValueType() == DstVT)
41892 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
41893 DAG.getBitcast(DstVT, RHS));
41894
41895 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
41896 RHS.getOperand(0).getValueType() == DstVT)
41897 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41898 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
41899
41900 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
41901 // Most of these have to move a constant from the scalar domain anyway.
41902 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
41903 RHS = combinevXi1ConstantToInteger(RHS, DAG);
41904 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41905 DAG.getBitcast(DstVT, LHS), RHS);
41906 }
41907
41908 return SDValue();
41909}
41910
41911static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
41912 const X86Subtarget &Subtarget) {
41913 SDLoc DL(BV);
41914 unsigned NumElts = BV->getNumOperands();
41915 SDValue Splat = BV->getSplatValue();
41916
41917 // Build MMX element from integer GPR or SSE float values.
41918 auto CreateMMXElement = [&](SDValue V) {
41919 if (V.isUndef())
41920 return DAG.getUNDEF(MVT::x86mmx);
41921 if (V.getValueType().isFloatingPoint()) {
41922 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
41923 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
41924 V = DAG.getBitcast(MVT::v2i64, V);
41925 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
41926 }
41927 V = DAG.getBitcast(MVT::i32, V);
41928 } else {
41929 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
41930 }
41931 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
41932 };
41933
41934 // Convert build vector ops to MMX data in the bottom elements.
41935 SmallVector<SDValue, 8> Ops;
41936
41937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41938
41939 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
41940 if (Splat) {
41941 if (Splat.isUndef())
41942 return DAG.getUNDEF(MVT::x86mmx);
41943
41944 Splat = CreateMMXElement(Splat);
41945
41946 if (Subtarget.hasSSE1()) {
41947 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
41948 if (NumElts == 8)
41949 Splat = DAG.getNode(
41950 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41951 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
41952 TLI.getPointerTy(DAG.getDataLayout())),
41953 Splat, Splat);
41954
41955 // Use PSHUFW to repeat 16-bit elements.
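// An immediate of 0 broadcasts word 0 to all four words; 0x44 (0b01000100)
// repeats words {0,1} across both halves for the 32-bit element case.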
41956 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
41957 return DAG.getNode(
41958 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41959 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
41960 TLI.getPointerTy(DAG.getDataLayout())),
41961 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
41962 }
41963 Ops.append(NumElts, Splat);
41964 } else {
41965 for (unsigned i = 0; i != NumElts; ++i)
41966 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
41967 }
41968
41969 // Use tree of PUNPCKLs to build up general MMX vector.
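// Each round interleaves adjacent pairs and halves the operand count, e.g.
// 8 byte elements -> punpcklbw -> 4 words -> punpcklwd -> 2 dwords -> punpckldq -> 1.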
41970 while (Ops.size() > 1) {
41971 unsigned NumOps = Ops.size();
41972 unsigned IntrinOp =
41973 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
41974 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
41975 : Intrinsic::x86_mmx_punpcklbw));
41976 SDValue Intrin = DAG.getTargetConstant(
41977 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
41978 for (unsigned i = 0; i != NumOps; i += 2)
41979 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
41980 Ops[i], Ops[i + 1]);
41981 Ops.resize(NumOps / 2);
41982 }
41983
41984 return Ops[0];
41985}
41986
41987// Recursive function that attempts to find if a bool vector node was originally
41988// a vector/float/double that got truncated/extended/bitcast to/from a scalar
41989// integer. If so, replace the scalar ops with bool vector equivalents back down
41990// the chain.
41991static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
41992 SelectionDAG &DAG,
41993 const X86Subtarget &Subtarget) {
41994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41995 unsigned Opc = V.getOpcode();
41996 switch (Opc) {
41997 case ISD::BITCAST: {
41998 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
41999 SDValue Src = V.getOperand(0);
42000 EVT SrcVT = Src.getValueType();
42001 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
42002 return DAG.getBitcast(VT, Src);
42003 break;
42004 }
42005 case ISD::TRUNCATE: {
42006 // If we find a suitable source, a truncated scalar becomes a subvector.
42007 SDValue Src = V.getOperand(0);
42008 EVT NewSrcVT =
42009 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
42010 if (TLI.isTypeLegal(NewSrcVT))
42011 if (SDValue N0 =
42012 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42013 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
42014 DAG.getIntPtrConstant(0, DL));
42015 break;
42016 }
42017 case ISD::ANY_EXTEND:
42018 case ISD::ZERO_EXTEND: {
42019 // If we find a suitable source, an extended scalar becomes a subvector.
42020 SDValue Src = V.getOperand(0);
42021 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
42022 Src.getScalarValueSizeInBits());
42023 if (TLI.isTypeLegal(NewSrcVT))
42024 if (SDValue N0 =
42025 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42026 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42027 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
42028 : DAG.getConstant(0, DL, VT),
42029 N0, DAG.getIntPtrConstant(0, DL));
42030 break;
42031 }
42032 case ISD::OR: {
42033 // If we find suitable sources, we can just move an OR to the vector domain.
42034 SDValue Src0 = V.getOperand(0);
42035 SDValue Src1 = V.getOperand(1);
42036 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42037 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
42038 return DAG.getNode(Opc, DL, VT, N0, N1);
42039 break;
42040 }
42041 case ISD::SHL: {
42042 // If we find a suitable source, a SHL becomes a KSHIFTL.
42043 SDValue Src0 = V.getOperand(0);
42044 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
42045 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
42046 break;
42047
42048 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
42049 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42050 return DAG.getNode(
42051 X86ISD::KSHIFTL, DL, VT, N0,
42052 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
42053 break;
42054 }
42055 }
42056 return SDValue();
42057}
42058
42059static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
42060 TargetLowering::DAGCombinerInfo &DCI,
42061 const X86Subtarget &Subtarget) {
42062 SDValue N0 = N->getOperand(0);
42063 EVT VT = N->getValueType(0);
42064 EVT SrcVT = N0.getValueType();
42065 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42066
42067 // Try to match patterns such as
42068 // (i16 bitcast (v16i1 x))
42069 // ->
42070 // (i16 movmsk (16i8 sext (v16i1 x)))
42071 // before the setcc result is scalarized on subtargets that don't have legal
42072 // vxi1 types.
42073 if (DCI.isBeforeLegalize()) {
42074 SDLoc dl(N);
42075 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
42076 return V;
42077
42078 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42079 // type, widen both sides to avoid a trip through memory.
42080 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
42081 Subtarget.hasAVX512()) {
42082 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
42083 N0 = DAG.getBitcast(MVT::v8i1, N0);
42084 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
42085 DAG.getIntPtrConstant(0, dl));
42086 }
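// For example, with AVX-512 and a hypothetical i4 value X:
//   (v4i1 bitcast (i4 X))
//   --> (v4i1 extract_subvector (v8i1 bitcast (i8 any_extend X)), 0)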
42087
42088 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42089 // type, widen both sides to avoid a trip through memory.
42090 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
42091 Subtarget.hasAVX512()) {
42092 // Use zeros for the widening if we already have some zeroes. This can
42093 // allow SimplifyDemandedBits to remove scalar ANDs that may be
42094 // downstream of this.
42095 // FIXME: It might make sense to detect a concat_vectors with a mix of
42096 // zeroes and undef and turn it into insert_subvector for i1 vectors as
42097 // a separate combine. What we can't do is canonicalize the operands of
42098 // such a concat or we'll get into a loop with SimplifyDemandedBits.
42099 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
42100 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
42101 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
42102 SrcVT = LastOp.getValueType();
42103 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42104 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
42105 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
42106 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42107 N0 = DAG.getBitcast(MVT::i8, N0);
42108 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42109 }
42110 }
42111
42112 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42113 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
42114 Ops[0] = N0;
42115 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42116 N0 = DAG.getBitcast(MVT::i8, N0);
42117 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42118 }
42119 } else {
42120 // If we're bitcasting from iX to vXi1, see if the integer originally
42121 // began as a vXi1 and whether we can remove the bitcast entirely.
42122 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
42123 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
42124 if (SDValue V =
42125 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
42126 return V;
42127 }
42128 }
42129
42130 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
42131 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
42132 // due to insert_subvector legalization on KNL. By promoting the copy to i16
42133 // we can help with known bits propagation from the vXi1 domain to the
42134 // scalar domain.
42135 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
42136 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42137 N0.getOperand(0).getValueType() == MVT::v16i1 &&
42138 isNullConstant(N0.getOperand(1)))
42139 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
42140 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
42141
42142 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
42143 // and the vbroadcast_load are both integer or both fp. In some cases this
42144 // will remove the bitcast entirely.
42145 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
42146 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
42147 auto *BCast = cast<MemIntrinsicSDNode>(N0);
42148 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
42149 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
42150 // Don't swap i8/i16 since we don't have fp types of that size.
42151 if (MemSize >= 32) {
42152 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
42153 : MVT::getIntegerVT(MemSize);
42154 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
42155 : MVT::getIntegerVT(SrcVTSize);
42156 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
42157
42158 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
42159 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
42160 SDValue ResNode =
42161 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
42162 MemVT, BCast->getMemOperand());
42163 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
42164 return DAG.getBitcast(VT, ResNode);
42165 }
42166 }
42167
42168 // Since MMX types are special and don't usually play with other vector types,
42169 // it's better to handle them early to be sure we emit efficient code by
42170 // avoiding store-load conversions.
42171 if (VT == MVT::x86mmx) {
42172 // Detect MMX constant vectors.
42173 APInt UndefElts;
42174 SmallVector<APInt, 1> EltBits;
42175 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
42176 SDLoc DL(N0);
42177 // Handle zero-extension of i32 with MOVD.
42178 if (EltBits[0].countLeadingZeros() >= 32)
42179 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
42180 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
42181 // Else, bitcast to a double.
42182 // TODO - investigate supporting sext 32-bit immediates on x86_64.
42183 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
42184 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
42185 }
42186
42187 // Detect bitcasts to x86mmx low word.
42188 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42189 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
42190 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
42191 bool LowUndef = true, AllUndefOrZero = true;
42192 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
42193 SDValue Op = N0.getOperand(i);
42194 LowUndef &= Op.isUndef() || (i >= e/2);
42195 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
42196 }
42197 if (AllUndefOrZero) {
42198 SDValue N00 = N0.getOperand(0);
42199 SDLoc dl(N00);
42200 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
42201 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
42202 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
42203 }
42204 }
42205
42206 // Detect bitcasts of 64-bit build vectors and convert to a
42207 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
42208 // lowest element.
42209 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42210 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
42211 SrcVT == MVT::v8i8))
42212 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
42213
42214 // Detect bitcasts between element or subvector extraction to x86mmx.
42215 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
42216 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
42217 isNullConstant(N0.getOperand(1))) {
42218 SDValue N00 = N0.getOperand(0);
42219 if (N00.getValueType().is128BitVector())
42220 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
42221 DAG.getBitcast(MVT::v2i64, N00));
42222 }
42223
42224 // Detect bitcasts from FP_TO_SINT to x86mmx.
42225 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
42226 SDLoc DL(N0);
42227 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
42228 DAG.getUNDEF(MVT::v2i32));
42229 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
42230 DAG.getBitcast(MVT::v2i64, Res));
42231 }
42232 }
42233
42234 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
42235 // most of these to scalar anyway.
42236 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
42237 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
42238 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
42239 return combinevXi1ConstantToInteger(N0, DAG);
42240 }
42241
42242 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42243 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42244 isa<ConstantSDNode>(N0)) {
42245 auto *C = cast<ConstantSDNode>(N0);
42246 if (C->isAllOnes())
42247 return DAG.getConstant(1, SDLoc(N0), VT);
42248 if (C->isZero())
42249 return DAG.getConstant(0, SDLoc(N0), VT);
42250 }
42251
42252 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
42253 // Turn it into a sign bit compare that produces a k-register. This avoids
42254 // a trip through a GPR.
42255 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42256 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42257 isPowerOf2_32(VT.getVectorNumElements())) {
42258 unsigned NumElts = VT.getVectorNumElements();
42259 SDValue Src = N0;
42260
42261 // Peek through truncate.
42262 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
42263 Src = N0.getOperand(0);
42264
42265 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
42266 SDValue MovmskIn = Src.getOperand(0);
42267 MVT MovmskVT = MovmskIn.getSimpleValueType();
42268 unsigned MovMskElts = MovmskVT.getVectorNumElements();
42269
42270 // We allow extra bits of the movmsk to be used since they are known zero.
42271 // We can't convert a VPMOVMSKB without avx512bw.
42272 if (MovMskElts <= NumElts &&
42273 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
42274 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
42275 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
42276 SDLoc dl(N);
42277 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
42278 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
42279 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
42280 if (EVT(CmpVT) == VT)
42281 return Cmp;
42282
42283 // Pad with zeroes up to original VT to replace the zeroes that were
42284 // being used from the MOVMSK.
42285 unsigned NumConcats = NumElts / MovMskElts;
42286 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
42287 Ops[0] = Cmp;
42288 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
42289 }
42290 }
42291 }
42292
42293 // Try to remove bitcasts from input and output of mask arithmetic to
42294 // remove GPR<->K-register crossings.
42295 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
42296 return V;
42297
42298 // Convert a bitcasted integer logic operation that has one bitcasted
42299 // floating-point operand into a floating-point logic operation. This may
42300 // create a load of a constant, but that is cheaper than materializing the
42301 // constant in an integer register and transferring it to an SSE register or
42302 // transferring the SSE operand to integer register and back.
42303 unsigned FPOpcode;
42304 switch (N0.getOpcode()) {
42305 case ISD::AND: FPOpcode = X86ISD::FAND; break;
42306 case ISD::OR: FPOpcode = X86ISD::FOR; break;
42307 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
42308 default: return SDValue();
42309 }
42310
42311 // Check if we have a bitcast from another integer type as well.
42312 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
42313 (Subtarget.hasSSE2() && VT == MVT::f64) ||
42314 (Subtarget.hasFP16() && VT == MVT::f16) ||
42315 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
42316 TLI.isTypeLegal(VT))))
42317 return SDValue();
42318
42319 SDValue LogicOp0 = N0.getOperand(0);
42320 SDValue LogicOp1 = N0.getOperand(1);
42321 SDLoc DL0(N0);
42322
42323 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
42324 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
42325 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
42326 LogicOp0.getOperand(0).getValueType() == VT &&
42327 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
42328 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
42329 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42330 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
42331 }
42332 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
42333 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
42334 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
42335 LogicOp1.getOperand(0).getValueType() == VT &&
42336 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
42337 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
42338 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42339 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
42340 }
42341
42342 return SDValue();
42343}
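// A small worked example of the final logic-op fold above, assuming SSE1, a
// hypothetical non-constant f32 value X and single-use operands (the classic
// fabs-style mask):
//   (f32 bitcast (i32 and (i32 bitcast (f32 X)), 0x7fffffff))
//   --> (f32 X86ISD::FAND X, (f32 bitcast (i32 0x7fffffff)))
// so the value stays in an SSE register instead of crossing to a GPR and back.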
42344
42345// (mul (zext a), (sext b))
42346static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
42347 SDValue &Op1) {
42348 Op0 = Mul.getOperand(0);
42349 Op1 = Mul.getOperand(1);
42350
42351 // Canonicalize so that operand 1 is the sign-extended value.
42352 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
42353 std::swap(Op0, Op1);
42354
42355 auto IsFreeTruncation = [](SDValue &Op) -> bool {
42356 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
42357 Op.getOpcode() == ISD::SIGN_EXTEND) &&
42358 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
42359 return true;
42360
42361 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
42362 return (BV && BV->isConstant());
42363 };
42364
42365 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
42366 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
42367 // signed value, so we just check its significant (signed) bits.
42368 if ((IsFreeTruncation(Op0) &&
42369 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
42370 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
42371 return true;
42372
42373 return false;
42374}
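// For example, detectExtMul accepts (in either operand order) a node such as
//   (mul (zero_extend (v16i8 A) to v16i32), (sign_extend (v16i8 B) to v16i32))
// for hypothetical v16i8 values A and B, since both operands truncate freely to
// i8 and their active/significant bits fit in 8 bits.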
42375
42376// Given an ABS node, detect the following pattern:
42377// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
42378// This is useful as it is the input into a SAD pattern.
42379static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
42380 SDValue AbsOp1 = Abs->getOperand(0);
42381 if (AbsOp1.getOpcode() != ISD::SUB)
42382 return false;
42383
42384 Op0 = AbsOp1.getOperand(0);
42385 Op1 = AbsOp1.getOperand(1);
42386
42387 // Check if the operands of the sub are zero-extended from vectors of i8.
42388 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
42389 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
42390 Op1.getOpcode() != ISD::ZERO_EXTEND ||
42391 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
42392 return false;
42393
42394 return true;
42395}
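// For example, detectZextAbsDiff matches, for hypothetical v16i8 values A and B,
//   (abs (sub (zero_extend (v16i8 A) to v16i32), (zero_extend (v16i8 B) to v16i32)))
// and returns the two zero_extend operands, which then feed the PSADBW lowering.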
42396
42397static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
42398 unsigned &LogBias, const SDLoc &DL,
42399 const X86Subtarget &Subtarget) {
42400 // Extend or truncate to MVT::i8 first.
42401 MVT Vi8VT =
42402 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
42403 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
42404 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
42405
42406 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
42407 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
42408 // The src A, B element type is i8, but the dst C element type is i32.
42409 // When we calculate the number of reduction stages we use the src vector
42410 // type vXi8, so we need a log-bias of 2 to avoid 2 extra stages.
42411 LogBias = 2;
42412
42413 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
42414 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
42415 RegSize = std::max(512u, RegSize);
42416
42417 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
42418 // fill in the missing vector elements with 0.
42419 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
42420 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
42421 Ops[0] = LHS;
42422 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42423 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42424 Ops[0] = RHS;
42425 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42426
42427 // Actually build the DotProduct, split as 256/512 bits for
42428 // AVXVNNI/AVX512VNNI.
42429 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42430 ArrayRef<SDValue> Ops) {
42431 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
42432 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
42433 };
42434 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
42435 SDValue Zero = DAG.getConstant(0, DL, DpVT);
42436
42437 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
42438 DpBuilder, false);
42439}
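// Worked example of the log-bias, assuming the 128-bit form is usable
// (AVX-VNNI or AVX512VL+VNNI) and v16i8 inputs: VPDPBUSD produces a v4i32 in
// which each 32-bit lane already accumulates 4 of the 16 byte products, so the
// caller only needs Log2(16) - LogBias = 4 - 2 = 2 shuffle+add stages to
// finish the reduction.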
42440
42441// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
42442// to these zexts.
42443static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
42444 const SDValue &Zext1, const SDLoc &DL,
42445 const X86Subtarget &Subtarget) {
42446 // Find the appropriate width for the PSADBW.
42447 EVT InVT = Zext0.getOperand(0).getValueType();
42448 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
42449
42450 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
42451 // fill in the missing vector elements with 0.
42452 unsigned NumConcat = RegSize / InVT.getSizeInBits();
42453 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
42454 Ops[0] = Zext0.getOperand(0);
42455 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42456 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42457 Ops[0] = Zext1.getOperand(0);
42458 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42459
42460 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
42461 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42462 ArrayRef<SDValue> Ops) {
42463 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
42464 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
42465 };
42466 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
42467 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
42468 PSADBWBuilder);
42469}
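// For reference, PSADBW on two v16i8 inputs computes, per 8-byte group, the sum
// of absolute byte differences and zero-extends it into the corresponding i64
// lane, so the v2i64 result holds two partial sums that the caller combines
// with further shuffles and adds.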
42470
42471// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
42472// PHMINPOSUW.
42473static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
42474 const X86Subtarget &Subtarget) {
42475 // Bail without SSE41.
42476 if (!Subtarget.hasSSE41())
42477 return SDValue();
42478
42479 EVT ExtractVT = Extract->getValueType(0);
42480 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
42481 return SDValue();
42482
42483 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
42484 ISD::NodeType BinOp;
42485 SDValue Src = DAG.matchBinOpReduction(
42486 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
42487 if (!Src)
42488 return SDValue();
42489
42490 EVT SrcVT = Src.getValueType();
42491 EVT SrcSVT = SrcVT.getScalarType();
42492 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
42493 return SDValue();
42494
42495 SDLoc DL(Extract);
42496 SDValue MinPos = Src;
42497
42498 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
42499 while (SrcVT.getSizeInBits() > 128) {
42500 SDValue Lo, Hi;
42501 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
42502 SrcVT = Lo.getValueType();
42503 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
42504 }
42505 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
42506 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
42507 "Unexpected value type");
42508
42509 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
42510 // to flip the value accordingly.
42511 SDValue Mask;
42512 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
42513 if (BinOp == ISD::SMAX)
42514 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
42515 else if (BinOp == ISD::SMIN)
42516 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
42517 else if (BinOp == ISD::UMAX)
42518 Mask = DAG.getAllOnesConstant(DL, SrcVT);
42519
42520 if (Mask)
42521 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42522
42523 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
42524 // shuffling each upper element down and inserting zeros. This means that the
42525 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
42526 // ready for the PHMINPOS.
42527 if (ExtractVT == MVT::i8) {
42528 SDValue Upper = DAG.getVectorShuffle(
42529 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
42530 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
42531 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
42532 }
42533
42534 // Perform the PHMINPOS on a v8i16 vector.
42535 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
42536 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
42537 MinPos = DAG.getBitcast(SrcVT, MinPos);
42538
42539 if (Mask)
42540 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42541
42542 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
42543 DAG.getIntPtrConstant(0, DL));
42544}
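// A minimal sketch of the SMAX case for a hypothetical v8i16 value X:
//   T = xor X, splat(0x7FFF)            // signed-max becomes unsigned-min
//   M = X86ISD::PHMINPOS T              // unsigned min lands in element 0
//   R = xor M, splat(0x7FFF)            // undo the flip
//   result = extract_vector_elt R, 0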
42545
42546// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
42547static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
42548 const X86Subtarget &Subtarget) {
42549 // Bail without SSE2.
42550 if (!Subtarget.hasSSE2())
42551 return SDValue();
42552
42553 EVT ExtractVT = Extract->getValueType(0);
42554 unsigned BitWidth = ExtractVT.getSizeInBits();
42555 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
42556 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
42557 return SDValue();
42558
42559 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
42560 ISD::NodeType BinOp;
42561 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
42562 if (!Match && ExtractVT == MVT::i1)
42563 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
42564 if (!Match)
42565 return SDValue();
42566
42567 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
42568 // which we can't support here for now.
42569 if (Match.getScalarValueSizeInBits() != BitWidth)
42570 return SDValue();
42571
42572 SDValue Movmsk;
42573 SDLoc DL(Extract);
42574 EVT MatchVT = Match.getValueType();
42575 unsigned NumElts = MatchVT.getVectorNumElements();
42576 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
42577 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42578
42579 if (ExtractVT == MVT::i1) {
42580 // Special case for (pre-legalization) vXi1 reductions.
42581 if (NumElts > 64 || !isPowerOf2_32(NumElts))
42582 return SDValue();
42583 if (TLI.isTypeLegal(MatchVT)) {
42584 // If this is a legal AVX512 predicate type then we can just bitcast.
42585 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42586 Movmsk = DAG.getBitcast(MovmskVT, Match);
42587 } else {
42588 // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
42589 if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
42590 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
42591 ISD::CondCode::SETEQ) {
42592 EVT VecSVT = Match.getOperand(0).getValueType().getScalarType();
42593 if (VecSVT != MVT::i8) {
42594 NumElts *= VecSVT.getSizeInBits() / 8;
42595 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts);
42596 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
42597 Match = DAG.getSetCC(
42598 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
42599 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
42600 }
42601 }
42602
42603 // Use combineBitcastvxi1 to create the MOVMSK.
42604 while (NumElts > MaxElts) {
42605 SDValue Lo, Hi;
42606 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42607 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42608 NumElts /= 2;
42609 }
42610 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42611 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
42612 }
42613 if (!Movmsk)
42614 return SDValue();
42615 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
42616 } else {
42617 // FIXME: Better handling of k-registers or 512-bit vectors?
42618 unsigned MatchSizeInBits = Match.getValueSizeInBits();
42619 if (!(MatchSizeInBits == 128 ||
42620 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
42621 return SDValue();
42622
42623 // Make sure this isn't a vector of 1 element. The perf win from using
42625 // MOVMSK diminishes with fewer elements in the reduction, but it is
42625 // generally better to get the comparison over to the GPRs as soon as
42626 // possible to reduce the number of vector ops.
42627 if (Match.getValueType().getVectorNumElements() < 2)
42628 return SDValue();
42629
42630 // Check that we are extracting a reduction of all sign bits.
42631 if (DAG.ComputeNumSignBits(Match) != BitWidth)
42632 return SDValue();
42633
42634 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
42635 SDValue Lo, Hi;
42636 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42637 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42638 MatchSizeInBits = Match.getValueSizeInBits();
42639 }
42640
42641 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
42642 MVT MaskSrcVT;
42643 if (64 == BitWidth || 32 == BitWidth)
42644 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
42645 MatchSizeInBits / BitWidth);
42646 else
42647 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
42648
42649 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
42650 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
42651 NumElts = MaskSrcVT.getVectorNumElements();
42652 }
42653 assert((NumElts <= 32 || NumElts == 64) &&
42654 "Not expecting more than 64 elements");
42655
42656 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
42657 if (BinOp == ISD::XOR) {
42658 // parity -> (PARITY(MOVMSK X))
42659 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
42660 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
42661 }
42662
42663 SDValue CmpC;
42664 ISD::CondCode CondCode;
42665 if (BinOp == ISD::OR) {
42666 // any_of -> MOVMSK != 0
42667 CmpC = DAG.getConstant(0, DL, CmpVT);
42668 CondCode = ISD::CondCode::SETNE;
42669 } else {
42670 // all_of -> MOVMSK == ((1 << NumElts) - 1)
42671 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
42672 DL, CmpVT);
42673 CondCode = ISD::CondCode::SETEQ;
42674 }
42675
42676 // The setcc produces an i8 of 0/1, so extend that to the result width and
42677 // negate to get the final 0/-1 mask value.
42678 EVT SetccVT =
42679 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
42680 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
42681 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
42682 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
42683 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
42684}
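// Roughly, for a hypothetical all-ones/all-zeros v16i8 mask M and an i8 result,
// the reductions above become:
//   any_of(M): sub 0, (zext (setcc (movmsk M), 0,      ne))
//   all_of(M): sub 0, (zext (setcc (movmsk M), 0xFFFF, eq))
// i.e. a single MOVMSK plus a scalar compare, with the 0/1 setcc negated back
// into the expected 0/-1 mask value.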
42685
42686static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
42687 const X86Subtarget &Subtarget) {
42688 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
42689 return SDValue();
42690
42691 EVT ExtractVT = Extract->getValueType(0);
42692 // Verify the type we're extracting is i32, as the output element type of
42693 // vpdpbusd is i32.
42694 if (ExtractVT != MVT::i32)
42695 return SDValue();
42696
42697 EVT VT = Extract->getOperand(0).getValueType();
42698 if (!isPowerOf2_32(VT.getVectorNumElements()))
42699 return SDValue();
42700
42701 // Match shuffle + add pyramid.
42702 ISD::NodeType BinOp;
42703 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42704
42705 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
42706 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
42707 // before adding into the accumulator.
42708 // TODO:
42709 // We also need to verify that the multiply has at least 2x the number of bits
42710 // of the input. We shouldn't match
42711// (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
42712 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
42713 // Root = Root.getOperand(0);
42714
42715 // If there was a match, we want Root to be a mul.
42716 if (!Root || Root.getOpcode() != ISD::MUL)
42717 return SDValue();
42718
42719 // Check whether we have an extend and mul pattern
42720 SDValue LHS, RHS;
42721 if (!detectExtMul(DAG, Root, LHS, RHS))
42722 return SDValue();
42723
42724 // Create the dot product instruction.
42725 SDLoc DL(Extract);
42726 unsigned StageBias;
42727 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
42728
42729 // If the original vector was wider than 4 elements, sum over the results
42730 // in the DP vector.
42731 unsigned Stages = Log2_32(VT.getVectorNumElements());
42732 EVT DpVT = DP.getValueType();
42733
42734 if (Stages > StageBias) {
42735 unsigned DpElems = DpVT.getVectorNumElements();
42736
42737 for (unsigned i = Stages - StageBias; i > 0; --i) {
42738 SmallVector<int, 16> Mask(DpElems, -1);
42739 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42740 Mask[j] = MaskEnd + j;
42741
42742 SDValue Shuffle =
42743 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
42744 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
42745 }
42746 }
42747
42748 // Return the lowest ExtractSizeInBits bits.
42749 EVT ResVT =
42750 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42751 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
42752 DP = DAG.getBitcast(ResVT, DP);
42753 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
42754 Extract->getOperand(1));
42755}
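// End-to-end, this combine rewrites a reduction of the form
//   (extract_vector_elt (shuffle/add pyramid of
//                         (mul (zext vXi8 A), (sext vXi8 B))), 0)
// into a VPDPBUSD dot product followed by the shorter shuffle+add pyramid
// described in createVPDPBUSD and a final element-0 extract.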
42756
42757static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
42758 const X86Subtarget &Subtarget) {
42759 // PSADBW is only supported on SSE2 and up.
42760 if (!Subtarget.hasSSE2())
42761 return SDValue();
42762
42763 EVT ExtractVT = Extract->getValueType(0);
42764 // Verify the type we're extracting is either i32 or i64.
42765 // FIXME: Could support other types, but this is what we have coverage for.
42766 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
42767 return SDValue();
42768
42769 EVT VT = Extract->getOperand(0).getValueType();
42770 if (!isPowerOf2_32(VT.getVectorNumElements()))
42771 return SDValue();
42772
42773 // Match shuffle + add pyramid.
42774 ISD::NodeType BinOp;
42775 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42776
42777 // The operand is expected to be zero extended from i8
42778 // (verified in detectZextAbsDiff).
42779 // In order to convert to i64 and above, additional any/zero/sign
42780 // extend is expected.
42781 // The zero extend from 32 bits has no mathematical effect on the result.
42782 // Also the sign extend is basically a zero extend
42783 // (it extends the sign bit, which is zero).
42784 // So it is correct to skip the sign/zero extend instruction.
42785 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
42786 Root.getOpcode() == ISD::ZERO_EXTEND ||
42787 Root.getOpcode() == ISD::ANY_EXTEND))
42788 Root = Root.getOperand(0);
42789
42790 // If there was a match, we want Root to be an ABS node that is the root of
42791 // an abs-diff pattern.
42792 if (!Root || Root.getOpcode() != ISD::ABS)
42793 return SDValue();
42794
42795 // Check whether we have an abs-diff pattern feeding into the select.
42796 SDValue Zext0, Zext1;
42797 if (!detectZextAbsDiff(Root, Zext0, Zext1))
42798 return SDValue();
42799
42800 // Create the SAD instruction.
42801 SDLoc DL(Extract);
42802 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
42803
42804 // If the original vector was wider than 8 elements, sum over the results
42805 // in the SAD vector.
42806 unsigned Stages = Log2_32(VT.getVectorNumElements());
42807 EVT SadVT = SAD.getValueType();
42808 if (Stages > 3) {
42809 unsigned SadElems = SadVT.getVectorNumElements();
42810
42811 for (unsigned i = Stages - 3; i > 0; --i) {
42812 SmallVector<int, 16> Mask(SadElems, -1);
42813 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42814 Mask[j] = MaskEnd + j;
42815
42816 SDValue Shuffle =
42817 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
42818 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
42819 }
42820 }
42821
42822 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
42823 // Return the lowest ExtractSizeInBits bits.
42824 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42825 SadVT.getSizeInBits() / ExtractSizeInBits);
42826 SAD = DAG.getBitcast(ResVT, SAD);
42827 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
42828 Extract->getOperand(1));
42829}
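// End-to-end, this turns a sum-of-absolute-differences reduction such as
//   (extract_vector_elt (shuffle/add pyramid of
//                         (abs (sub (zext v16i8 A), (zext v16i8 B)))), 0)
// into a single PSADBW whose two i64 partial sums are added together before the
// low ExtractVT bits are extracted.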
42830
42831// Attempt to peek through a target shuffle and extract the scalar from the
42832// source.
42833static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
42834 TargetLowering::DAGCombinerInfo &DCI,
42835 const X86Subtarget &Subtarget) {
42836 if (DCI.isBeforeLegalizeOps())
42837 return SDValue();
42838
42839 SDLoc dl(N);
42840 SDValue Src = N->getOperand(0);
42841 SDValue Idx = N->getOperand(1);
42842
42843 EVT VT = N->getValueType(0);
42844 EVT SrcVT = Src.getValueType();
42845 EVT SrcSVT = SrcVT.getVectorElementType();
42846 unsigned SrcEltBits = SrcSVT.getSizeInBits();
42847 unsigned NumSrcElts = SrcVT.getVectorNumElements();
42848
42849 // Don't attempt this for boolean mask vectors or unknown extraction indices.
42850 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
42851 return SDValue();
42852
42853 const APInt &IdxC = N->getConstantOperandAPInt(1);
42854 if (IdxC.uge(NumSrcElts))
42855 return SDValue();
42856
42857 SDValue SrcBC = peekThroughBitcasts(Src);
42858
42859 // Handle extract(bitcast(broadcast(scalar_value))).
42860 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
42861 SDValue SrcOp = SrcBC.getOperand(0);
42862 EVT SrcOpVT = SrcOp.getValueType();
42863 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
42864 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
42865 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
42866 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
42867 // TODO support non-zero offsets.
42868 if (Offset == 0) {
42869 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
42870 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
42871 return SrcOp;
42872 }
42873 }
42874 }
42875
42876 // If we're extracting a single element from a broadcast load and there are
42877 // no other users, just create a single load.
42878 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
42879 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
42880 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
42881 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
42882 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
42883 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
42884 MemIntr->getBasePtr(),
42885 MemIntr->getPointerInfo(),
42886 MemIntr->getOriginalAlign(),
42887 MemIntr->getMemOperand()->getFlags());
42888 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42889 return Load;
42890 }
42891 }
42892
42893 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
42894 // TODO: Move to DAGCombine?
42895 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
42896 SrcBC.getValueType().isInteger() &&
42897 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
42898 SrcBC.getScalarValueSizeInBits() ==
42899 SrcBC.getOperand(0).getValueSizeInBits()) {
42900 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
42901 if (IdxC.ult(Scale)) {
42902 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
42903 SDValue Scl = SrcBC.getOperand(0);
42904 EVT SclVT = Scl.getValueType();
42905 if (Offset) {
42906 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
42907 DAG.getShiftAmountConstant(Offset, SclVT, dl));
42908 }
42909 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
42910 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
42911 return Scl;
42912 }
42913 }
42914
42915 // Handle extract(truncate(x)) for 0'th index.
42916 // TODO: Treat this as a faux shuffle?
42917 // TODO: When can we use this for general indices?
42918 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
42919 (SrcVT.getSizeInBits() % 128) == 0) {
42920 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
42921 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
42922 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
42923 Idx);
42924 }
42925
42926 // We can only legally extract other elements from 128-bit vectors and in
42927 // certain circumstances, depending on SSE-level.
42928 // TODO: Investigate float/double extraction if it will be just stored.
42929 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
42930 unsigned Idx) {
42931 EVT VecSVT = VecVT.getScalarType();
42932 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
42933 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
42934 VecSVT == MVT::i64)) {
42935 unsigned EltSizeInBits = VecSVT.getSizeInBits();
42936 unsigned NumEltsPerLane = 128 / EltSizeInBits;
42937 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
42938 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
42939 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
42940 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
42941 Idx &= (NumEltsPerLane - 1);
42942 }
42943 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
42944 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
42945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
42946 DAG.getBitcast(VecVT, Vec),
42947 DAG.getIntPtrConstant(Idx, dl));
42948 }
42949 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
42950 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
42951 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
42952 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
42953 DAG.getTargetConstant(Idx, dl, MVT::i8));
42954 }
42955 return SDValue();
42956 };
42957
42958 // Resolve the target shuffle inputs and mask.
42959 SmallVector<int, 16> Mask;
42960 SmallVector<SDValue, 2> Ops;
42961 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
42962 return SDValue();
42963
42964 // Shuffle inputs must be the same size as the result.
42965 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
42966 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
42967 }))
42968 return SDValue();
42969
42970 // Attempt to narrow/widen the shuffle mask to the correct size.
42971 if (Mask.size() != NumSrcElts) {
42972 if ((NumSrcElts % Mask.size()) == 0) {
42973 SmallVector<int, 16> ScaledMask;
42974 int Scale = NumSrcElts / Mask.size();
42975 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
42976 Mask = std::move(ScaledMask);
42977 } else if ((Mask.size() % NumSrcElts) == 0) {
42978 // Simplify Mask based on demanded element.
42979 int ExtractIdx = (int)IdxC.getZExtValue();
42980 int Scale = Mask.size() / NumSrcElts;
42981 int Lo = Scale * ExtractIdx;
42982 int Hi = Scale * (ExtractIdx + 1);
42983 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
42984 if (i < Lo || Hi <= i)
42985 Mask[i] = SM_SentinelUndef;
42986
42987 SmallVector<int, 16> WidenedMask;
42988 while (Mask.size() > NumSrcElts &&
42989 canWidenShuffleElements(Mask, WidenedMask))
42990 Mask = std::move(WidenedMask);
42991 }
42992 }
42993
42994 // If narrowing/widening failed, see if we can extract+zero-extend.
42995 int ExtractIdx;
42996 EVT ExtractVT;
42997 if (Mask.size() == NumSrcElts) {
42998 ExtractIdx = Mask[IdxC.getZExtValue()];
42999 ExtractVT = SrcVT;
43000 } else {
43001 unsigned Scale = Mask.size() / NumSrcElts;
43002 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
43003 return SDValue();
43004 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
43005 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
43006 return SDValue();
43007 ExtractIdx = Mask[ScaledIdx];
43008 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
43009 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
43010 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
43011 "Failed to widen vector type");
43012 }
43013
43014 // If the shuffle source element is undef/zero then we can just accept it.
43015 if (ExtractIdx == SM_SentinelUndef)
43016 return DAG.getUNDEF(VT);
43017
43018 if (ExtractIdx == SM_SentinelZero)
43019 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
43020 : DAG.getConstant(0, dl, VT);
43021
43022 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
43023 ExtractIdx = ExtractIdx % Mask.size();
43024 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
43025 return DAG.getZExtOrTrunc(V, dl, VT);
43026
43027 return SDValue();
43028}
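// For example, with SSE4.1 and a hypothetical v4i32 value X:
//   (i32 extract_vector_elt (v4i32 X86ISD::PSHUFD X, <2,3,0,1>), 0)
// resolves index 0 through the shuffle mask and becomes
//   (i32 extract_vector_elt X, 2)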
43029
43030/// Extracting a scalar FP value from vector element 0 is free, so extract each
43031/// operand first, then perform the math as a scalar op.
43032static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
43033 const X86Subtarget &Subtarget) {
43034 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
43035 SDValue Vec = ExtElt->getOperand(0);
43036 SDValue Index = ExtElt->getOperand(1);
43037 EVT VT = ExtElt->getValueType(0);
43038 EVT VecVT = Vec.getValueType();
43039
43040 // TODO: If this is a unary/expensive/expand op, allow extraction from a
43041 // non-zero element because the shuffle+scalar op will be cheaper?
43042 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
43043 return SDValue();
43044
43045 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
43046 // extract, the condition code), so deal with those as a special-case.
43047 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
43048 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
43049 if (OpVT != MVT::f32 && OpVT != MVT::f64)
43050 return SDValue();
43051
43052 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
43053 SDLoc DL(ExtElt);
43054 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43055 Vec.getOperand(0), Index);
43056 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43057 Vec.getOperand(1), Index);
43058 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
43059 }
43060
43061 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
43062 VT != MVT::f64)
43063 return SDValue();
43064
43065 // Vector FP selects don't fit the pattern of FP math ops (because the
43066 // condition has a different type and we have to change the opcode), so deal
43067 // with those here.
43068 // FIXME: This is restricted to pre type legalization by ensuring the setcc
43069 // has i1 elements. If we loosen this we need to convert vector bool to a
43070 // scalar bool.
43071 if (Vec.getOpcode() == ISD::VSELECT &&
43072 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
43073 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
43074 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
43075 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
43076 SDLoc DL(ExtElt);
43077 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
43078 Vec.getOperand(0).getValueType().getScalarType(),
43079 Vec.getOperand(0), Index);
43080 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43081 Vec.getOperand(1), Index);
43082 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43083 Vec.getOperand(2), Index);
43084 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
43085 }
43086
43087 // TODO: This switch could include FNEG and the x86-specific FP logic ops
43088 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
43089 // missed load folding and fma+fneg combining.
43090 switch (Vec.getOpcode()) {
43091 case ISD::FMA: // Begin 3 operands
43092 case ISD::FMAD:
43093 case ISD::FADD: // Begin 2 operands
43094 case ISD::FSUB:
43095 case ISD::FMUL:
43096 case ISD::FDIV:
43097 case ISD::FREM:
43098 case ISD::FCOPYSIGN:
43099 case ISD::FMINNUM:
43100 case ISD::FMAXNUM:
43101 case ISD::FMINNUM_IEEE:
43102 case ISD::FMAXNUM_IEEE:
43103 case ISD::FMAXIMUM:
43104 case ISD::FMINIMUM:
43105 case X86ISD::FMAX:
43106 case X86ISD::FMIN:
43107 case ISD::FABS: // Begin 1 operand
43108 case ISD::FSQRT:
43109 case ISD::FRINT:
43110 case ISD::FCEIL:
43111 case ISD::FTRUNC:
43112 case ISD::FNEARBYINT:
43113 case ISD::FROUND:
43114 case ISD::FFLOOR:
43115 case X86ISD::FRCP:
43116 case X86ISD::FRSQRT: {
43117 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
43118 SDLoc DL(ExtElt);
43119 SmallVector<SDValue, 4> ExtOps;
43120 for (SDValue Op : Vec->ops())
43121 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
43122 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
43123 }
43124 default:
43125 return SDValue();
43126 }
43127 llvm_unreachable("All opcodes should return within switch");
43128}
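// For example, for a single-use v4f32 fadd of hypothetical values X and Y:
//   (f32 extract_vector_elt (v4f32 fadd X, Y), 0)
//   --> (f32 fadd (extract_vector_elt X, 0), (extract_vector_elt Y, 0))
// since extracting lane 0 of a vector FP value is free.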
43129
43130/// Try to convert a vector reduction sequence composed of binops and shuffles
43131/// into horizontal ops.
43132static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
43133 const X86Subtarget &Subtarget) {
43134 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
43135
43136 // We need at least SSE2 to do anything here.
43137 if (!Subtarget.hasSSE2())
43138 return SDValue();
43139
43140 ISD::NodeType Opc;
43141 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
43142 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
43143 if (!Rdx)
43144 return SDValue();
43145
43146 SDValue Index = ExtElt->getOperand(1);
43147 assert(isNullConstant(Index) &&
43148 "Reduction doesn't end in an extract from index 0");
43149
43150 EVT VT = ExtElt->getValueType(0);
43151 EVT VecVT = Rdx.getValueType();
43152 if (VecVT.getScalarType() != VT)
43153 return SDValue();
43154
43155 SDLoc DL(ExtElt);
43156 unsigned NumElts = VecVT.getVectorNumElements();
43157 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
43158
43159 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
43160 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
43161 if (V.getValueType() == MVT::v4i8) {
43162 if (ZeroExtend && Subtarget.hasSSE41()) {
43163 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
43164 DAG.getConstant(0, DL, MVT::v4i32),
43165 DAG.getBitcast(MVT::i32, V),
43166 DAG.getIntPtrConstant(0, DL));
43167 return DAG.getBitcast(MVT::v16i8, V);
43168 }
43169 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
43170 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
43171 : DAG.getUNDEF(MVT::v4i8));
43172 }
43173 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
43174 DAG.getUNDEF(MVT::v8i8));
43175 };
43176
43177 // vXi8 mul reduction - promote to vXi16 mul reduction.
43178 if (Opc == ISD::MUL) {
43179 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
43180 return SDValue();
43181 if (VecVT.getSizeInBits() >= 128) {
43182 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
43183 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43184 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43185 Lo = DAG.getBitcast(WideVT, Lo);
43186 Hi = DAG.getBitcast(WideVT, Hi);
43187 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
43188 while (Rdx.getValueSizeInBits() > 128) {
43189 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43190 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
43191 }
43192 } else {
43193 Rdx = WidenToV16I8(Rdx, false);
43194 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
43195 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
43196 }
43197 if (NumElts >= 8)
43198 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43199 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43200 {4, 5, 6, 7, -1, -1, -1, -1}));
43201 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43202 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43203 {2, 3, -1, -1, -1, -1, -1, -1}));
43204 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43205 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43206 {1, -1, -1, -1, -1, -1, -1, -1}));
43207 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43208 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43209 }
43210
43211 // vXi8 add reduction - sub 128-bit vector.
43212 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
43213 Rdx = WidenToV16I8(Rdx, true);
43214 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43215 DAG.getConstant(0, DL, MVT::v16i8));
43216 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43217 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43218 }
43219
43220 // Must be a >=128-bit vector with pow2 elements.
43221 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
43222 return SDValue();
43223
43224 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
43225 if (VT == MVT::i8) {
43226 while (Rdx.getValueSizeInBits() > 128) {
43227 SDValue Lo, Hi;
43228 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43229 VecVT = Lo.getValueType();
43230 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43231 }
43232 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
43233
43234 SDValue Hi = DAG.getVectorShuffle(
43235 MVT::v16i8, DL, Rdx, Rdx,
43236 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
43237 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
43238 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43239 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
43240 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43242 }
43243
43244 // See if we can use vXi8 PSADBW add reduction for larger zext types.
43245 // If the source vector values are 0-255, then we can use PSADBW to
43246 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
43247 // TODO: See if it's worth avoiding vXi16/i32 truncations?
43248 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
43249 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
43250 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
43251 Subtarget.hasAVX512())) {
43252 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
43253 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
43254 if (ByteVT.getSizeInBits() < 128)
43255 Rdx = WidenToV16I8(Rdx, true);
43256
43257 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43258 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43259 ArrayRef<SDValue> Ops) {
43260 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43261 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
43262 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
43263 };
43264 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
43265 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
43266
43267 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
43268 while (Rdx.getValueSizeInBits() > 128) {
43269 SDValue Lo, Hi;
43270 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43271 VecVT = Lo.getValueType();
43272 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43273 }
43274 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
43275
43276 if (NumElts > 8) {
43277 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
43278 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
43279 }
43280
43281 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
43282 Rdx = DAG.getBitcast(VecVT, Rdx);
43283 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43284 }
43285
43286 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
43287 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
43288 return SDValue();
43289
43290 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
43291
43292 // 256-bit horizontal instructions operate on 128-bit chunks rather than
43293 // across the whole vector, so we need an extract + hop preliminary stage.
43294 // This is the only step where the operands of the hop are not the same value.
43295 // TODO: We could extend this to handle 512-bit or even longer vectors.
43296 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
43297 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
43298 unsigned NumElts = VecVT.getVectorNumElements();
43299 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
43300 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
43301 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
43302 VecVT = Rdx.getValueType();
43303 }
43304 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
43305 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
43306 return SDValue();
43307
43308 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
43309 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
43310 for (unsigned i = 0; i != ReductionSteps; ++i)
43311 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
43312
43313 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43314}
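
The reduction above leans on the fact that PSADBW against an all-zero operand degenerates into a horizontal byte sum per 64-bit lane. A minimal standalone sketch of that identity, using plain scalars rather than SelectionDAG nodes (an illustration, not code from this file):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Scalar model of one 64-bit PSADBW lane: sum of |a[i] - b[i]| over 8 bytes.
static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += static_cast<uint64_t>(std::abs(int(A[I]) - int(B[I])));
  return Sum;
}

int main() {
  uint8_t Bytes[8] = {1, 2, 3, 250, 0, 7, 9, 28};
  uint8_t Zero[8] = {0};
  uint64_t Expected = 0;
  for (uint8_t V : Bytes)
    Expected += V;
  // With a zero second operand the SAD is just the horizontal byte sum.
  assert(psadbwLane(Bytes, Zero) == Expected);
  return 0;
}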
43315
43316/// Detect vector gather/scatter index generation and convert it from being a
43317/// bunch of shuffles and extracts into a somewhat faster sequence.
43318/// For i686, the best sequence is apparently storing the value and loading
43319/// scalars back, while for x64 we should use 64-bit extracts and shifts.
43320static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
43321 TargetLowering::DAGCombinerInfo &DCI,
43322 const X86Subtarget &Subtarget) {
43323 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
43324 return NewOp;
43325
43326 SDValue InputVector = N->getOperand(0);
43327 SDValue EltIdx = N->getOperand(1);
43328 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
43329
43330 EVT SrcVT = InputVector.getValueType();
43331 EVT VT = N->getValueType(0);
43332 SDLoc dl(InputVector);
43333 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
43334 unsigned NumSrcElts = SrcVT.getVectorNumElements();
43335
43336 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
43337 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43338
43339 // Integer Constant Folding.
43340 if (CIdx && VT.isInteger()) {
43341 APInt UndefVecElts;
43342 SmallVector<APInt, 16> EltBits;
43343 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
43344 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
43345 EltBits, true, false)) {
43346 uint64_t Idx = CIdx->getZExtValue();
43347 if (UndefVecElts[Idx])
43348 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43349 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
43350 dl, VT);
43351 }
43352 }
43353
43354 if (IsPextr) {
43355 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43356 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43357 APInt::getAllOnes(VT.getSizeInBits()), DCI))
43358 return SDValue(N, 0);
43359
43360 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
43361 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
43362 InputVector.getOpcode() == X86ISD::PINSRW) &&
43363 InputVector.getOperand(2) == EltIdx) {
43364       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
43365              "Vector type mismatch");
43366 SDValue Scl = InputVector.getOperand(1);
43367 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
43368 return DAG.getZExtOrTrunc(Scl, dl, VT);
43369 }
43370
43371 // TODO - Remove this once we can handle the implicit zero-extension of
43372 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
43373 // combineBasicSADPattern.
43374 return SDValue();
43375 }
43376
43377   // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
43378 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43379 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
43380 SDValue MMXSrc = InputVector.getOperand(0);
43381
43382 // The bitcast source is a direct mmx result.
43383 if (MMXSrc.getValueType() == MVT::x86mmx)
43384 return DAG.getBitcast(VT, InputVector);
43385 }
43386
43387 // Detect mmx to i32 conversion through a v2i32 elt extract.
43388 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43389 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
43390 SDValue MMXSrc = InputVector.getOperand(0);
43391
43392 // The bitcast source is a direct mmx result.
43393 if (MMXSrc.getValueType() == MVT::x86mmx)
43394 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
43395 }
43396
43397 // Check whether this extract is the root of a sum of absolute differences
43398 // pattern. This has to be done here because we really want it to happen
43399   // pre-legalization.
43400 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
43401 return SAD;
43402
43403 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
43404 return VPDPBUSD;
43405
43406 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
43407 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
43408 return Cmp;
43409
43410 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
43411 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
43412 return MinMax;
43413
43414 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
43415 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
43416 return V;
43417
43418 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
43419 return V;
43420
43421   // Attempt to extract an i1 element by using MOVMSK to extract the signbits
43422 // and then testing the relevant element.
43423 //
43424 // Note that we only combine extracts on the *same* result number, i.e.
43425 // t0 = merge_values a0, a1, a2, a3
43426 // i1 = extract_vector_elt t0, Constant:i64<2>
43427 // i1 = extract_vector_elt t0, Constant:i64<3>
43428 // but not
43429 // i1 = extract_vector_elt t0:1, Constant:i64<2>
43430 // since the latter would need its own MOVMSK.
43431 if (SrcVT.getScalarType() == MVT::i1) {
43432 bool IsVar = !CIdx;
43433 SmallVector<SDNode *, 16> BoolExtracts;
43434 unsigned ResNo = InputVector.getResNo();
43435 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
43436 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
43437 Use->getOperand(0).getResNo() == ResNo &&
43438 Use->getValueType(0) == MVT::i1) {
43439 BoolExtracts.push_back(Use);
43440 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
43441 return true;
43442 }
43443 return false;
43444 };
43445 // TODO: Can we drop the oneuse check for constant extracts?
43446 if (all_of(InputVector->uses(), IsBoolExtract) &&
43447 (IsVar || BoolExtracts.size() > 1)) {
43448 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
43449 if (SDValue BC =
43450 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
43451 for (SDNode *Use : BoolExtracts) {
43452 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
43453 // Mask = 1 << MaskIdx
43454 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
43455 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
43456 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
43457 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
43458 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
43459 DCI.CombineTo(Use, Res);
43460 }
43461 return SDValue(N, 0);
43462 }
43463 }
43464 }
43465
43466 // If this extract is from a loaded vector value and will be used as an
43467 // integer, that requires a potentially expensive XMM -> GPR transfer.
43468 // Additionally, if we can convert to a scalar integer load, that will likely
43469 // be folded into a subsequent integer op.
43470 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
43471   //       to a single use of the loaded vector. For the reasons above, we
43472 // expect this to be profitable even if it creates an extra load.
43473 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
43474 return Use->getOpcode() == ISD::STORE ||
43475 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
43476 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
43477 });
43478 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
43479 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
43480 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
43481 !LikelyUsedAsVector) {
43482 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43483 SDValue NewPtr =
43484 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
43485 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
43486 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
43487 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
43488 SDValue Load =
43489 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
43490 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
43491 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
43492 return Load;
43493 }
43494
43495 return SDValue();
43496}
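
The MOVMSK rewrite in the vXi1 extract path above tests a single bit of a sign-bit mask: extractelement becomes ((movmsk X) & Mask) == Mask with Mask = 1 << MaskIdx. A small standalone sketch of that test, with a plain 16-bit integer standing in for the MOVMSK result (an illustrative model, not the DAG code itself):

#include <cassert>
#include <cstdint>

static bool extractBoolElt(uint16_t Movmsk, unsigned MaskIdx) {
  uint16_t Mask = uint16_t(1u << MaskIdx); // Mask = 1 << MaskIdx
  return (Movmsk & Mask) == Mask;          // ((movmsk X) & Mask) == Mask
}

int main() {
  uint16_t Movmsk = 0b0010'0100'0000'1001; // lanes 0, 3, 10, 13 are true
  assert(extractBoolElt(Movmsk, 0) && extractBoolElt(Movmsk, 10));
  assert(!extractBoolElt(Movmsk, 1) && !extractBoolElt(Movmsk, 15));
  return 0;
}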
43497
43498// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
43499// This is more or less the reverse of combineBitcastvxi1.
43500static SDValue combineToExtendBoolVectorInReg(
43501 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
43502 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
43503 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
43504 Opcode != ISD::ANY_EXTEND)
43505 return SDValue();
43506 if (!DCI.isBeforeLegalizeOps())
43507 return SDValue();
43508 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
43509 return SDValue();
43510
43511 EVT SVT = VT.getScalarType();
43512 EVT InSVT = N0.getValueType().getScalarType();
43513 unsigned EltSizeInBits = SVT.getSizeInBits();
43514
43515 // Input type must be extending a bool vector (bit-casted from a scalar
43516 // integer) to legal integer types.
43517 if (!VT.isVector())
43518 return SDValue();
43519 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
43520 return SDValue();
43521 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
43522 return SDValue();
43523
43524 SDValue N00 = N0.getOperand(0);
43525 EVT SclVT = N00.getValueType();
43526 if (!SclVT.isScalarInteger())
43527 return SDValue();
43528
43529 SDValue Vec;
43530 SmallVector<int> ShuffleMask;
43531 unsigned NumElts = VT.getVectorNumElements();
43532   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
43533
43534 // Broadcast the scalar integer to the vector elements.
43535 if (NumElts > EltSizeInBits) {
43536 // If the scalar integer is greater than the vector element size, then we
43537 // must split it down into sub-sections for broadcasting. For example:
43538 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
43539 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
43540     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
43541 unsigned Scale = NumElts / EltSizeInBits;
43542 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
43543 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
43544 Vec = DAG.getBitcast(VT, Vec);
43545
43546 for (unsigned i = 0; i != Scale; ++i)
43547 ShuffleMask.append(EltSizeInBits, i);
43548 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
43549 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
43550 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
43551 // If we have register broadcast instructions, use the scalar size as the
43552 // element type for the shuffle. Then cast to the wider element type. The
43553 // widened bits won't be used, and this might allow the use of a broadcast
43554 // load.
43555     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
43556 unsigned Scale = EltSizeInBits / NumElts;
43557 EVT BroadcastVT =
43558 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
43559 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
43560 ShuffleMask.append(NumElts * Scale, 0);
43561 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
43562 Vec = DAG.getBitcast(VT, Vec);
43563 } else {
43564 // For smaller scalar integers, we can simply any-extend it to the vector
43565 // element size (we don't care about the upper bits) and broadcast it to all
43566 // elements.
43567 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
43568 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
43569 ShuffleMask.append(NumElts, 0);
43570 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
43571 }
43572
43573 // Now, mask the relevant bit in each element.
43574 SmallVector<SDValue, 32> Bits;
43575 for (unsigned i = 0; i != NumElts; ++i) {
43576 int BitIdx = (i % EltSizeInBits);
43577 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
43578 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
43579 }
43580 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
43581 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
43582
43583 // Compare against the bitmask and extend the result.
43584 EVT CCVT = VT.changeVectorElementType(MVT::i1);
43585 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
43586 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
43587
43588 // For SEXT, this is now done, otherwise shift the result down for
43589 // zero-extension.
43590 if (Opcode == ISD::SIGN_EXTEND)
43591 return Vec;
43592 return DAG.getNode(ISD::SRL, DL, VT, Vec,
43593 DAG.getConstant(EltSizeInBits - 1, DL, VT));
43594}
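
combineToExtendBoolVectorInReg expands a bit-cast scalar into a bool vector by broadcasting the scalar, masking each lane with its own bit, comparing against that bit, and then sign- or zero-extending. A standalone scalar model of the per-element computation for the simple case where every lane holds a full copy of the scalar; the 32-bit element width is an assumption of this sketch, which is illustrative only:

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int32_t> extendBoolVector(uint32_t Scl, unsigned NumElts,
                                             bool ZeroExtend) {
  std::vector<int32_t> Result(NumElts);
  for (unsigned I = 0; I != NumElts; ++I) {
    uint32_t Elt = Scl;                      // broadcast the scalar
    uint32_t Bit = 1u << I;                  // per-element single-bit mask
    uint32_t Masked = Elt & Bit;             // AND with the bitmask
    int32_t Sext = (Masked == Bit) ? -1 : 0; // setcc eq, then sign-extend
    Result[I] = ZeroExtend ? int32_t(uint32_t(Sext) >> 31) : Sext; // srl for zext
  }
  return Result;
}

int main() {
  auto S = extendBoolVector(0b1010u, /*NumElts=*/4, /*ZeroExtend=*/false);
  assert(S[0] == 0 && S[1] == -1 && S[2] == 0 && S[3] == -1);
  auto Z = extendBoolVector(0b1010u, 4, true);
  assert(Z[1] == 1 && Z[2] == 0);
  return 0;
}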
43595
43596/// If a vector select has an operand that is -1 or 0, try to simplify the
43597/// select to a bitwise logic operation.
43598/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
43599static SDValue
43600combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
43601 TargetLowering::DAGCombinerInfo &DCI,
43602 const X86Subtarget &Subtarget) {
43603 SDValue Cond = N->getOperand(0);
43604 SDValue LHS = N->getOperand(1);
43605 SDValue RHS = N->getOperand(2);
43606 EVT VT = LHS.getValueType();
43607 EVT CondVT = Cond.getValueType();
43608 SDLoc DL(N);
43609 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43610
43611 if (N->getOpcode() != ISD::VSELECT)
43612 return SDValue();
43613
43614   assert(CondVT.isVector() && "Vector select expects a vector selector!");
43615
43616 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
43617 // TODO: Can we assert that both operands are not zeros (because that should
43618 // get simplified at node creation time)?
43619 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
43620 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
43621
43622 // If both inputs are 0/undef, create a complete zero vector.
43623 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
43624 if (TValIsAllZeros && FValIsAllZeros) {
43625 if (VT.isFloatingPoint())
43626 return DAG.getConstantFP(0.0, DL, VT);
43627 return DAG.getConstant(0, DL, VT);
43628 }
43629
43630 // To use the condition operand as a bitwise mask, it must have elements that
43631 // are the same size as the select elements. Ie, the condition operand must
43632 // have already been promoted from the IR select condition type <N x i1>.
43633 // Don't check if the types themselves are equal because that excludes
43634 // vector floating-point selects.
43635 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
43636 return SDValue();
43637
43638 // Try to invert the condition if true value is not all 1s and false value is
43639 // not all 0s. Only do this if the condition has one use.
43640 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
43641 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
43642 // Check if the selector will be produced by CMPP*/PCMP*.
43643 Cond.getOpcode() == ISD::SETCC &&
43644 // Check if SETCC has already been promoted.
43645 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
43646 CondVT) {
43647 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
43648
43649 if (TValIsAllZeros || FValIsAllOnes) {
43650 SDValue CC = Cond.getOperand(2);
43651 ISD::CondCode NewCC = ISD::getSetCCInverse(
43652 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
43653 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
43654 NewCC);
43655 std::swap(LHS, RHS);
43656 TValIsAllOnes = FValIsAllOnes;
43657 FValIsAllZeros = TValIsAllZeros;
43658 }
43659 }
43660
43661 // Cond value must be 'sign splat' to be converted to a logical op.
43662 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
43663 return SDValue();
43664
43665 // vselect Cond, 111..., 000... -> Cond
43666 if (TValIsAllOnes && FValIsAllZeros)
43667 return DAG.getBitcast(VT, Cond);
43668
43669 if (!TLI.isTypeLegal(CondVT))
43670 return SDValue();
43671
43672 // vselect Cond, 111..., X -> or Cond, X
43673 if (TValIsAllOnes) {
43674 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43675 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
43676 return DAG.getBitcast(VT, Or);
43677 }
43678
43679 // vselect Cond, X, 000... -> and Cond, X
43680 if (FValIsAllZeros) {
43681 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
43682 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
43683 return DAG.getBitcast(VT, And);
43684 }
43685
43686 // vselect Cond, 000..., X -> andn Cond, X
43687 if (TValIsAllZeros) {
43688 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43689 SDValue AndN;
43690     // The canonical form differs for i1 vectors - x86andnp is not used.
43691 if (CondVT.getScalarType() == MVT::i1)
43692 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
43693 CastRHS);
43694 else
43695 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
43696 return DAG.getBitcast(VT, AndN);
43697 }
43698
43699 return SDValue();
43700}
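
combineVSelectWithAllOnesOrZeros rests on the usual mask identities for a per-element condition that is either 0 or all-ones (the "sign splat" precondition the code checks). A standalone scalar check of the three rewrites — OR for an all-ones true arm, AND for an all-zeros false arm, ANDN for an all-zeros true arm — given as an illustration, not the DAG code:

#include <cassert>
#include <cstdint>

static uint32_t vselectElt(uint32_t C, uint32_t T, uint32_t F) {
  return C ? T : F; // C is 0 or 0xFFFFFFFF per the precondition
}

int main() {
  for (uint32_t C : {0u, 0xFFFFFFFFu}) {
    uint32_t X = 0x12345678u;
    // vselect Cond, 111..., X -> or Cond, X
    assert(vselectElt(C, 0xFFFFFFFFu, X) == (C | X));
    // vselect Cond, X, 000... -> and Cond, X
    assert(vselectElt(C, X, 0u) == (C & X));
    // vselect Cond, 000..., X -> andn Cond, X
    assert(vselectElt(C, 0u, X) == (~C & X));
  }
  return 0;
}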
43701
43702/// If both arms of a vector select are concatenated vectors, split the select,
43703/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
43704/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
43705/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
43706static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
43707 const X86Subtarget &Subtarget) {
43708 unsigned Opcode = N->getOpcode();
43709 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
43710 return SDValue();
43711
43712 // TODO: Split 512-bit vectors too?
43713 EVT VT = N->getValueType(0);
43714 if (!VT.is256BitVector())
43715 return SDValue();
43716
43717 // TODO: Split as long as any 2 of the 3 operands are concatenated?
43718 SDValue Cond = N->getOperand(0);
43719 SDValue TVal = N->getOperand(1);
43720 SDValue FVal = N->getOperand(2);
43721 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
43722 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
43723 !collectConcatOps(TVal.getNode(), CatOpsT) ||
43724 !collectConcatOps(FVal.getNode(), CatOpsF))
43725 return SDValue();
43726
43727 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
43728 ArrayRef<SDValue> Ops) {
43729 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
43730 };
43731 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
43732 makeBlend, /*CheckBWI*/ false);
43733}
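
narrowVectorSelect simply distributes the select over the two halves of the concatenated operands. A standalone sketch with 8-element arrays standing in for 256-bit vectors (illustrative only):

#include <array>
#include <cassert>
#include <cstdint>

template <size_t N>
static std::array<int32_t, N> vselect(const std::array<int32_t, N> &Cond,
                                      const std::array<int32_t, N> &T,
                                      const std::array<int32_t, N> &F) {
  std::array<int32_t, N> R{};
  for (size_t I = 0; I != N; ++I)
    R[I] = Cond[I] ? T[I] : F[I];
  return R;
}

int main() {
  std::array<int32_t, 8> Cond{-1, 0, -1, 0, 0, -1, 0, -1};
  std::array<int32_t, 8> T{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<int32_t, 8> F{10, 20, 30, 40, 50, 60, 70, 80};
  std::array<int32_t, 8> Wide = vselect(Cond, T, F);

  // Split every operand, select each half, then concatenate the results.
  auto Half = [](const std::array<int32_t, 8> &A, size_t Off) {
    std::array<int32_t, 4> H{};
    for (size_t I = 0; I != 4; ++I)
      H[I] = A[Off + I];
    return H;
  };
  std::array<int32_t, 4> Lo = vselect(Half(Cond, 0), Half(T, 0), Half(F, 0));
  std::array<int32_t, 4> Hi = vselect(Half(Cond, 4), Half(T, 4), Half(F, 4));
  for (size_t I = 0; I != 4; ++I)
    assert(Wide[I] == Lo[I] && Wide[4 + I] == Hi[I]);
  return 0;
}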
43734
43735static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
43736 SDValue Cond = N->getOperand(0);
43737 SDValue LHS = N->getOperand(1);
43738 SDValue RHS = N->getOperand(2);
43739 SDLoc DL(N);
43740
43741 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
43742 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
43743 if (!TrueC || !FalseC)
43744 return SDValue();
43745
43746 // Don't do this for crazy integer types.
43747 EVT VT = N->getValueType(0);
43748 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
43749 return SDValue();
43750
43751 // We're going to use the condition bit in math or logic ops. We could allow
43752 // this with a wider condition value (post-legalization it becomes an i8),
43753 // but if nothing is creating selects that late, it doesn't matter.
43754 if (Cond.getValueType() != MVT::i1)
43755 return SDValue();
43756
43757 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
43758 // 3, 5, or 9 with i32/i64, so those get transformed too.
43759 // TODO: For constants that overflow or do not differ by power-of-2 or small
43760 // multiplier, convert to 'and' + 'add'.
43761 const APInt &TrueVal = TrueC->getAPIntValue();
43762 const APInt &FalseVal = FalseC->getAPIntValue();
43763
43764 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
43765 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
43766 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
43767 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
43768 if (CC == ISD::SETEQ || CC == ISD::SETNE)
43769 return SDValue();
43770 }
43771
43772 bool OV;
43773 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
43774 if (OV)
43775 return SDValue();
43776
43777 APInt AbsDiff = Diff.abs();
43778 if (AbsDiff.isPowerOf2() ||
43779 ((VT == MVT::i32 || VT == MVT::i64) &&
43780 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
43781
43782 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
43783 // of the condition can usually be folded into a compare predicate, but even
43784 // without that, the sequence should be cheaper than a CMOV alternative.
43785 if (TrueVal.slt(FalseVal)) {
43786 Cond = DAG.getNOT(DL, Cond, MVT::i1);
43787 std::swap(TrueC, FalseC);
43788 }
43789
43790 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
43791 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
43792
43793 // Multiply condition by the difference if non-one.
43794 if (!AbsDiff.isOne())
43795 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
43796
43797 // Add the base if non-zero.
43798 if (!FalseC->isZero())
43799 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
43800
43801 return R;
43802 }
43803
43804 return SDValue();
43805}
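
combineSelectOfTwoConstants rewrites a select of two constants into a multiply-and-add on the zero-extended condition, first swapping the arms (and inverting the condition) so the multiplier is non-negative. A standalone scalar version of the same arithmetic, offered as an illustration rather than the DAG code:

#include <cassert>
#include <cstdint>

static int64_t selectOfTwoConstants(bool Cond, int64_t TC, int64_t FC) {
  if (TC < FC) { // need a non-negative multiplier: invert Cond, swap arms
    Cond = !Cond;
    int64_t Tmp = TC; TC = FC; FC = Tmp;
  }
  // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
  return int64_t(Cond) * (TC - FC) + FC;
}

int main() {
  for (bool Cond : {false, true}) {
    assert(selectOfTwoConstants(Cond, 12, 4) == (Cond ? 12 : 4));
    assert(selectOfTwoConstants(Cond, -3, 6) == (Cond ? -3 : 6));
  }
  return 0;
}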
43806
43807/// If this is a *dynamic* select (non-constant condition) and we can match
43808/// this node with one of the variable blend instructions, restructure the
43809/// condition so that blends can use the high (sign) bit of each element.
43810/// This function will also call SimplifyDemandedBits on already created
43811/// BLENDV to perform additional simplifications.
43812static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
43813 TargetLowering::DAGCombinerInfo &DCI,
43814 const X86Subtarget &Subtarget) {
43815 SDValue Cond = N->getOperand(0);
43816 if ((N->getOpcode() != ISD::VSELECT &&
43817 N->getOpcode() != X86ISD::BLENDV) ||
43818 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
43819 return SDValue();
43820
43821 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43822 unsigned BitWidth = Cond.getScalarValueSizeInBits();
43823 EVT VT = N->getValueType(0);
43824
43825 // We can only handle the cases where VSELECT is directly legal on the
43826 // subtarget. We custom lower VSELECT nodes with constant conditions and
43827 // this makes it hard to see whether a dynamic VSELECT will correctly
43828 // lower, so we both check the operation's status and explicitly handle the
43829 // cases where a *dynamic* blend will fail even though a constant-condition
43830 // blend could be custom lowered.
43831 // FIXME: We should find a better way to handle this class of problems.
43832 // Potentially, we should combine constant-condition vselect nodes
43833 // pre-legalization into shuffles and not mark as many types as custom
43834 // lowered.
43835 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
43836 return SDValue();
43837 // FIXME: We don't support i16-element blends currently. We could and
43838 // should support them by making *all* the bits in the condition be set
43839 // rather than just the high bit and using an i8-element blend.
43840 if (VT.getVectorElementType() == MVT::i16)
43841 return SDValue();
43842 // Dynamic blending was only available from SSE4.1 onward.
43843 if (VT.is128BitVector() && !Subtarget.hasSSE41())
43844 return SDValue();
43845 // Byte blends are only available in AVX2
43846 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
43847 return SDValue();
43848 // There are no 512-bit blend instructions that use sign bits.
43849 if (VT.is512BitVector())
43850 return SDValue();
43851
43852 // Don't optimize before the condition has been transformed to a legal type
43853 // and don't ever optimize vector selects that map to AVX512 mask-registers.
43854 if (BitWidth < 8 || BitWidth > 64)
43855 return SDValue();
43856
43857 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
43858 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
43859 UI != UE; ++UI)
43860 if ((UI->getOpcode() != ISD::VSELECT &&
43861 UI->getOpcode() != X86ISD::BLENDV) ||
43862 UI.getOperandNo() != 0)
43863 return false;
43864
43865 return true;
43866 };
43867
43868 APInt DemandedBits(APInt::getSignMask(BitWidth));
43869
43870 if (OnlyUsedAsSelectCond(Cond)) {
43871 KnownBits Known;
43872 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
43873 !DCI.isBeforeLegalizeOps());
43874 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
43875 return SDValue();
43876
43877 // If we changed the computation somewhere in the DAG, this change will
43878 // affect all users of Cond. Update all the nodes so that we do not use
43879 // the generic VSELECT anymore. Otherwise, we may perform wrong
43880 // optimizations as we messed with the actual expectation for the vector
43881 // boolean values.
43882 for (SDNode *U : Cond->uses()) {
43883 if (U->getOpcode() == X86ISD::BLENDV)
43884 continue;
43885
43886 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
43887 Cond, U->getOperand(1), U->getOperand(2));
43888 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
43889 DCI.AddToWorklist(U);
43890 }
43891 DCI.CommitTargetLoweringOpt(TLO);
43892 return SDValue(N, 0);
43893 }
43894
43895 // Otherwise we can still at least try to simplify multiple use bits.
43896 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
43897 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
43898 N->getOperand(1), N->getOperand(2));
43899
43900 return SDValue();
43901}
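
The BLENDV forms produced above consume only the sign (high) bit of each condition element, which is why SimplifyDemandedBits is invoked with just the sign mask. A standalone byte-level sketch of that behaviour (an illustrative model of the instruction semantics, not taken from this file):

#include <cassert>
#include <cstdint>

static uint8_t blendvByte(uint8_t Cond, uint8_t T, uint8_t F) {
  return (Cond & 0x80) ? T : F; // only bit 7 of the condition matters
}

int main() {
  // Conditions that differ everywhere except the sign bit blend identically.
  assert(blendvByte(0x80, 1, 2) == blendvByte(0xFF, 1, 2));
  assert(blendvByte(0x00, 1, 2) == blendvByte(0x7F, 1, 2));
  return 0;
}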
43902
43903// Try to match:
43904// (or (and (M, (sub 0, X)), (pandn M, X)))
43905// which is a special case of:
43906// (select M, (sub 0, X), X)
43907// Per:
43908// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
43909// We know that, if fNegate is 0 or 1:
43910// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
43911//
43912// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
43913// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
43914// ( M ? -X : X) == ((X ^ M ) + (M & 1))
43915// This lets us transform our vselect to:
43916// (add (xor X, M), (and M, 1))
43917// And further to:
43918// (sub (xor X, M), M)
43919static SDValue combineLogicBlendIntoConditionalNegate(
43920 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
43921 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
43922 EVT MaskVT = Mask.getValueType();
43923   assert(MaskVT.isInteger() &&
43924          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
43925          "Mask must be zero/all-bits");
43926
43927 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
43928 return SDValue();
43929 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
43930 return SDValue();
43931
43932 auto IsNegV = [](SDNode *N, SDValue V) {
43933 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
43934 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
43935 };
43936
43937 SDValue V;
43938 if (IsNegV(Y.getNode(), X))
43939 V = X;
43940 else if (IsNegV(X.getNode(), Y))
43941 V = Y;
43942 else
43943 return SDValue();
43944
43945 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
43946 SDValue SubOp2 = Mask;
43947
43948 // If the negate was on the false side of the select, then
43949 // the operands of the SUB need to be swapped. PR 27251.
43950 // This is because the pattern being matched above is
43951   // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
43952   // but if the pattern matched was
43953   // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
43954 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
43955 // pattern also needs to be a negation of the replacement pattern above.
43956 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
43957 // sub accomplishes the negation of the replacement pattern.
43958 if (V == Y)
43959 std::swap(SubOp1, SubOp2);
43960
43961 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
43962 return DAG.getBitcast(VT, Res);
43963}
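
The conditional-negate identity derived in the comment block above — (M ? -X : X) == (sub (xor X, M), M) for an all-zeros/all-ones mask M — checked with scalar two's-complement arithmetic (a standalone illustration):

#include <cassert>
#include <cstdint>

static int32_t condNegateViaSub(int32_t X, int32_t M) {
  // (M ? -X : X) == (sub (xor X, M), M) when M is 0 or -1.
  return int32_t(uint32_t(X ^ M) - uint32_t(M));
}

int main() {
  for (int32_t M : {0, -1})
    for (int32_t X : {0, 1, -1, 42, -12345})
      assert(condNegateViaSub(X, M) == (M ? -X : X));
  return 0;
}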
43964
43965/// Do target-specific dag combines on SELECT and VSELECT nodes.
43966static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
43967 TargetLowering::DAGCombinerInfo &DCI,
43968 const X86Subtarget &Subtarget) {
43969 SDLoc DL(N);
43970 SDValue Cond = N->getOperand(0);
43971 SDValue LHS = N->getOperand(1);
43972 SDValue RHS = N->getOperand(2);
43973
43974 // Try simplification again because we use this function to optimize
43975 // BLENDV nodes that are not handled by the generic combiner.
43976 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
43977 return V;
43978
43979 EVT VT = LHS.getValueType();
43980 EVT CondVT = Cond.getValueType();
43981 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43982 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
43983
43984 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
43985 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
43986 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
43987 if (CondVT.isVector() && CondVT.isInteger() &&
43988 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
43989 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
43990 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
43991 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
43992 DL, DAG, Subtarget))
43993 return V;
43994
43995 // Convert vselects with constant condition into shuffles.
43996 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
43997 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
43998 SmallVector<int, 64> Mask;
43999 if (createShuffleMaskFromVSELECT(Mask, Cond,
44000 N->getOpcode() == X86ISD::BLENDV))
44001 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
44002 }
44003
44004 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
44005 // by forcing the unselected elements to zero.
44006 // TODO: Can we handle more shuffles with this?
44007 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
44008 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
44009 LHS.hasOneUse() && RHS.hasOneUse()) {
44010 MVT SimpleVT = VT.getSimpleVT();
44011 SmallVector<SDValue, 1> LHSOps, RHSOps;
44012 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
44013 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
44014 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
44015 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
44016 int NumElts = VT.getVectorNumElements();
44017 for (int i = 0; i != NumElts; ++i) {
44018 // getConstVector sets negative shuffle mask values as undef, so ensure
44019 // we hardcode SM_SentinelZero values to zero (0x80).
44020 if (CondMask[i] < NumElts) {
44021 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
44022 RHSMask[i] = 0x80;
44023 } else {
44024 LHSMask[i] = 0x80;
44025 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
44026 }
44027 }
44028 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
44029 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
44030 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
44031 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
44032 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
44033 }
44034 }
44035
44036 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
44037 // instructions match the semantics of the common C idiom x<y?x:y but not
44038 // x<=y?x:y, because of how they handle negative zero (which can be
44039 // ignored in unsafe-math mode).
44040 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
44041 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
44042 VT != MVT::f80 && VT != MVT::f128 &&
44043 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
44044 (Subtarget.hasSSE2() ||
44045 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
44046 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44047
44048 unsigned Opcode = 0;
44049 // Check for x CC y ? x : y.
44050 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
44051 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
44052 switch (CC) {
44053 default: break;
44054 case ISD::SETULT:
44055 // Converting this to a min would handle NaNs incorrectly, and swapping
44056 // the operands would cause it to handle comparisons between positive
44057 // and negative zero incorrectly.
44058 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44059 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44060 !(DAG.isKnownNeverZeroFloat(LHS) ||
44061 DAG.isKnownNeverZeroFloat(RHS)))
44062 break;
44063 std::swap(LHS, RHS);
44064 }
44065 Opcode = X86ISD::FMIN;
44066 break;
44067 case ISD::SETOLE:
44068 // Converting this to a min would handle comparisons between positive
44069 // and negative zero incorrectly.
44070 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44071 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44072 break;
44073 Opcode = X86ISD::FMIN;
44074 break;
44075 case ISD::SETULE:
44076 // Converting this to a min would handle both negative zeros and NaNs
44077 // incorrectly, but we can swap the operands to fix both.
44078 std::swap(LHS, RHS);
44079         LLVM_FALLTHROUGH;
44080 case ISD::SETOLT:
44081 case ISD::SETLT:
44082 case ISD::SETLE:
44083 Opcode = X86ISD::FMIN;
44084 break;
44085
44086 case ISD::SETOGE:
44087 // Converting this to a max would handle comparisons between positive
44088 // and negative zero incorrectly.
44089 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44090 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44091 break;
44092 Opcode = X86ISD::FMAX;
44093 break;
44094 case ISD::SETUGT:
44095 // Converting this to a max would handle NaNs incorrectly, and swapping
44096 // the operands would cause it to handle comparisons between positive
44097 // and negative zero incorrectly.
44098 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44099 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44100 !(DAG.isKnownNeverZeroFloat(LHS) ||
44101 DAG.isKnownNeverZeroFloat(RHS)))
44102 break;
44103 std::swap(LHS, RHS);
44104 }
44105 Opcode = X86ISD::FMAX;
44106 break;
44107 case ISD::SETUGE:
44108 // Converting this to a max would handle both negative zeros and NaNs
44109 // incorrectly, but we can swap the operands to fix both.
44110 std::swap(LHS, RHS);
44111         LLVM_FALLTHROUGH;
44112 case ISD::SETOGT:
44113 case ISD::SETGT:
44114 case ISD::SETGE:
44115 Opcode = X86ISD::FMAX;
44116 break;
44117 }
44118 // Check for x CC y ? y : x -- a min/max with reversed arms.
44119 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
44120 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
44121 switch (CC) {
44122 default: break;
44123 case ISD::SETOGE:
44124 // Converting this to a min would handle comparisons between positive
44125 // and negative zero incorrectly, and swapping the operands would
44126 // cause it to handle NaNs incorrectly.
44127 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44128 !(DAG.isKnownNeverZeroFloat(LHS) ||
44129 DAG.isKnownNeverZeroFloat(RHS))) {
44130 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44131 break;
44132 std::swap(LHS, RHS);
44133 }
44134 Opcode = X86ISD::FMIN;
44135 break;
44136 case ISD::SETUGT:
44137 // Converting this to a min would handle NaNs incorrectly.
44138 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44139 break;
44140 Opcode = X86ISD::FMIN;
44141 break;
44142 case ISD::SETUGE:
44143 // Converting this to a min would handle both negative zeros and NaNs
44144 // incorrectly, but we can swap the operands to fix both.
44145 std::swap(LHS, RHS);
44146         LLVM_FALLTHROUGH;
44147 case ISD::SETOGT:
44148 case ISD::SETGT:
44149 case ISD::SETGE:
44150 Opcode = X86ISD::FMIN;
44151 break;
44152
44153 case ISD::SETULT:
44154 // Converting this to a max would handle NaNs incorrectly.
44155 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44156 break;
44157 Opcode = X86ISD::FMAX;
44158 break;
44159 case ISD::SETOLE:
44160 // Converting this to a max would handle comparisons between positive
44161 // and negative zero incorrectly, and swapping the operands would
44162 // cause it to handle NaNs incorrectly.
44163 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44164 !DAG.isKnownNeverZeroFloat(LHS) &&
44165 !DAG.isKnownNeverZeroFloat(RHS)) {
44166 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44167 break;
44168 std::swap(LHS, RHS);
44169 }
44170 Opcode = X86ISD::FMAX;
44171 break;
44172 case ISD::SETULE:
44173 // Converting this to a max would handle both negative zeros and NaNs
44174 // incorrectly, but we can swap the operands to fix both.
44175 std::swap(LHS, RHS);
44176         LLVM_FALLTHROUGH;
44177 case ISD::SETOLT:
44178 case ISD::SETLT:
44179 case ISD::SETLE:
44180 Opcode = X86ISD::FMAX;
44181 break;
44182 }
44183 }
44184
44185 if (Opcode)
44186 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
44187 }
44188
44189 // Some mask scalar intrinsics rely on checking if only one bit is set
44190 // and implement it in C code like this:
44191 // A[0] = (U & 1) ? A[0] : W[0];
44192 // This creates some redundant instructions that break pattern matching.
44193 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
44194 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
44195 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
44196 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44197 SDValue AndNode = Cond.getOperand(0);
44198 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
44199 isNullConstant(Cond.getOperand(1)) &&
44200 isOneConstant(AndNode.getOperand(1))) {
44201 // LHS and RHS swapped due to
44202 // setcc outputting 1 when AND resulted in 0 and vice versa.
44203 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
44204 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
44205 }
44206 }
44207
44208 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
44209 // lowering on KNL. In this case we convert it to
44210 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
44211   // The same situation applies to all vectors of i8 and i16 without BWI.
44212 // Make sure we extend these even before type legalization gets a chance to
44213 // split wide vectors.
44214 // Since SKX these selects have a proper lowering.
44215 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
44216 CondVT.getVectorElementType() == MVT::i1 &&
44217 (VT.getVectorElementType() == MVT::i8 ||
44218 VT.getVectorElementType() == MVT::i16)) {
44219 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
44220 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
44221 }
44222
44223 // AVX512 - Extend select with zero to merge with target shuffle.
44224 // select(mask, extract_subvector(shuffle(x)), zero) -->
44225 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
44226 // TODO - support non target shuffles as well.
44227 if (Subtarget.hasAVX512() && CondVT.isVector() &&
44228 CondVT.getVectorElementType() == MVT::i1) {
44229 auto SelectableOp = [&TLI](SDValue Op) {
44230 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44231 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
44232 isNullConstant(Op.getOperand(1)) &&
44233 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
44234 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
44235 };
44236
44237 bool SelectableLHS = SelectableOp(LHS);
44238 bool SelectableRHS = SelectableOp(RHS);
44239 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
44240 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
44241
44242 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
44243 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
44244 : RHS.getOperand(0).getValueType();
44245 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
44246 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
44247 VT.getSizeInBits());
44248 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
44249 VT.getSizeInBits());
44250 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
44251 DAG.getUNDEF(SrcCondVT), Cond,
44252 DAG.getIntPtrConstant(0, DL));
44253 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
44254 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
44255 }
44256 }
44257
44258 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
44259 return V;
44260
44261 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
44262 Cond.hasOneUse()) {
44263 EVT CondVT = Cond.getValueType();
44264 SDValue Cond0 = Cond.getOperand(0);
44265 SDValue Cond1 = Cond.getOperand(1);
44266 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44267
44268 // Canonicalize min/max:
44269 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
44270 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
44271 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
44272 // the need for an extra compare against zero. e.g.
44273     // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
44274 // subl %esi, %edi
44275 // testl %edi, %edi
44276 // movl $0, %eax
44277 // cmovgl %edi, %eax
44278 // =>
44279 // xorl %eax, %eax
44280 // subl %esi, $edi
44281 // cmovsl %eax, %edi
44282 //
44283 // We can also canonicalize
44284 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
44285 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
44286 // This allows the use of a test instruction for the compare.
44287 if (LHS == Cond0 && RHS == Cond1) {
44288 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
44289 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
44290 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
44291 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44292 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44293 }
44294 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
44295 ISD::CondCode NewCC = ISD::SETUGE;
44296 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44297 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44298 }
44299 }
44300
44301 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
44302 // fold eq + gt/lt nested selects into ge/le selects
44303 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
44304 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
44305 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
44306 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
44307 // .. etc ..
44308 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
44309 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
44310 SDValue InnerSetCC = RHS.getOperand(0);
44311 ISD::CondCode InnerCC =
44312 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
44313 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
44314 Cond0 == InnerSetCC.getOperand(0) &&
44315 Cond1 == InnerSetCC.getOperand(1)) {
44316 ISD::CondCode NewCC;
44317 switch (CC == ISD::SETEQ ? InnerCC : CC) {
44318 case ISD::SETGT: NewCC = ISD::SETGE; break;
44319 case ISD::SETLT: NewCC = ISD::SETLE; break;
44320 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
44321 case ISD::SETULT: NewCC = ISD::SETULE; break;
44322 default: NewCC = ISD::SETCC_INVALID; break;
44323 }
44324 if (NewCC != ISD::SETCC_INVALID) {
44325 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
44326 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
44327 }
44328 }
44329 }
44330 }
44331
44332 // Check if the first operand is all zeros and Cond type is vXi1.
44333   // If this is an avx512 target, we can improve the use of zero masking by
44334 // swapping the operands and inverting the condition.
44335 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
44336 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
44337 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
44338 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
44339 // Invert the cond to not(cond) : xor(op,allones)=not(op)
44340 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
44341 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
44342 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
44343 }
44344
44345 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
44346 // get split by legalization.
44347 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
44348 CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
44349 TLI.isTypeLegal(VT.getScalarType())) {
44350 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
44351 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
44352 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
44353 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
44354 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
44355 }
44356 }
44357
44358 // Early exit check
44359 if (!TLI.isTypeLegal(VT))
44360 return SDValue();
44361
44362 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
44363 return V;
44364
44365 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
44366 return V;
44367
44368 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
44369 return V;
44370
44371 // select(~Cond, X, Y) -> select(Cond, Y, X)
44372 if (CondVT.getScalarType() != MVT::i1) {
44373 if (SDValue CondNot = IsNOT(Cond, DAG))
44374 return DAG.getNode(N->getOpcode(), DL, VT,
44375 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
44376 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
44377 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
44378 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
44379 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
44380 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
44381 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
44382 }
44383 }
44384
44385 // Try to optimize vXi1 selects if both operands are either all constants or
44386 // bitcasts from scalar integer type. In that case we can convert the operands
44387 // to integer and use an integer select which will be converted to a CMOV.
44388 // We need to take a little bit of care to avoid creating an i64 type after
44389 // type legalization.
44390 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
44391 VT.getVectorElementType() == MVT::i1 &&
44392 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
44393 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
44394 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
44395 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
44396
44397 if ((LHSIsConst ||
44398 (LHS.getOpcode() == ISD::BITCAST &&
44399 LHS.getOperand(0).getValueType() == IntVT)) &&
44400 (RHSIsConst ||
44401 (RHS.getOpcode() == ISD::BITCAST &&
44402 RHS.getOperand(0).getValueType() == IntVT))) {
44403 if (LHSIsConst)
44404 LHS = combinevXi1ConstantToInteger(LHS, DAG);
44405 else
44406 LHS = LHS.getOperand(0);
44407
44408 if (RHSIsConst)
44409 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44410 else
44411 RHS = RHS.getOperand(0);
44412
44413 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
44414 return DAG.getBitcast(VT, Select);
44415 }
44416 }
44417
44418 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
44419 // single bits, then invert the predicate and swap the select operands.
44420 // This can lower using a vector shift bit-hack rather than mask and compare.
44421 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
44422 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
44423 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
44424 Cond.getOperand(0).getOpcode() == ISD::AND &&
44425 isNullOrNullSplat(Cond.getOperand(1)) &&
44426 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
44427 Cond.getOperand(0).getValueType() == VT) {
44428 // The 'and' mask must be composed of power-of-2 constants.
44429 SDValue And = Cond.getOperand(0);
44430 auto *C = isConstOrConstSplat(And.getOperand(1));
44431 if (C && C->getAPIntValue().isPowerOf2()) {
44432 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
44433 SDValue NotCond =
44434 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
44435 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
44436 }
44437
44438 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
44439 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
44440 // 16-bit lacks a proper blendv.
44441 unsigned EltBitWidth = VT.getScalarSizeInBits();
44442 bool CanShiftBlend =
44443 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
44444 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
44445 (Subtarget.hasXOP()));
44446 if (CanShiftBlend &&
44447 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
44448 return C->getAPIntValue().isPowerOf2();
44449 })) {
44450 // Create a left-shift constant to get the mask bits over to the sign-bit.
44451 SDValue Mask = And.getOperand(1);
44452 SmallVector<int, 32> ShlVals;
44453 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
44454 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
44455 ShlVals.push_back(EltBitWidth - 1 -
44456 MaskVal->getAPIntValue().exactLogBase2());
44457 }
44458 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
44459 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
44460 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
44461 SDValue NewCond =
44462 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
44463 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
44464 }
44465 }
44466
44467 return SDValue();
44468}
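
The final block of combineSelect turns a power-of-2 mask test into a shift that moves the tested bit into the sign position, with the select arms swapped. A standalone scalar sketch of that equivalence; the 32-bit element width is an assumption of the example, which is illustrative only:

#include <cassert>
#include <cstdint>

static int32_t selectViaShift(uint32_t X, unsigned Log2C, int32_t LHS,
                              int32_t RHS) {
  unsigned ShlAmt = 31 - Log2C;           // EltBitWidth - 1 - log2(C)
  int32_t Shifted = int32_t(X << ShlAmt); // mask bit now sits in the sign bit
  return (Shifted < 0) ? RHS : LHS;       // swapped arms, inverted test
}

int main() {
  for (uint32_t X : {0u, 1u, 4u, 5u, 0xFFFFFFFFu}) {
    unsigned Log2C = 2; // C = 4
    int32_t Expected = ((X & 4u) == 0) ? 10 : 20;
    assert(selectViaShift(X, Log2C, 10, 20) == Expected);
  }
  return 0;
}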
44469
44470/// Combine:
44471/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
44472/// to:
44473/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
44474/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
44475/// Note that this is only legal for some op/cc combinations.
44476static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
44477 SelectionDAG &DAG,
44478 const X86Subtarget &Subtarget) {
44479 // This combine only operates on CMP-like nodes.
44480 if (!(Cmp.getOpcode() == X86ISD::CMP ||
44481 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
44482 return SDValue();
44483
44484 // Can't replace the cmp if it has more uses than the one we're looking at.
44485 // FIXME: We would like to be able to handle this, but would need to make sure
44486 // all uses were updated.
44487 if (!Cmp.hasOneUse())
44488 return SDValue();
44489
44490 // This only applies to variations of the common case:
44491 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
44492 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
44493 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
44494 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
44495 // Using the proper condcodes (see below), overflow is checked for.
44496
44497 // FIXME: We can generalize both constraints:
44498 // - XOR/OR/AND (if they were made to survive AtomicExpand)
44499 // - LHS != 1
44500 // if the result is compared.
44501
44502 SDValue CmpLHS = Cmp.getOperand(0);
44503 SDValue CmpRHS = Cmp.getOperand(1);
44504 EVT CmpVT = CmpLHS.getValueType();
44505
44506 if (!CmpLHS.hasOneUse())
44507 return SDValue();
44508
44509 unsigned Opc = CmpLHS.getOpcode();
44510 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
44511 return SDValue();
44512
44513 SDValue OpRHS = CmpLHS.getOperand(2);
44514 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
44515 if (!OpRHSC)
44516 return SDValue();
44517
44518 APInt Addend = OpRHSC->getAPIntValue();
44519 if (Opc == ISD::ATOMIC_LOAD_SUB)
44520 Addend = -Addend;
44521
44522 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
44523 if (!CmpRHSC)
44524 return SDValue();
44525
44526 APInt Comparison = CmpRHSC->getAPIntValue();
44527 APInt NegAddend = -Addend;
44528
44529 // See if we can adjust the CC to make the comparison match the negated
44530 // addend.
44531 if (Comparison != NegAddend) {
44532 APInt IncComparison = Comparison + 1;
44533 if (IncComparison == NegAddend) {
44534 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
44535 Comparison = IncComparison;
44536 CC = X86::COND_AE;
44537 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
44538 Comparison = IncComparison;
44539 CC = X86::COND_L;
44540 }
44541 }
44542 APInt DecComparison = Comparison - 1;
44543 if (DecComparison == NegAddend) {
44544 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
44545 Comparison = DecComparison;
44546 CC = X86::COND_A;
44547 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
44548 Comparison = DecComparison;
44549 CC = X86::COND_LE;
44550 }
44551 }
44552 }
44553
44554 // If the addend is the negation of the comparison value, then we can do
44555 // a full comparison by emitting the atomic arithmetic as a locked sub.
44556 if (Comparison == NegAddend) {
44557 // The CC is fine, but we need to rewrite the LHS of the comparison as an
44558 // atomic sub.
44559 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
44560 auto AtomicSub = DAG.getAtomic(
44561 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
44562 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
44563 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
44564 AN->getMemOperand());
44565 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
44566 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44567 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44568 return LockOp;
44569 }
44570
44571 // We can handle comparisons with zero in a number of cases by manipulating
44572 // the CC used.
44573 if (!Comparison.isZero())
44574 return SDValue();
44575
44576 if (CC == X86::COND_S && Addend == 1)
44577 CC = X86::COND_LE;
44578 else if (CC == X86::COND_NS && Addend == 1)
44579 CC = X86::COND_G;
44580 else if (CC == X86::COND_G && Addend == -1)
44581 CC = X86::COND_GE;
44582 else if (CC == X86::COND_LE && Addend == -1)
44583 CC = X86::COND_L;
44584 else
44585 return SDValue();
44586
44587 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
44588 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44589 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44590 return LockOp;
44591}
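// Illustrative source pattern (editor's sketch, not from this file): a C++
// routine expected to exercise the atomic-arith combine above when built for
// x86-64 at -O2. The "lock sub + branch on flags" outcome is an assumption
// about typical codegen, not a guarantee.
#include <atomic>

static bool refDroppedToZero(std::atomic<int> &RefCount) {
  // The RMW result is only compared against the negation of the subtracted
  // amount, so the backend can emit a locked subtract and consume EFLAGS
  // directly instead of keeping the loaded value live for a separate compare.
  return RefCount.fetch_sub(1, std::memory_order_acq_rel) == 1;
}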
44592
44593// Check whether a boolean test is testing a boolean value generated by
44594// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
44595// code.
44596//
44597// Simplify the following patterns:
44598// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
44599// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
44600// to (Op EFLAGS Cond)
44601//
44602// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
44603// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
44604// to (Op EFLAGS !Cond)
44605//
44606// where Op could be BRCOND or CMOV.
44607//
44608static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
44609 // This combine only operates on CMP-like nodes.
44610 if (!(Cmp.getOpcode() == X86ISD::CMP ||
44611 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
44612 return SDValue();
44613
44614 // Quit if not used as a boolean value.
44615 if (CC != X86::COND_E && CC != X86::COND_NE)
44616 return SDValue();
44617
44618 // Check CMP operands. One of them should be 0 or 1 and the other should be
44619  // a SetCC or a value extended from it.
44620 SDValue Op1 = Cmp.getOperand(0);
44621 SDValue Op2 = Cmp.getOperand(1);
44622
44623 SDValue SetCC;
44624 const ConstantSDNode* C = nullptr;
44625 bool needOppositeCond = (CC == X86::COND_E);
44626 bool checkAgainstTrue = false; // Is it a comparison against 1?
44627
44628 if ((C = dyn_cast<ConstantSDNode>(Op1)))
44629 SetCC = Op2;
44630 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
44631 SetCC = Op1;
44632  else // Quit if neither operand is a constant.
44633 return SDValue();
44634
44635 if (C->getZExtValue() == 1) {
44636 needOppositeCond = !needOppositeCond;
44637 checkAgainstTrue = true;
44638 } else if (C->getZExtValue() != 0)
44639    // Quit if the constant is neither 0 nor 1.
44640 return SDValue();
44641
44642 bool truncatedToBoolWithAnd = false;
44643 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
44644 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
44645 SetCC.getOpcode() == ISD::TRUNCATE ||
44646 SetCC.getOpcode() == ISD::AND) {
44647 if (SetCC.getOpcode() == ISD::AND) {
44648 int OpIdx = -1;
44649 if (isOneConstant(SetCC.getOperand(0)))
44650 OpIdx = 1;
44651 if (isOneConstant(SetCC.getOperand(1)))
44652 OpIdx = 0;
44653 if (OpIdx < 0)
44654 break;
44655 SetCC = SetCC.getOperand(OpIdx);
44656 truncatedToBoolWithAnd = true;
44657 } else
44658 SetCC = SetCC.getOperand(0);
44659 }
44660
44661 switch (SetCC.getOpcode()) {
44662 case X86ISD::SETCC_CARRY:
44663 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
44664 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
44665 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
44666 // truncated to i1 using 'and'.
44667 if (checkAgainstTrue && !truncatedToBoolWithAnd)
44668 break;
44669    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
44670           "Invalid use of SETCC_CARRY!");
44671    LLVM_FALLTHROUGH;
44672 case X86ISD::SETCC:
44673 // Set the condition code or opposite one if necessary.
44674 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
44675 if (needOppositeCond)
44676 CC = X86::GetOppositeBranchCondition(CC);
44677 return SetCC.getOperand(1);
44678 case X86ISD::CMOV: {
44679 // Check whether false/true value has canonical one, i.e. 0 or 1.
44680 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
44681 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
44682 // Quit if true value is not a constant.
44683 if (!TVal)
44684 return SDValue();
44685 // Quit if false value is not a constant.
44686 if (!FVal) {
44687 SDValue Op = SetCC.getOperand(0);
44688 // Skip 'zext' or 'trunc' node.
44689 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
44690 Op.getOpcode() == ISD::TRUNCATE)
44691 Op = Op.getOperand(0);
44692 // A special case for rdrand/rdseed, where 0 is set if false cond is
44693 // found.
44694 if ((Op.getOpcode() != X86ISD::RDRAND &&
44695 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
44696 return SDValue();
44697 }
44698 // Quit if false value is not the constant 0 or 1.
44699 bool FValIsFalse = true;
44700 if (FVal && FVal->getZExtValue() != 0) {
44701 if (FVal->getZExtValue() != 1)
44702 return SDValue();
44703 // If FVal is 1, opposite cond is needed.
44704 needOppositeCond = !needOppositeCond;
44705 FValIsFalse = false;
44706 }
44707 // Quit if TVal is not the constant opposite of FVal.
44708 if (FValIsFalse && TVal->getZExtValue() != 1)
44709 return SDValue();
44710 if (!FValIsFalse && TVal->getZExtValue() != 0)
44711 return SDValue();
44712 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
44713 if (needOppositeCond)
44714 CC = X86::GetOppositeBranchCondition(CC);
44715 return SetCC.getOperand(3);
44716 }
44717 }
44718
44719 return SDValue();
44720}
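// Illustrative source pattern (editor's sketch, not from this file): RDRAND
// returns its success bit as a 0/1 integer, and branching on that bit is the
// CMOV-of-0/value shape handled in the X86ISD::CMOV case above. Requires a
// target with -mrdrnd; the exact DAG shape reaching ISel is an assumption.
#include <immintrin.h>

static unsigned nextHardwareRandom() {
  unsigned R;
  // _rdrand32_step reports success via the carry flag, surfaced as an int;
  // the combine lets the branch test that flag instead of re-comparing the
  // materialized boolean against zero.
  while (!_rdrand32_step(&R)) {
    // Retry until the hardware returns a valid value.
  }
  return R;
}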
44721
44722/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
44723/// Match:
44724/// (X86or (X86setcc) (X86setcc))
44725/// (X86cmp (and (X86setcc) (X86setcc)), 0)
44726static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
44727 X86::CondCode &CC1, SDValue &Flags,
44728 bool &isAnd) {
44729 if (Cond->getOpcode() == X86ISD::CMP) {
44730 if (!isNullConstant(Cond->getOperand(1)))
44731 return false;
44732
44733 Cond = Cond->getOperand(0);
44734 }
44735
44736 isAnd = false;
44737
44738 SDValue SetCC0, SetCC1;
44739 switch (Cond->getOpcode()) {
44740 default: return false;
44741 case ISD::AND:
44742 case X86ISD::AND:
44743 isAnd = true;
44744    LLVM_FALLTHROUGH;
44745 case ISD::OR:
44746 case X86ISD::OR:
44747 SetCC0 = Cond->getOperand(0);
44748 SetCC1 = Cond->getOperand(1);
44749 break;
44750 };
44751
44752 // Make sure we have SETCC nodes, using the same flags value.
44753 if (SetCC0.getOpcode() != X86ISD::SETCC ||
44754 SetCC1.getOpcode() != X86ISD::SETCC ||
44755 SetCC0->getOperand(1) != SetCC1->getOperand(1))
44756 return false;
44757
44758 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
44759 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
44760 Flags = SetCC0->getOperand(1);
44761 return true;
44762}
44763
44764// When legalizing carry, we create carries via add X, -1
44765// If that comes from an actual carry, via setcc, we use the
44766// carry directly.
44767static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
44768 if (EFLAGS.getOpcode() == X86ISD::ADD) {
44769 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
44770 bool FoundAndLSB = false;
44771 SDValue Carry = EFLAGS.getOperand(0);
44772 while (Carry.getOpcode() == ISD::TRUNCATE ||
44773 Carry.getOpcode() == ISD::ZERO_EXTEND ||
44774 (Carry.getOpcode() == ISD::AND &&
44775 isOneConstant(Carry.getOperand(1)))) {
44776 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
44777 Carry = Carry.getOperand(0);
44778 }
44779 if (Carry.getOpcode() == X86ISD::SETCC ||
44780 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
44781 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
44782 uint64_t CarryCC = Carry.getConstantOperandVal(0);
44783 SDValue CarryOp1 = Carry.getOperand(1);
44784 if (CarryCC == X86::COND_B)
44785 return CarryOp1;
44786 if (CarryCC == X86::COND_A) {
44787 // Try to convert COND_A into COND_B in an attempt to facilitate
44788 // materializing "setb reg".
44789 //
44790 // Do not flip "e > c", where "c" is a constant, because Cmp
44791 // instruction cannot take an immediate as its first operand.
44792 //
44793 if (CarryOp1.getOpcode() == X86ISD::SUB &&
44794 CarryOp1.getNode()->hasOneUse() &&
44795 CarryOp1.getValueType().isInteger() &&
44796 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
44797 SDValue SubCommute =
44798 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
44799 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
44800 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
44801 }
44802 }
44803 // If this is a check of the z flag of an add with 1, switch to the
44804 // C flag.
44805 if (CarryCC == X86::COND_E &&
44806 CarryOp1.getOpcode() == X86ISD::ADD &&
44807 isOneConstant(CarryOp1.getOperand(1)))
44808 return CarryOp1;
44809 } else if (FoundAndLSB) {
44810 SDLoc DL(Carry);
44811 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
44812 if (Carry.getOpcode() == ISD::SRL) {
44813 BitNo = Carry.getOperand(1);
44814 Carry = Carry.getOperand(0);
44815 }
44816 return getBT(Carry, BitNo, DL, DAG);
44817 }
44818 }
44819 }
44820
44821 return SDValue();
44822}
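// Illustrative source pattern (editor's sketch, not from this file): a manual
// carry chain where the low-half carry is computed with a compare. Whether
// legalization reaches the "add X, -1" shape and this combine then reuses the
// real carry flag (add/adc) is an assumption about typical codegen.
static unsigned long long addWideLo(unsigned long long AL, unsigned long long BL,
                                    unsigned long long AH, unsigned long long BH,
                                    unsigned long long *HiOut) {
  unsigned long long Lo = AL + BL;
  unsigned long long Carry = (Lo < AL); // setcc producing the low-half carry
  *HiOut = AH + BH + Carry;             // candidate for adc once the flag is reused
  return Lo;
}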
44823
44824/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
44825/// to avoid the inversion.
44826static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
44827 SelectionDAG &DAG,
44828 const X86Subtarget &Subtarget) {
44829 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
44830 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
44831 EFLAGS.getOpcode() != X86ISD::TESTP)
44832 return SDValue();
44833
44834 // PTEST/TESTP sets EFLAGS as:
44835 // TESTZ: ZF = (Op0 & Op1) == 0
44836 // TESTC: CF = (~Op0 & Op1) == 0
44837 // TESTNZC: ZF == 0 && CF == 0
44838 EVT VT = EFLAGS.getValueType();
44839 SDValue Op0 = EFLAGS.getOperand(0);
44840 SDValue Op1 = EFLAGS.getOperand(1);
44841 EVT OpVT = Op0.getValueType();
44842
44843 // TEST*(~X,Y) == TEST*(X,Y)
44844 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
44845 X86::CondCode InvCC;
44846 switch (CC) {
44847 case X86::COND_B:
44848 // testc -> testz.
44849 InvCC = X86::COND_E;
44850 break;
44851 case X86::COND_AE:
44852 // !testc -> !testz.
44853 InvCC = X86::COND_NE;
44854 break;
44855 case X86::COND_E:
44856 // testz -> testc.
44857 InvCC = X86::COND_B;
44858 break;
44859 case X86::COND_NE:
44860 // !testz -> !testc.
44861 InvCC = X86::COND_AE;
44862 break;
44863 case X86::COND_A:
44864 case X86::COND_BE:
44865 // testnzc -> testnzc (no change).
44866 InvCC = CC;
44867 break;
44868 default:
44869 InvCC = X86::COND_INVALID;
44870 break;
44871 }
44872
44873 if (InvCC != X86::COND_INVALID) {
44874 CC = InvCC;
44875 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44876 DAG.getBitcast(OpVT, NotOp0), Op1);
44877 }
44878 }
44879
44880 if (CC == X86::COND_E || CC == X86::COND_NE) {
44881 // TESTZ(X,~Y) == TESTC(Y,X)
44882 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
44883 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44884 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44885 DAG.getBitcast(OpVT, NotOp1), Op0);
44886 }
44887
44888 if (Op0 == Op1) {
44889 SDValue BC = peekThroughBitcasts(Op0);
44890 EVT BCVT = BC.getValueType();
44891      assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
44892             "Unexpected vector type");
44893
44894 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
44895 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
44896 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44897 DAG.getBitcast(OpVT, BC.getOperand(0)),
44898 DAG.getBitcast(OpVT, BC.getOperand(1)));
44899 }
44900
44901 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
44902 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
44903 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44904 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44905 DAG.getBitcast(OpVT, BC.getOperand(0)),
44906 DAG.getBitcast(OpVT, BC.getOperand(1)));
44907 }
44908
44909 // If every element is an all-sign value, see if we can use MOVMSK to
44910 // more efficiently extract the sign bits and compare that.
44911 // TODO: Handle TESTC with comparison inversion.
44912 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
44913 // MOVMSK combines to make sure its never worse than PTEST?
44914 unsigned EltBits = BCVT.getScalarSizeInBits();
44915 if (DAG.ComputeNumSignBits(BC) == EltBits) {
44916        assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
44917 APInt SignMask = APInt::getSignMask(EltBits);
44918 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44919 if (SDValue Res =
44920 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
44921              // For vXi16 cases we need to use pmovmskb and extract every other
44922 // sign bit.
44923 SDLoc DL(EFLAGS);
44924 if (EltBits == 16) {
44925 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
44926 Res = DAG.getBitcast(MovmskVT, Res);
44927 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44928 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
44929 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
44930 } else {
44931 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44932 }
44933 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
44934 DAG.getConstant(0, DL, MVT::i32));
44935 }
44936 }
44937 }
44938
44939 // TESTZ(-1,X) == TESTZ(X,X)
44940 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
44941 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
44942
44943 // TESTZ(X,-1) == TESTZ(X,X)
44944 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
44945 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
44946
44947 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
44948 // TODO: Add COND_NE handling?
44949 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
44950 SDValue Src0 = peekThroughBitcasts(Op0);
44951 SDValue Src1 = peekThroughBitcasts(Op1);
44952 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
44953 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
44954 peekThroughBitcasts(Src0.getOperand(1)), true);
44955 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
44956 peekThroughBitcasts(Src1.getOperand(1)), true);
44957 if (Src0 && Src1)
44958 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44959 DAG.getBitcast(MVT::v4i64, Src0),
44960 DAG.getBitcast(MVT::v4i64, Src1));
44961 }
44962 }
44963 }
44964
44965 return SDValue();
44966}
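// Illustrative source pattern (editor's sketch, not from this file): PTEST of
// a NOT'ed operand, as in the TEST*(~X,Y) == TEST*(X,Y) rule above. Whether
// the xor-with-all-ones is recognized as the NOT and only the condition code
// is flipped (testz <-> testc) is an assumption; requires SSE4.1.
#include <immintrin.h>

static bool hasNoBitsOutsideMask(__m128i V, __m128i Mask) {
  // ZF of ptest(~Mask, V) equals CF of ptest(Mask, V): "V has no bits outside
  // Mask" can be answered without materializing the inverted mask.
  __m128i NotMask = _mm_xor_si128(Mask, _mm_set1_epi32(-1));
  return _mm_testz_si128(NotMask, V) != 0;
}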
44967
44968// Attempt to simplify the MOVMSK input based on the comparison type.
44969static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
44970 SelectionDAG &DAG,
44971 const X86Subtarget &Subtarget) {
44972 // Handle eq/ne against zero (any_of).
44973 // Handle eq/ne against -1 (all_of).
44974 if (!(CC == X86::COND_E || CC == X86::COND_NE))
44975 return SDValue();
44976 if (EFLAGS.getValueType() != MVT::i32)
44977 return SDValue();
44978 unsigned CmpOpcode = EFLAGS.getOpcode();
44979 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
44980 return SDValue();
44981 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
44982 if (!CmpConstant)
44983 return SDValue();
44984 const APInt &CmpVal = CmpConstant->getAPIntValue();
44985
44986 SDValue CmpOp = EFLAGS.getOperand(0);
44987 unsigned CmpBits = CmpOp.getValueSizeInBits();
44988  assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
44989
44990 // Peek through any truncate.
44991 if (CmpOp.getOpcode() == ISD::TRUNCATE)
44992 CmpOp = CmpOp.getOperand(0);
44993
44994 // Bail if we don't find a MOVMSK.
44995 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
44996 return SDValue();
44997
44998 SDValue Vec = CmpOp.getOperand(0);
44999 MVT VecVT = Vec.getSimpleValueType();
45000  assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
45001         "Unexpected MOVMSK operand");
45002 unsigned NumElts = VecVT.getVectorNumElements();
45003 unsigned NumEltBits = VecVT.getScalarSizeInBits();
45004
45005 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
45006 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
45007 NumElts <= CmpBits && CmpVal.isMask(NumElts);
45008 if (!IsAnyOf && !IsAllOf)
45009 return SDValue();
45010
45011 // See if we can peek through to a vector with a wider element type, if the
45012 // signbits extend down to all the sub-elements as well.
45013 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
45014 // potential SimplifyDemandedBits/Elts cases.
45015  // If we looked through a truncate that discarded bits, we can't do this
45016 // transform.
45017 // FIXME: We could do this transform for truncates that discarded bits by
45018 // inserting an AND mask between the new MOVMSK and the CMP.
45019 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
45020 SDValue BC = peekThroughBitcasts(Vec);
45021 MVT BCVT = BC.getSimpleValueType();
45022 unsigned BCNumElts = BCVT.getVectorNumElements();
45023 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
45024 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
45025 BCNumEltBits > NumEltBits &&
45026 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
45027 SDLoc DL(EFLAGS);
45028 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
45029 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45030 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
45031 DAG.getConstant(CmpMask, DL, MVT::i32));
45032 }
45033 }
45034
45035 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
45036 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
45037 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
45038 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
45039 if (VecVT.is256BitVector() && NumElts <= CmpBits) {
45040 SmallVector<SDValue> Ops;
45041 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
45042 Ops.size() == 2) {
45043 SDLoc DL(EFLAGS);
45044 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
45045 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
45046 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
45047 DAG.getBitcast(SubVT, Ops[0]),
45048 DAG.getBitcast(SubVT, Ops[1]));
45049 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
45050 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45051 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
45052 DAG.getConstant(CmpMask, DL, MVT::i32));
45053 }
45054 }
45055
45056 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
45057 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
45058 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
45059 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
45060 if (IsAllOf && Subtarget.hasSSE41()) {
45061 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
45062 SDValue BC = peekThroughBitcasts(Vec);
45063 // Ensure MOVMSK was testing every signbit of BC.
45064 if (BC.getValueType().getVectorNumElements() <= NumElts) {
45065 if (BC.getOpcode() == X86ISD::PCMPEQ) {
45066 SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
45067 BC.getOperand(0), BC.getOperand(1));
45068 V = DAG.getBitcast(TestVT, V);
45069 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45070 }
45071 // Check for 256-bit split vector cases.
45072 if (BC.getOpcode() == ISD::AND &&
45073 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
45074 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
45075 SDValue LHS = BC.getOperand(0);
45076 SDValue RHS = BC.getOperand(1);
45077 LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
45078 LHS.getOperand(0), LHS.getOperand(1));
45079 RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
45080 RHS.getOperand(0), RHS.getOperand(1));
45081 LHS = DAG.getBitcast(TestVT, LHS);
45082 RHS = DAG.getBitcast(TestVT, RHS);
45083 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
45084 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45085 }
45086 }
45087 }
45088
45089 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
45090 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
45091 // sign bits prior to the comparison with zero unless we know that
45092 // the vXi16 splats the sign bit down to the lower i8 half.
45093 // TODO: Handle all_of patterns.
45094 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
45095 SDValue VecOp0 = Vec.getOperand(0);
45096 SDValue VecOp1 = Vec.getOperand(1);
45097 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
45098 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
45099 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
45100 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
45101 SDLoc DL(EFLAGS);
45102 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
45103 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45104 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
45105 if (!SignExt0) {
45106 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
45107 DAG.getConstant(0xAAAA, DL, MVT::i16));
45108 }
45109 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45110 DAG.getConstant(0, DL, MVT::i16));
45111 }
45112 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
45113 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
45114 if (CmpBits >= 16 && Subtarget.hasInt256() &&
45115 (IsAnyOf || (SignExt0 && SignExt1))) {
45116 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
45117 SDLoc DL(EFLAGS);
45118 SDValue Result = peekThroughBitcasts(Src);
45119 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
45120 Result.getValueType().getVectorNumElements() <= NumElts) {
45121 SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
45122 Result.getOperand(0), Result.getOperand(1));
45123 V = DAG.getBitcast(MVT::v4i64, V);
45124 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45125 }
45126 Result = DAG.getBitcast(MVT::v32i8, Result);
45127 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45128 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
45129 if (!SignExt0 || !SignExt1) {
45130            assert(IsAnyOf &&
45131                   "Only perform v16i16 signmasks for any_of patterns");
45132 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
45133 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
45134 }
45135 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45136 DAG.getConstant(CmpMask, DL, MVT::i32));
45137 }
45138 }
45139 }
45140
45141 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
45142 SmallVector<int, 32> ShuffleMask;
45143 SmallVector<SDValue, 2> ShuffleInputs;
45144 if (NumElts <= CmpBits &&
45145 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
45146 ShuffleMask, DAG) &&
45147 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
45148 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
45149 unsigned NumShuffleElts = ShuffleMask.size();
45150 APInt DemandedElts = APInt::getZero(NumShuffleElts);
45151 for (int M : ShuffleMask) {
45152      assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
45153 DemandedElts.setBit(M);
45154 }
45155 if (DemandedElts.isAllOnes()) {
45156 SDLoc DL(EFLAGS);
45157 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
45158 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45159 Result =
45160 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
45161 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45162 EFLAGS.getOperand(1));
45163 }
45164 }
45165
45166 return SDValue();
45167}
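// Illustrative source patterns (editor's sketch, not from this file): the
// any_of / all_of idioms this MOVMSK combine targets. The PTEST rewrite of
// the all_of case assumes SSE4.1 and is an expectation, not a guarantee.
#include <immintrin.h>

static bool anyByteNegative(__m128i V) {
  // any_of: MOVMSK compared against 0; the combine may simplify the MOVMSK
  // input (peeking through packs, shuffles and concats) before the compare.
  return _mm_movemask_epi8(V) != 0;
}

static bool allBytesEqual(__m128i A, __m128i B) {
  // all_of: MOVMSK(PCMPEQ(A,B)) == 0xFFFF may instead be emitted as
  // PTEST(SUB(A,B), SUB(A,B)) per the fold above.
  __m128i Eq = _mm_cmpeq_epi8(A, B);
  return _mm_movemask_epi8(Eq) == 0xFFFF;
}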
45168
45169/// Optimize an EFLAGS definition used according to the condition code \p CC
45170/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
45171/// uses of chain values.
45172static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
45173 SelectionDAG &DAG,
45174 const X86Subtarget &Subtarget) {
45175 if (CC == X86::COND_B)
45176 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
45177 return Flags;
45178
45179 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
45180 return R;
45181
45182 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
45183 return R;
45184
45185 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
45186 return R;
45187
45188 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
45189}
45190
45191/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
45192static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
45193 TargetLowering::DAGCombinerInfo &DCI,
45194 const X86Subtarget &Subtarget) {
45195 SDLoc DL(N);
45196
45197 SDValue FalseOp = N->getOperand(0);
45198 SDValue TrueOp = N->getOperand(1);
45199 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
45200 SDValue Cond = N->getOperand(3);
45201
45202 // cmov X, X, ?, ? --> X
45203 if (TrueOp == FalseOp)
45204 return TrueOp;
45205
45206 // Try to simplify the EFLAGS and condition code operands.
45207 // We can't always do this as FCMOV only supports a subset of X86 cond.
45208 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
45209 if (!(FalseOp.getValueType() == MVT::f80 ||
45210 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
45211 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
45212 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
45213 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
45214 Flags};
45215 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45216 }
45217 }
45218
45219 // If this is a select between two integer constants, try to do some
45220 // optimizations. Note that the operands are ordered the opposite of SELECT
45221 // operands.
45222 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
45223 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
45224 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
45225 // larger than FalseC (the false value).
45226 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
45227 CC = X86::GetOppositeBranchCondition(CC);
45228 std::swap(TrueC, FalseC);
45229 std::swap(TrueOp, FalseOp);
45230 }
45231
45232 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
45233 // This is efficient for any integer data type (including i8/i16) and
45234 // shift amount.
45235 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
45236 Cond = getSETCC(CC, Cond, DL, DAG);
45237
45238 // Zero extend the condition if needed.
45239 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
45240
45241 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
45242 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
45243 DAG.getConstant(ShAmt, DL, MVT::i8));
45244 return Cond;
45245 }
45246
45247      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
45248 // for any integer data type, including i8/i16.
45249 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
45250 Cond = getSETCC(CC, Cond, DL, DAG);
45251
45252 // Zero extend the condition if needed.
45253 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
45254 FalseC->getValueType(0), Cond);
45255 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45256 SDValue(FalseC, 0));
45257 return Cond;
45258 }
45259
45260 // Optimize cases that will turn into an LEA instruction. This requires
45261 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
45262 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
45263 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
45264        assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
45265               "Implicit constant truncation");
45266
45267 bool isFastMultiplier = false;
45268 if (Diff.ult(10)) {
45269 switch (Diff.getZExtValue()) {
45270 default: break;
45271 case 1: // result = add base, cond
45272 case 2: // result = lea base( , cond*2)
45273 case 3: // result = lea base(cond, cond*2)
45274 case 4: // result = lea base( , cond*4)
45275 case 5: // result = lea base(cond, cond*4)
45276 case 8: // result = lea base( , cond*8)
45277 case 9: // result = lea base(cond, cond*8)
45278 isFastMultiplier = true;
45279 break;
45280 }
45281 }
45282
45283 if (isFastMultiplier) {
45284          Cond = getSETCC(CC, Cond, DL, DAG);
45285 // Zero extend the condition if needed.
45286 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
45287 Cond);
45288 // Scale the condition by the difference.
45289 if (Diff != 1)
45290 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
45291 DAG.getConstant(Diff, DL, Cond.getValueType()));
45292
45293 // Add the base if non-zero.
45294 if (FalseC->getAPIntValue() != 0)
45295 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45296 SDValue(FalseC, 0));
45297 return Cond;
45298 }
45299 }
45300 }
45301 }
45302
45303 // Handle these cases:
45304  // (select (x != c), e, c) -> (select (x != c), e, x),
45305  // (select (x == c), c, e) -> (select (x == c), x, e)
45306 // where the c is an integer constant, and the "select" is the combination
45307 // of CMOV and CMP.
45308 //
45309  // The rationale for this change is that a conditional-move from a constant
45310  // needs two instructions, whereas a conditional-move from a register needs
45311  // only one instruction.
45312 //
45313 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
45314 // some instruction-combining opportunities. This opt needs to be
45315 // postponed as late as possible.
45316 //
45317 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
45318 // the DCI.xxxx conditions are provided to postpone the optimization as
45319 // late as possible.
45320
45321 ConstantSDNode *CmpAgainst = nullptr;
45322 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
45323 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
45324 !isa<ConstantSDNode>(Cond.getOperand(0))) {
45325
45326 if (CC == X86::COND_NE &&
45327 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
45328 CC = X86::GetOppositeBranchCondition(CC);
45329 std::swap(TrueOp, FalseOp);
45330 }
45331
45332 if (CC == X86::COND_E &&
45333 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
45334 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
45335 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
45336 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45337 }
45338 }
45339 }
45340
45341 // Fold and/or of setcc's to double CMOV:
45342 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
45343 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
45344 //
45345 // This combine lets us generate:
45346 // cmovcc1 (jcc1 if we don't have CMOV)
45347 // cmovcc2 (same)
45348 // instead of:
45349 // setcc1
45350 // setcc2
45351 // and/or
45352 // cmovne (jne if we don't have CMOV)
45353 // When we can't use the CMOV instruction, it might increase branch
45354 // mispredicts.
45355 // When we can use CMOV, or when there is no mispredict, this improves
45356 // throughput and reduces register pressure.
45357 //
45358 if (CC == X86::COND_NE) {
45359 SDValue Flags;
45360 X86::CondCode CC0, CC1;
45361 bool isAndSetCC;
45362 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
45363 if (isAndSetCC) {
45364 std::swap(FalseOp, TrueOp);
45365 CC0 = X86::GetOppositeBranchCondition(CC0);
45366 CC1 = X86::GetOppositeBranchCondition(CC1);
45367 }
45368
45369 SDValue LOps[] = {FalseOp, TrueOp,
45370 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
45371 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
45372 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
45373 Flags};
45374 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45375 return CMOV;
45376 }
45377 }
45378
45379 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
45380 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
45381 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
45382 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
45383 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
45384 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
45385 SDValue Add = TrueOp;
45386 SDValue Const = FalseOp;
45387 // Canonicalize the condition code for easier matching and output.
45388 if (CC == X86::COND_E)
45389 std::swap(Add, Const);
45390
45391 // We might have replaced the constant in the cmov with the LHS of the
45392 // compare. If so change it to the RHS of the compare.
45393 if (Const == Cond.getOperand(0))
45394 Const = Cond.getOperand(1);
45395
45396 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
45397 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
45398 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
45399 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
45400 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
45401 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
45402 EVT VT = N->getValueType(0);
45403 // This should constant fold.
45404 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
45405 SDValue CMov =
45406 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
45407 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
45408 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
45409 }
45410 }
45411
45412 return SDValue();
45413}
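// Illustrative source patterns (editor's sketch, not from this file): selects
// between integer constants that the CMOV combine above rewrites. The exact
// setcc+shl / setcc+lea selection is an assumption and varies by subtarget.
static unsigned selectPow2OrZero(unsigned X) {
  // C ? 8 : 0 --> zext(setcc(C)) << 3 (the pow2/0 case).
  return X > 4 ? 8u : 0u;
}

static unsigned selectLeaFriendly(unsigned X) {
  // Constants differing by 5, an LEA-friendly multiplier: expected to become
  // setcc, scale by the difference, then add the base, instead of loading two
  // immediates and selecting with cmov.
  return X > 4 ? 12u : 7u;
}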
45414
45415/// Different mul shrinking modes.
45416enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
45417
45418static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
45419 EVT VT = N->getOperand(0).getValueType();
45420 if (VT.getScalarSizeInBits() != 32)
45421 return false;
45422
45423  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
45424 unsigned SignBits[2] = {1, 1};
45425 bool IsPositive[2] = {false, false};
45426 for (unsigned i = 0; i < 2; i++) {
45427 SDValue Opd = N->getOperand(i);
45428
45429 SignBits[i] = DAG.ComputeNumSignBits(Opd);
45430 IsPositive[i] = DAG.SignBitIsZero(Opd);
45431 }
45432
45433 bool AllPositive = IsPositive[0] && IsPositive[1];
45434 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
45435 // When ranges are from -128 ~ 127, use MULS8 mode.
45436 if (MinSignBits >= 25)
45437 Mode = ShrinkMode::MULS8;
45438 // When ranges are from 0 ~ 255, use MULU8 mode.
45439 else if (AllPositive && MinSignBits >= 24)
45440 Mode = ShrinkMode::MULU8;
45441 // When ranges are from -32768 ~ 32767, use MULS16 mode.
45442 else if (MinSignBits >= 17)
45443 Mode = ShrinkMode::MULS16;
45444 // When ranges are from 0 ~ 65535, use MULU16 mode.
45445 else if (AllPositive && MinSignBits >= 16)
45446 Mode = ShrinkMode::MULU16;
45447 else
45448 return false;
45449 return true;
45450}
45451
45452/// When the operands of vector mul are extended from smaller size values,
45453/// like i8 and i16, the type of mul may be shrunk to generate more
45454/// efficient code. Two typical patterns are handled:
45455/// Pattern1:
45456/// %2 = sext/zext <N x i8> %1 to <N x i32>
45457/// %4 = sext/zext <N x i8> %3 to <N x i32>
45458/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
45459/// %5 = mul <N x i32> %2, %4
45460///
45461/// Pattern2:
45462/// %2 = zext/sext <N x i16> %1 to <N x i32>
45463/// %4 = zext/sext <N x i16> %3 to <N x i32>
45464/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
45465/// %5 = mul <N x i32> %2, %4
45466///
45467/// There are four mul shrinking modes:
45468/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
45469/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
45470/// generate pmullw+sext32 for it (MULS8 mode).
45471/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
45472/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
45473/// generate pmullw+zext32 for it (MULU8 mode).
45474/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
45475/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
45476/// generate pmullw+pmulhw for it (MULS16 mode).
45477/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
45478/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
45479/// generate pmullw+pmulhuw for it (MULU16 mode).
45480static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
45481 const X86Subtarget &Subtarget) {
45482 // Check for legality
45483 // pmullw/pmulhw are not supported by SSE.
45484 if (!Subtarget.hasSSE2())
45485 return SDValue();
45486
45487 // Check for profitability
45488 // pmulld is supported since SSE41. It is better to use pmulld
45489 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
45490 // the expansion.
45491 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
45492 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
45493 return SDValue();
45494
45495 ShrinkMode Mode;
45496 if (!canReduceVMulWidth(N, DAG, Mode))
45497 return SDValue();
45498
45499 SDLoc DL(N);
45500 SDValue N0 = N->getOperand(0);
45501 SDValue N1 = N->getOperand(1);
45502 EVT VT = N->getOperand(0).getValueType();
45503 unsigned NumElts = VT.getVectorNumElements();
45504 if ((NumElts % 2) != 0)
45505 return SDValue();
45506
45507 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
45508
45509 // Shrink the operands of mul.
45510 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
45511 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
45512
45513 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
45514 // lower part is needed.
45515 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
45516 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
45517 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
45518 : ISD::SIGN_EXTEND,
45519 DL, VT, MulLo);
45520
45521 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
45522 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
45523 // the higher part is also needed.
45524 SDValue MulHi =
45525 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
45526 ReducedVT, NewN0, NewN1);
45527
45528 // Repack the lower part and higher part result of mul into a wider
45529 // result.
45530 // Generate shuffle functioning as punpcklwd.
45531 SmallVector<int, 16> ShuffleMask(NumElts);
45532 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45533 ShuffleMask[2 * i] = i;
45534 ShuffleMask[2 * i + 1] = i + NumElts;
45535 }
45536 SDValue ResLo =
45537 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45538 ResLo = DAG.getBitcast(ResVT, ResLo);
45539 // Generate shuffle functioning as punpckhwd.
45540 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45541 ShuffleMask[2 * i] = i + NumElts / 2;
45542 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
45543 }
45544 SDValue ResHi =
45545 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45546 ResHi = DAG.getBitcast(ResVT, ResHi);
45547 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
45548}
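// Illustrative source pattern (editor's sketch, not from this file): a loop
// whose i32 multiply operands are zero-extended bytes, so the vectorized
// multiply can shrink to 16-bit pmullw (MULU8 mode) as described above.
// Whether the loop vectorizer produces exactly this shape is an assumption.
#include <cstdint>

static void mulBytesWiden(const uint8_t *A, const uint8_t *B, int32_t *Out,
                          int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = int32_t(A[I]) * int32_t(B[I]); // both operands fit in 8 bits
}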
45549
45550static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
45551 EVT VT, const SDLoc &DL) {
45552
45553 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
45554 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45555 DAG.getConstant(Mult, DL, VT));
45556 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
45557 DAG.getConstant(Shift, DL, MVT::i8));
45558 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45559 N->getOperand(0));
45560 return Result;
45561 };
45562
45563 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
45564 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45565 DAG.getConstant(Mul1, DL, VT));
45566 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
45567 DAG.getConstant(Mul2, DL, VT));
45568 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45569 N->getOperand(0));
45570 return Result;
45571 };
45572
45573 switch (MulAmt) {
45574 default:
45575 break;
45576 case 11:
45577 // mul x, 11 => add ((shl (mul x, 5), 1), x)
45578 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
45579 case 21:
45580 // mul x, 21 => add ((shl (mul x, 5), 2), x)
45581 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
45582 case 41:
45583 // mul x, 41 => add ((shl (mul x, 5), 3), x)
45584 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
45585 case 22:
45586 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
45587 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45588 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
45589 case 19:
45590 // mul x, 19 => add ((shl (mul x, 9), 1), x)
45591 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
45592 case 37:
45593 // mul x, 37 => add ((shl (mul x, 9), 2), x)
45594 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
45595 case 73:
45596 // mul x, 73 => add ((shl (mul x, 9), 3), x)
45597 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
45598 case 13:
45599 // mul x, 13 => add ((shl (mul x, 3), 2), x)
45600 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
45601 case 23:
45602 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
45603 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
45604 case 26:
45605 // mul x, 26 => add ((mul (mul x, 5), 5), x)
45606 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
45607 case 28:
45608 // mul x, 28 => add ((mul (mul x, 9), 3), x)
45609 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
45610 case 29:
45611 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
45612 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45613 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
45614 }
45615
45616  // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
45617  // followed by a single LEA.
45618  // First check if this is a sum of two powers of 2, because that's easy. Then
45619  // count how many trailing zeros precede the first set bit.
45620 // TODO: We can do this even without LEA at a cost of two shifts and an add.
45621 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
45622 unsigned ScaleShift = countTrailingZeros(MulAmt);
45623 if (ScaleShift >= 1 && ScaleShift < 4) {
45624 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
45625 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45626 DAG.getConstant(ShiftAmt, DL, MVT::i8));
45627 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45628 DAG.getConstant(ScaleShift, DL, MVT::i8));
45629 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
45630 }
45631 }
45632
45633 return SDValue();
45634}
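// Illustrative source pattern (editor's sketch, not from this file): one of
// the constant multiplies decomposed above. The lea/shl/add sequence instead
// of imul is an assumption that depends on the subtarget's LEA cost.
static unsigned long mulBy11(unsigned long X) {
  // mul x, 11 => add ((shl (mul x, 5), 1), x), per the table above.
  return X * 11;
}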
45635
45636// If the upper 17 bits of one element are zero and the upper 17 bits of the
45637// other element are zero/sign bits, then we can use PMADDWD, which is always
45638// at least as quick as PMULLD, except on KNL.
45639static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
45640 const X86Subtarget &Subtarget) {
45641 if (!Subtarget.hasSSE2())
45642 return SDValue();
45643
45644 if (Subtarget.isPMADDWDSlow())
45645 return SDValue();
45646
45647 EVT VT = N->getValueType(0);
45648
45649 // Only support vXi32 vectors.
45650 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
45651 return SDValue();
45652
45653 // Make sure the type is legal or can split/widen to a legal type.
45654 // With AVX512 but without BWI, we would need to split v32i16.
45655 unsigned NumElts = VT.getVectorNumElements();
45656 if (NumElts == 1 || !isPowerOf2_32(NumElts))
45657 return SDValue();
45658
45659 EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);
45660
45661 // With AVX512 but without BWI, we would need to split v32i16.
45662 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
45663 return SDValue();
45664
45665 SDValue N0 = N->getOperand(0);
45666 SDValue N1 = N->getOperand(1);
45667
45668  // If we are zero/sign extending two steps without SSE4.1, it's better to
45669 // reduce the vmul width instead.
45670 if (!Subtarget.hasSSE41() &&
45671 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
45672 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45673 (N1.getOpcode() == ISD::ZERO_EXTEND &&
45674 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
45675 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
45676 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45677 (N1.getOpcode() == ISD::SIGN_EXTEND &&
45678 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
45679 return SDValue();
45680
45681  // If we are sign extending a wide vector without SSE4.1, it's better to reduce
45682 // the vmul width instead.
45683 if (!Subtarget.hasSSE41() &&
45684 (N0.getOpcode() == ISD::SIGN_EXTEND &&
45685 N0.getOperand(0).getValueSizeInBits() > 128) &&
45686 (N1.getOpcode() == ISD::SIGN_EXTEND &&
45687 N1.getOperand(0).getValueSizeInBits() > 128))
45688 return SDValue();
45689
45690 // Sign bits must extend down to the lowest i16.
45691 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
45692 DAG.ComputeMaxSignificantBits(N0) > 16)
45693 return SDValue();
45694
45695 // At least one of the elements must be zero in the upper 17 bits, or can be
45696 // safely made zero without altering the final result.
45697 auto GetZeroableOp = [&](SDValue Op) {
45698 APInt Mask17 = APInt::getHighBitsSet(32, 17);
45699 if (DAG.MaskedValueIsZero(Op, Mask17))
45700 return Op;
45701 // Mask off upper 16-bits of sign-extended constants.
45702 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
45703 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
45704 DAG.getConstant(0xFFFF, SDLoc(N), VT));
45705 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
45706 SDValue Src = Op.getOperand(0);
45707 // Convert sext(vXi16) to zext(vXi16).
45708 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
45709 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45710 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
45711 // which will expand the extension.
45712 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
45713 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
45714 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
45715 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45716 }
45717 }
45718    // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
45719 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
45720 N->isOnlyUserOf(Op.getNode())) {
45721 SDValue Src = Op.getOperand(0);
45722 if (Src.getScalarValueSizeInBits() == 16)
45723 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
45724 }
45725 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
45726 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
45727 N->isOnlyUserOf(Op.getNode())) {
45728 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
45729 Op.getOperand(1));
45730 }
45731 return SDValue();
45732 };
45733 SDValue ZeroN0 = GetZeroableOp(N0);
45734 SDValue ZeroN1 = GetZeroableOp(N1);
45735 if (!ZeroN0 && !ZeroN1)
45736 return SDValue();
45737 N0 = ZeroN0 ? ZeroN0 : N0;
45738 N1 = ZeroN1 ? ZeroN1 : N1;
45739
45740 // Use SplitOpsAndApply to handle AVX splitting.
45741 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45742 ArrayRef<SDValue> Ops) {
45743 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45744 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
45745 };
45746 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
45747 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
45748 PMADDWDBuilder);
45749}
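// Illustrative source pattern (editor's sketch, not from this file): an i32
// multiply of sign-extended i16 inputs, the shape the PMADDWD combine above
// looks for. Vectorization of the loop and selection of vpmaddwd are
// assumptions about typical -O2 codegen.
#include <cstdint>

static void mulShorts(const int16_t *A, const int16_t *B, int32_t *Out,
                      int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = int32_t(A[I]) * int32_t(B[I]); // upper 17 bits are sign bits
}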
45750
45751static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
45752 const X86Subtarget &Subtarget) {
45753 if (!Subtarget.hasSSE2())
45754 return SDValue();
45755
45756 EVT VT = N->getValueType(0);
45757
45758 // Only support vXi64 vectors.
45759 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
45760 VT.getVectorNumElements() < 2 ||
45761 !isPowerOf2_32(VT.getVectorNumElements()))
45762 return SDValue();
45763
45764 SDValue N0 = N->getOperand(0);
45765 SDValue N1 = N->getOperand(1);
45766
45767  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
45768 // 32-bits. We can lower with this if the sign bits stretch that far.
45769 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
45770 DAG.ComputeNumSignBits(N1) > 32) {
45771 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45772 ArrayRef<SDValue> Ops) {
45773 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
45774 };
45775 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45776 PMULDQBuilder, /*CheckBWI*/false);
45777 }
45778
45779 // If the upper bits are zero we can use a single pmuludq.
45780 APInt Mask = APInt::getHighBitsSet(64, 32);
45781 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
45782 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45783 ArrayRef<SDValue> Ops) {
45784 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
45785 };
45786 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45787 PMULUDQBuilder, /*CheckBWI*/false);
45788 }
45789
45790 return SDValue();
45791}
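// Illustrative source pattern (editor's sketch, not from this file): an i64
// multiply whose operands have zero upper halves, matching the
// MaskedValueIsZero path above that lowers to a single pmuludq per vector.
// Vectorization of the loop is an assumption.
#include <cstdint>

static void mulU32ProducingU64(const uint32_t *A, const uint32_t *B,
                               uint64_t *Out, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = uint64_t(A[I]) * uint64_t(B[I]);
}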
45792
45793static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
45794 TargetLowering::DAGCombinerInfo &DCI,
45795 const X86Subtarget &Subtarget) {
45796 EVT VT = N->getValueType(0);
45797
45798 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
45799 return V;
45800
45801 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
45802 return V;
45803
45804 if (DCI.isBeforeLegalize() && VT.isVector())
45805 return reduceVMULWidth(N, DAG, Subtarget);
45806
45807 // Optimize a single multiply with constant into two operations in order to
45808 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
45809 if (!MulConstantOptimization)
45810 return SDValue();
45811
45812 // An imul is usually smaller than the alternative sequence.
45813 if (DAG.getMachineFunction().getFunction().hasMinSize())
45814 return SDValue();
45815
45816 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
45817 return SDValue();
45818
45819 if (VT != MVT::i64 && VT != MVT::i32)
45820 return SDValue();
45821
45822 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
45823 if (!C)
45824 return SDValue();
45825 if (isPowerOf2_64(C->getZExtValue()))
45826 return SDValue();
45827
45828 int64_t SignMulAmt = C->getSExtValue();
45829  assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
45830 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
45831
45832 SDLoc DL(N);
45833 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
45834 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45835 DAG.getConstant(AbsMulAmt, DL, VT));
45836 if (SignMulAmt < 0)
45837 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45838 NewMul);
45839
45840 return NewMul;
45841 }
45842
45843 uint64_t MulAmt1 = 0;
45844 uint64_t MulAmt2 = 0;
45845 if ((AbsMulAmt % 9) == 0) {
45846 MulAmt1 = 9;
45847 MulAmt2 = AbsMulAmt / 9;
45848 } else if ((AbsMulAmt % 5) == 0) {
45849 MulAmt1 = 5;
45850 MulAmt2 = AbsMulAmt / 5;
45851 } else if ((AbsMulAmt % 3) == 0) {
45852 MulAmt1 = 3;
45853 MulAmt2 = AbsMulAmt / 3;
45854 }
45855
45856 SDValue NewMul;
45857 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
45858 if (MulAmt2 &&
45859 (isPowerOf2_64(MulAmt2) ||
45860 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
45861
45862 if (isPowerOf2_64(MulAmt2) &&
45863 !(SignMulAmt >= 0 && N->hasOneUse() &&
45864 N->use_begin()->getOpcode() == ISD::ADD))
45865      // If the second multiplier is pow2, issue it first. We want the multiply by
45866 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
45867 // is an add. Only do this for positive multiply amounts since the
45868 // negate would prevent it from being used as an address mode anyway.
45869 std::swap(MulAmt1, MulAmt2);
45870
45871 if (isPowerOf2_64(MulAmt1))
45872 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45873 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
45874 else
45875 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45876 DAG.getConstant(MulAmt1, DL, VT));
45877
45878 if (isPowerOf2_64(MulAmt2))
45879 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
45880 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
45881 else
45882 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
45883 DAG.getConstant(MulAmt2, DL, VT));
45884
45885 // Negate the result.
45886 if (SignMulAmt < 0)
45887 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45888 NewMul);
45889 } else if (!Subtarget.slowLEA())
45890 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
45891
45892 if (!NewMul) {
45893    assert(C->getZExtValue() != 0 &&
45894           C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
45895           "Both cases that could cause potential overflows should have "
45896           "already been handled.");
45897 if (isPowerOf2_64(AbsMulAmt - 1)) {
45898 // (mul x, 2^N + 1) => (add (shl x, N), x)
45899 NewMul = DAG.getNode(
45900 ISD::ADD, DL, VT, N->getOperand(0),
45901 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45902 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
45903 MVT::i8)));
45904 // To negate, subtract the number from zero
45905 if (SignMulAmt < 0)
45906 NewMul = DAG.getNode(ISD::SUB, DL, VT,
45907 DAG.getConstant(0, DL, VT), NewMul);
45908 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
45909 // (mul x, 2^N - 1) => (sub (shl x, N), x)
45910 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45911 DAG.getConstant(Log2_64(AbsMulAmt + 1),
45912 DL, MVT::i8));
45913 // To negate, reverse the operands of the subtract.
45914 if (SignMulAmt < 0)
45915 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
45916 else
45917 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45918 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
45919 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
45920 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45921 DAG.getConstant(Log2_64(AbsMulAmt - 2),
45922 DL, MVT::i8));
45923 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45924 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45925 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
45926 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
45927 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45928 DAG.getConstant(Log2_64(AbsMulAmt + 2),
45929 DL, MVT::i8));
45930 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45931 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45932 }
45933 }
45934
45935 return NewMul;
45936}
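The decompositions above rest on simple scalar identities. A minimal standalone sketch in plain C++ follows; the helper names are invented for illustration and each one computes the same product as a single imul, using only the shift/add/LEA-style forms the combine emits.

#include <cassert>
#include <cstdint>

// mul x, 9   ->  X86ISD::MUL_IMM by 9, i.e. the LEA form x + 8*x.
static uint64_t mulBy9(uint64_t X) { return X + (X << 3); }

// mul x, 45  ->  MulAmt1 = 9, MulAmt2 = 5: two LEA-style steps.
static uint64_t mulBy45(uint64_t X) { return mulBy9(X) + (mulBy9(X) << 2); }

// mul x, 17  ->  2^N + 1 form: (add (shl x, 4), x).
static uint64_t mulBy17(uint64_t X) { return (X << 4) + X; }

// mul x, 30  ->  2^N - 2 form: (sub (sub (shl x, 5), x), x).
static uint64_t mulBy30(uint64_t X) { return ((X << 5) - X) - X; }

// mul x, -9  ->  negative amounts are handled by subtracting the result from zero.
static uint64_t mulByMinus9(uint64_t X) { return uint64_t(0) - mulBy9(X); }

int main() {
  const uint64_t Samples[] = {0, 1, 7, 123456789};
  for (uint64_t X : Samples) {
    assert(mulBy9(X) == X * 9);
    assert(mulBy45(X) == X * 45);
    assert(mulBy17(X) == X * 17);
    assert(mulBy30(X) == X * 30);
    assert(mulByMinus9(X) == X * uint64_t(-9));
  }
  return 0;
}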
45937
45938// Try to form a MULHU or MULHS node by looking for
45939// (srl (mul ext, ext), 16)
45940// TODO: This is X86 specific because we want to be able to handle wide types
45941// before type legalization. But we can only do it if the vector will be
45942// legalized via widening/splitting. Type legalization can't handle promotion
45943// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
45944// combiner.
45945static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
45946 const X86Subtarget &Subtarget) {
45947  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
45948         "SRL or SRA node is required here!");
45949 SDLoc DL(N);
45950
45951 if (!Subtarget.hasSSE2())
45952 return SDValue();
45953
45954 // The operation feeding into the shift must be a multiply.
45955 SDValue ShiftOperand = N->getOperand(0);
45956 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
45957 return SDValue();
45958
45959 // Input type should be at least vXi32.
45960 EVT VT = N->getValueType(0);
45961 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
45962 return SDValue();
45963
45964 // Need a shift by 16.
45965 APInt ShiftAmt;
45966 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
45967 ShiftAmt != 16)
45968 return SDValue();
45969
45970 SDValue LHS = ShiftOperand.getOperand(0);
45971 SDValue RHS = ShiftOperand.getOperand(1);
45972
45973 unsigned ExtOpc = LHS.getOpcode();
45974 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
45975 RHS.getOpcode() != ExtOpc)
45976 return SDValue();
45977
45978 // Peek through the extends.
45979 LHS = LHS.getOperand(0);
45980 RHS = RHS.getOperand(0);
45981
45982 // Ensure the input types match.
45983 EVT MulVT = LHS.getValueType();
45984 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
45985 return SDValue();
45986
45987 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
45988 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
45989
45990 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
45991 return DAG.getNode(ExtOpc, DL, VT, Mulh);
45992}
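In scalar terms, the pattern this routine matches is exactly "high half of a widening multiply". A small self-contained sketch of one unsigned lane follows (the sign-extended case maps to MULHS the same way); the names are illustrative only.

#include <cassert>
#include <cstdint>

// One lane of the matched pattern: zero-extend two i16 values, multiply,
// shift right by 16 -- that is precisely the high 16 bits of the product (MULHU).
static uint16_t mulhuLane(uint16_t A, uint16_t B) {
  uint32_t Wide = uint32_t(A) * uint32_t(B); // (mul (zext A), (zext B))
  return uint16_t(Wide >> 16);               // (srl ..., 16)
}

int main() {
  assert(mulhuLane(0xFFFF, 0xFFFF) == 0xFFFE); // 0xFFFF * 0xFFFF = 0xFFFE0001
  assert(mulhuLane(0x1234, 0x0002) == 0x0000);
  return 0;
}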
45993
45994static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
45995 SDValue N0 = N->getOperand(0);
45996 SDValue N1 = N->getOperand(1);
45997 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
45998 EVT VT = N0.getValueType();
45999
46000 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
46001  // since the result of setcc_c is all zeros or all ones.
46002 if (VT.isInteger() && !VT.isVector() &&
46003 N1C && N0.getOpcode() == ISD::AND &&
46004 N0.getOperand(1).getOpcode() == ISD::Constant) {
46005 SDValue N00 = N0.getOperand(0);
46006 APInt Mask = N0.getConstantOperandAPInt(1);
46007 Mask <<= N1C->getAPIntValue();
46008 bool MaskOK = false;
46009    // We can handle cases concerning bit-widening nodes containing setcc_c if
46010    // we carefully interrogate the mask to make sure the transform is
46011    // semantics-preserving.
46012 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
46013 // of the underlying setcc_c operation if the setcc_c was zero extended.
46014 // Consider the following example:
46015 // zext(setcc_c) -> i32 0x0000FFFF
46016 // c1 -> i32 0x0000FFFF
46017 // c2 -> i32 0x00000001
46018 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
46019 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
46020 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
46021 MaskOK = true;
46022 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
46023 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46024 MaskOK = true;
46025 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
46026 N00.getOpcode() == ISD::ANY_EXTEND) &&
46027 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46028 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
46029 }
46030 if (MaskOK && Mask != 0) {
46031 SDLoc DL(N);
46032 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
46033 }
46034 }
46035
46036 // Hardware support for vector shifts is sparse which makes us scalarize the
46037 // vector operations in many cases. Also, on sandybridge ADD is faster than
46038 // shl.
46039 // (shl V, 1) -> add V,V
46040 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
46041 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
46042       assert(N0.getValueType().isVector() && "Invalid vector shift type");
46043 // We shift all of the values by one. In many cases we do not have
46044 // hardware support for this operation. This is better expressed as an ADD
46045 // of two values.
46046 if (N1SplatC->isOne())
46047 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
46048 }
46049
46050 return SDValue();
46051}
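The splat-by-one case reduces to a plain self-add per lane; a one-line sketch of the identity (illustrative, not from the file):

#include <cassert>
#include <cstdint>

// (shl V, 1) -> (add V, V): same value, but a vector ADD is widely supported
// and cheap, whereas the per-lane shift may otherwise have to be scalarized.
static uint32_t shlByOneLane(uint32_t V) { return V + V; }

int main() {
  assert(shlByOneLane(0x80000001u) == uint32_t(0x80000001u << 1));
  return 0;
}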
46052
46053static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
46054 const X86Subtarget &Subtarget) {
46055 SDValue N0 = N->getOperand(0);
46056 SDValue N1 = N->getOperand(1);
46057 EVT VT = N0.getValueType();
46058 unsigned Size = VT.getSizeInBits();
46059
46060 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46061 return V;
46062
46063 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
46064 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
46065 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
46066 // depending on sign of (SarConst - [56,48,32,24,16])
46067
46068  // sexts on X86 are MOVs. The MOVs have the same code size
46069  // as the SHIFTs above (only a SHIFT by 1 has a smaller encoding).
46070  // However, the MOVs have two advantages over a SHIFT:
46071  // 1. MOVs can write to a register that differs from the source.
46072  // 2. MOVs accept memory operands.
46073
46074 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
46075 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
46076 N0.getOperand(1).getOpcode() != ISD::Constant)
46077 return SDValue();
46078
46079 SDValue N00 = N0.getOperand(0);
46080 SDValue N01 = N0.getOperand(1);
46081 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
46082 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
46083 EVT CVT = N1.getValueType();
46084
46085 if (SarConst.isNegative())
46086 return SDValue();
46087
46088 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
46089 unsigned ShiftSize = SVT.getSizeInBits();
46090    // Skip types without a corresponding sext/zext and
46091    // ShlConst values that are not one of [56,48,32,24,16].
46092 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
46093 continue;
46094 SDLoc DL(N);
46095 SDValue NN =
46096 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
46097 SarConst = SarConst - (Size - ShiftSize);
46098 if (SarConst == 0)
46099 return NN;
46100 if (SarConst.isNegative())
46101 return DAG.getNode(ISD::SHL, DL, VT, NN,
46102 DAG.getConstant(-SarConst, DL, CVT));
46103 return DAG.getNode(ISD::SRA, DL, VT, NN,
46104 DAG.getConstant(SarConst, DL, CVT));
46105 }
46106 return SDValue();
46107}
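A scalar sketch of the i64 case for SarConst >= 56, assuming the usual two's-complement, arithmetic behaviour of >> on negative values (true on the x86 targets this code serves); the names are invented for the example.

#include <cassert>
#include <cstdint>

// sign_extend_inreg(X, i8): keep the low 8 bits of X and sign-extend them.
static int64_t signExtendInReg8(int64_t X) { return int64_t(int8_t(uint8_t(X))); }

// (sra (shl X, 56), 58): only the low 8 bits of X survive the shl, so the pair
// can be rewritten as (sra (sign_extend_inreg X, i8), 58 - 56) -- a MOVSX-style
// extend followed by a smaller shift.
static int64_t shlThenSra(int64_t X) { return int64_t(uint64_t(X) << 56) >> 58; }
static int64_t sextInRegThenSra(int64_t X) { return signExtendInReg8(X) >> 2; }

int main() {
  const int64_t Samples[] = {0x7F, -0x80, 0x1234567890ABCDEF};
  for (int64_t X : Samples)
    assert(shlThenSra(X) == sextInRegThenSra(X));
  return 0;
}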
46108
46109static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
46110 TargetLowering::DAGCombinerInfo &DCI,
46111 const X86Subtarget &Subtarget) {
46112 SDValue N0 = N->getOperand(0);
46113 SDValue N1 = N->getOperand(1);
46114 EVT VT = N0.getValueType();
46115
46116 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46117 return V;
46118
46119 // Only do this on the last DAG combine as it can interfere with other
46120 // combines.
46121 if (!DCI.isAfterLegalizeDAG())
46122 return SDValue();
46123
46124 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
46125 // TODO: This is a generic DAG combine that became an x86-only combine to
46126 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
46127 // and-not ('andn').
46128 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
46129 return SDValue();
46130
46131 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
46132 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
46133 if (!ShiftC || !AndC)
46134 return SDValue();
46135
46136  // If we can shrink the constant mask below 8 bits or 32 bits, then this
46137  // transform should reduce code size. It may also enable secondary transforms
46138 // from improved known-bits analysis or instruction selection.
46139 APInt MaskVal = AndC->getAPIntValue();
46140
46141 // If this can be matched by a zero extend, don't optimize.
46142 if (MaskVal.isMask()) {
46143 unsigned TO = MaskVal.countTrailingOnes();
46144 if (TO >= 8 && isPowerOf2_32(TO))
46145 return SDValue();
46146 }
46147
46148 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
46149 unsigned OldMaskSize = MaskVal.getMinSignedBits();
46150 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
46151 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
46152 (OldMaskSize > 32 && NewMaskSize <= 32)) {
46153 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
46154 SDLoc DL(N);
46155 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
46156 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
46157 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
46158 }
46159 return SDValue();
46160}
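A scalar sketch of the mask-shrinking idea (the constants are invented for the example): shifting first lets the AND use a mask that fits in fewer immediate bits.

#include <cassert>
#include <cstdint>

// srl (and X, 0xFF00), 8 --> and (srl X, 8), 0xFF: the mask shrinks from a
// 16-bit immediate to an 8-bit immediate, which is smaller to encode.
static uint32_t maskThenShift(uint32_t X) { return (X & 0xFF00u) >> 8; }
static uint32_t shiftThenMask(uint32_t X) { return (X >> 8) & 0xFFu; }

int main() {
  assert(maskThenShift(0x12345678u) == shiftThenMask(0x12345678u)); // both 0x56
  return 0;
}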
46161
46162static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
46163 const X86Subtarget &Subtarget) {
46164 unsigned Opcode = N->getOpcode();
46165  assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
46166
46167 SDLoc DL(N);
46168 EVT VT = N->getValueType(0);
46169 SDValue N0 = N->getOperand(0);
46170 SDValue N1 = N->getOperand(1);
46171 EVT SrcVT = N0.getValueType();
46172
46173 SDValue BC0 =
46174 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
46175 SDValue BC1 =
46176 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
46177
46178 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
46179  // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))). This is mainly for
46180 // truncation trees that help us avoid lane crossing shuffles.
46181 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
46182 // TODO: We don't handle vXf64 shuffles yet.
46183 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46184 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
46185 SmallVector<SDValue> ShuffleOps;
46186 SmallVector<int> ShuffleMask, ScaledMask;
46187 SDValue Vec = peekThroughBitcasts(BCSrc);
46188 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
46189 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
46190 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
46191 // shuffle to a v4X64 width - we can probably relax this in the future.
46192 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
46193 ShuffleOps[0].getValueType().is256BitVector() &&
46194 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
46195 SDValue Lo, Hi;
46196 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46197 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
46198 Lo = DAG.getBitcast(SrcVT, Lo);
46199 Hi = DAG.getBitcast(SrcVT, Hi);
46200 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
46201 Res = DAG.getBitcast(ShufVT, Res);
46202 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
46203 return DAG.getBitcast(VT, Res);
46204 }
46205 }
46206 }
46207 }
46208
46209 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
46210 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46211 // If either/both ops are a shuffle that can scale to v2x64,
46212 // then see if we can perform this as a v4x32 post shuffle.
46213 SmallVector<SDValue> Ops0, Ops1;
46214 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
46215 bool IsShuf0 =
46216 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46217 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46218 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46219 bool IsShuf1 =
46220 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46221 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
46222 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46223 if (IsShuf0 || IsShuf1) {
46224 if (!IsShuf0) {
46225 Ops0.assign({BC0});
46226 ScaledMask0.assign({0, 1});
46227 }
46228 if (!IsShuf1) {
46229 Ops1.assign({BC1});
46230 ScaledMask1.assign({0, 1});
46231 }
46232
46233 SDValue LHS, RHS;
46234 int PostShuffle[4] = {-1, -1, -1, -1};
46235 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
46236 if (M < 0)
46237 return true;
46238 Idx = M % 2;
46239 SDValue Src = Ops[M / 2];
46240 if (!LHS || LHS == Src) {
46241 LHS = Src;
46242 return true;
46243 }
46244 if (!RHS || RHS == Src) {
46245 Idx += 2;
46246 RHS = Src;
46247 return true;
46248 }
46249 return false;
46250 };
46251 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
46252 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
46253 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
46254 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
46255 LHS = DAG.getBitcast(SrcVT, LHS);
46256 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
46257 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46258 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
46259 Res = DAG.getBitcast(ShufVT, Res);
46260 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
46261 return DAG.getBitcast(VT, Res);
46262 }
46263 }
46264 }
46265
46266 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
46267 if (VT.is256BitVector() && Subtarget.hasInt256()) {
46268 SmallVector<int> Mask0, Mask1;
46269 SmallVector<SDValue> Ops0, Ops1;
46270 SmallVector<int, 2> ScaledMask0, ScaledMask1;
46271 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46272 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46273 !Ops0.empty() && !Ops1.empty() &&
46274 all_of(Ops0,
46275 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
46276 all_of(Ops1,
46277 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
46278 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46279 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
46280 SDValue Op00 = peekThroughBitcasts(Ops0.front());
46281 SDValue Op10 = peekThroughBitcasts(Ops1.front());
46282 SDValue Op01 = peekThroughBitcasts(Ops0.back());
46283 SDValue Op11 = peekThroughBitcasts(Ops1.back());
46284 if ((Op00 == Op11) && (Op01 == Op10)) {
46285 std::swap(Op10, Op11);
46286 ShuffleVectorSDNode::commuteMask(ScaledMask1);
46287 }
46288 if ((Op00 == Op10) && (Op01 == Op11)) {
46289 const int Map[4] = {0, 2, 1, 3};
46290 SmallVector<int, 4> ShuffleMask(
46291 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
46292 Map[ScaledMask1[1]]});
46293 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
46294 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
46295 DAG.getBitcast(SrcVT, Op01));
46296 Res = DAG.getBitcast(ShufVT, Res);
46297 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
46298 return DAG.getBitcast(VT, Res);
46299 }
46300 }
46301 }
46302
46303 return SDValue();
46304}
46305
46306static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
46307 TargetLowering::DAGCombinerInfo &DCI,
46308 const X86Subtarget &Subtarget) {
46309 unsigned Opcode = N->getOpcode();
46310  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
46311         "Unexpected pack opcode");
46312
46313 EVT VT = N->getValueType(0);
46314 SDValue N0 = N->getOperand(0);
46315 SDValue N1 = N->getOperand(1);
46316 unsigned NumDstElts = VT.getVectorNumElements();
46317 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
46318 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
46319  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
46320         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
46321         "Unexpected PACKSS/PACKUS input type");
46322
46323 bool IsSigned = (X86ISD::PACKSS == Opcode);
46324
46325 // Constant Folding.
46326 APInt UndefElts0, UndefElts1;
46327 SmallVector<APInt, 32> EltBits0, EltBits1;
46328 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
46329 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
46330 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
46331 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
46332 unsigned NumLanes = VT.getSizeInBits() / 128;
46333 unsigned NumSrcElts = NumDstElts / 2;
46334 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
46335 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
46336
46337 APInt Undefs(NumDstElts, 0);
46338 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
46339 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
46340 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
46341 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
46342 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
46343 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
46344
46345 if (UndefElts[SrcIdx]) {
46346 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
46347 continue;
46348 }
46349
46350 APInt &Val = EltBits[SrcIdx];
46351 if (IsSigned) {
46352 // PACKSS: Truncate signed value with signed saturation.
46353 // Source values less than dst minint are saturated to minint.
46354 // Source values greater than dst maxint are saturated to maxint.
46355 if (Val.isSignedIntN(DstBitsPerElt))
46356 Val = Val.trunc(DstBitsPerElt);
46357 else if (Val.isNegative())
46358 Val = APInt::getSignedMinValue(DstBitsPerElt);
46359 else
46360 Val = APInt::getSignedMaxValue(DstBitsPerElt);
46361 } else {
46362 // PACKUS: Truncate signed value with unsigned saturation.
46363 // Source values less than zero are saturated to zero.
46364 // Source values greater than dst maxuint are saturated to maxuint.
46365 if (Val.isIntN(DstBitsPerElt))
46366 Val = Val.trunc(DstBitsPerElt);
46367 else if (Val.isNegative())
46368 Val = APInt::getZero(DstBitsPerElt);
46369 else
46370 Val = APInt::getAllOnes(DstBitsPerElt);
46371 }
46372 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
46373 }
46374 }
46375
46376 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
46377 }
46378
46379 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
46380 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
46381 return V;
46382
46383 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
46384 // truncate to create a larger truncate.
46385 if (Subtarget.hasAVX512() &&
46386 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
46387 N0.getOperand(0).getValueType() == MVT::v8i32) {
46388 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
46389 (!IsSigned &&
46390 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
46391 if (Subtarget.hasVLX())
46392 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
46393
46394 // Widen input to v16i32 so we can truncate that.
46395 SDLoc dl(N);
46396 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
46397 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
46398 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
46399 }
46400 }
46401
46402 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
46403 if (VT.is128BitVector()) {
46404 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
46405 SDValue Src0, Src1;
46406 if (N0.getOpcode() == ExtOpc &&
46407 N0.getOperand(0).getValueType().is64BitVector() &&
46408 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
46409 Src0 = N0.getOperand(0);
46410 }
46411 if (N1.getOpcode() == ExtOpc &&
46412 N1.getOperand(0).getValueType().is64BitVector() &&
46413 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
46414 Src1 = N1.getOperand(0);
46415 }
46416 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
46417       assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
46418 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
46419 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
46420 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
46421 }
46422
46423 // Try again with pack(*_extend_vector_inreg, undef).
46424 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
46425 : ISD::ZERO_EXTEND_VECTOR_INREG;
46426 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
46427 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
46428 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
46429 DAG);
46430 }
46431
46432 // Attempt to combine as shuffle.
46433 SDValue Op(N, 0);
46434 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46435 return Res;
46436
46437 return SDValue();
46438}
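The constant folding above relies only on the per-element saturation rules of PACKSS/PACKUS. A standalone sketch of one i16 -> i8 lane (the helper names are illustrative):

#include <cassert>
#include <cstdint>

// PACKSS lane: truncate with signed saturation.
static int8_t packssLane(int16_t V) {
  if (V < INT8_MIN) return INT8_MIN;
  if (V > INT8_MAX) return INT8_MAX;
  return int8_t(V);
}

// PACKUS lane: truncate a signed source with unsigned saturation.
static uint8_t packusLane(int16_t V) {
  if (V < 0) return 0;
  if (V > int16_t(UINT8_MAX)) return UINT8_MAX;
  return uint8_t(V);
}

int main() {
  assert(packssLane(300) == 127 && packssLane(-300) == -128 && packssLane(-5) == -5);
  assert(packusLane(300) == 255 && packusLane(-300) == 0 && packusLane(100) == 100);
  return 0;
}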
46439
46440static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
46441 TargetLowering::DAGCombinerInfo &DCI,
46442 const X86Subtarget &Subtarget) {
46443  assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
46444          X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
46445         "Unexpected horizontal add/sub opcode");
46446
46447 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
46448 MVT VT = N->getSimpleValueType(0);
46449 SDValue LHS = N->getOperand(0);
46450 SDValue RHS = N->getOperand(1);
46451
46452    // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
46453 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
46454 LHS.getOpcode() == RHS.getOpcode() &&
46455 LHS.getValueType() == RHS.getValueType() &&
46456 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
46457 SDValue LHS0 = LHS.getOperand(0);
46458 SDValue LHS1 = LHS.getOperand(1);
46459 SDValue RHS0 = RHS.getOperand(0);
46460 SDValue RHS1 = RHS.getOperand(1);
46461 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
46462 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
46463 SDLoc DL(N);
46464 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
46465 LHS0.isUndef() ? LHS1 : LHS0,
46466 RHS0.isUndef() ? RHS1 : RHS0);
46467 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
46468 Res = DAG.getBitcast(ShufVT, Res);
46469 SDValue NewLHS =
46470 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
46471 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
46472 SDValue NewRHS =
46473 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
46474 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
46475 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
46476 DAG.getBitcast(VT, NewRHS));
46477 }
46478 }
46479 }
46480
46481 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
46482 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
46483 return V;
46484
46485 return SDValue();
46486}
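For reference, the per-lane semantics of the 128-bit horizontal add that this combine rearranges, sketched as plain C++ (the function name is invented for the example):

#include <cassert>

// HADDPS dst, a, b: adjacent pairs of the first operand fill the low half of
// the result, adjacent pairs of the second operand fill the high half.
static void haddps(const float A[4], const float B[4], float Out[4]) {
  Out[0] = A[0] + A[1];
  Out[1] = A[2] + A[3];
  Out[2] = B[0] + B[1];
  Out[3] = B[2] + B[3];
}

int main() {
  const float A[4] = {1, 2, 3, 4}, B[4] = {5, 6, 7, 8};
  float Out[4];
  haddps(A, B, Out);
  assert(Out[0] == 3 && Out[1] == 7 && Out[2] == 11 && Out[3] == 15);
  return 0;
}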
46487
46488static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
46489 TargetLowering::DAGCombinerInfo &DCI,
46490 const X86Subtarget &Subtarget) {
46491  assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
46492          X86ISD::VSRL == N->getOpcode()) &&
46493         "Unexpected shift opcode");
46494 EVT VT = N->getValueType(0);
46495 SDValue N0 = N->getOperand(0);
46496 SDValue N1 = N->getOperand(1);
46497
46498 // Shift zero -> zero.
46499 if (ISD::isBuildVectorAllZeros(N0.getNode()))
46500 return DAG.getConstant(0, SDLoc(N), VT);
46501
46502 // Detect constant shift amounts.
46503 APInt UndefElts;
46504 SmallVector<APInt, 32> EltBits;
46505 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
46506 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
46507 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
46508 EltBits[0].getZExtValue(), DAG);
46509 }
46510
46511 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46512 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
46513 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
46514 return SDValue(N, 0);
46515
46516 return SDValue();
46517}
46518
46519static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
46520 TargetLowering::DAGCombinerInfo &DCI,
46521 const X86Subtarget &Subtarget) {
46522 unsigned Opcode = N->getOpcode();
46523  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
46524          X86ISD::VSRLI == Opcode) &&
46525         "Unexpected shift opcode");
46526 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
46527 EVT VT = N->getValueType(0);
46528 SDValue N0 = N->getOperand(0);
46529 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46530  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
46531         "Unexpected value type");
46532  assert(N->getOperand(1).getValueType() == MVT::i8 &&
46533         "Unexpected shift amount type");
46534
46535 // (shift undef, X) -> 0
46536 if (N0.isUndef())
46537 return DAG.getConstant(0, SDLoc(N), VT);
46538
46539 // Out of range logical bit shifts are guaranteed to be zero.
46540 // Out of range arithmetic bit shifts splat the sign bit.
46541 unsigned ShiftVal = N->getConstantOperandVal(1);
46542 if (ShiftVal >= NumBitsPerElt) {
46543 if (LogicalShift)
46544 return DAG.getConstant(0, SDLoc(N), VT);
46545 ShiftVal = NumBitsPerElt - 1;
46546 }
46547
46548 // (shift X, 0) -> X
46549 if (!ShiftVal)
46550 return N0;
46551
46552 // (shift 0, C) -> 0
46553 if (ISD::isBuildVectorAllZeros(N0.getNode()))
46554 // N0 is all zeros or undef. We guarantee that the bits shifted into the
46555 // result are all zeros, not undef.
46556 return DAG.getConstant(0, SDLoc(N), VT);
46557
46558 // (VSRAI -1, C) -> -1
46559 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
46560 // N0 is all ones or undef. We guarantee that the bits shifted into the
46561 // result are all ones, not undef.
46562 return DAG.getConstant(-1, SDLoc(N), VT);
46563
46564 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
46565 if (Opcode == N0.getOpcode()) {
46566 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
46567 unsigned NewShiftVal = ShiftVal + ShiftVal2;
46568 if (NewShiftVal >= NumBitsPerElt) {
46569 // Out of range logical bit shifts are guaranteed to be zero.
46570 // Out of range arithmetic bit shifts splat the sign bit.
46571 if (LogicalShift)
46572 return DAG.getConstant(0, SDLoc(N), VT);
46573 NewShiftVal = NumBitsPerElt - 1;
46574 }
46575 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
46576 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
46577 }
46578
46579 // We can decode 'whole byte' logical bit shifts as shuffles.
46580 if (LogicalShift && (ShiftVal % 8) == 0) {
46581 SDValue Op(N, 0);
46582 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46583 return Res;
46584 }
46585
46586 // Constant Folding.
46587 APInt UndefElts;
46588 SmallVector<APInt, 32> EltBits;
46589 if (N->isOnlyUserOf(N0.getNode()) &&
46590 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
46591    assert(EltBits.size() == VT.getVectorNumElements() &&
46592           "Unexpected shift value type");
46593 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
46594 // created an undef input due to no input bits being demanded, but user
46595 // still expects 0 in other bits.
46596 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
46597 APInt &Elt = EltBits[i];
46598 if (UndefElts[i])
46599 Elt = 0;
46600 else if (X86ISD::VSHLI == Opcode)
46601 Elt <<= ShiftVal;
46602 else if (X86ISD::VSRAI == Opcode)
46603 Elt.ashrInPlace(ShiftVal);
46604 else
46605 Elt.lshrInPlace(ShiftVal);
46606 }
46607 // Reset undef elements since they were zeroed above.
46608 UndefElts = 0;
46609 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
46610 }
46611
46612 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46613 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
46614 DCI))
46615 return SDValue(N, 0);
46616
46617 return SDValue();
46618}
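A per-lane sketch of the shift-by-immediate rules used above: out-of-range logical shifts become zero, out-of-range arithmetic shifts clamp to width-1 (splatting the sign bit), and stacked shifts merge. It assumes arithmetic behaviour of >> on negative values, as on the targets in question; names are illustrative.

#include <cassert>
#include <cstdint>

static uint16_t vsrliLane(uint16_t V, unsigned Amt) {
  return Amt >= 16 ? uint16_t(0) : uint16_t(V >> Amt); // out of range -> zero
}

static int16_t vsraiLane(int16_t V, unsigned Amt) {
  if (Amt >= 16) Amt = 15;                             // out of range -> splat sign bit
  return int16_t(V >> Amt);
}

int main() {
  assert(vsrliLane(0x8000, 20) == 0);
  assert(vsraiLane(int16_t(-1), 20) == -1);
  assert(vsraiLane(int16_t(-32768), 15) == -1);
  // (shift (shift X, C2), C1) merges to a single shift by C1 + C2.
  assert(vsrliLane(vsrliLane(0xF000, 4), 4) == vsrliLane(0xF000, 8));
  return 0;
}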
46619
46620static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
46621 TargetLowering::DAGCombinerInfo &DCI,
46622 const X86Subtarget &Subtarget) {
46623 EVT VT = N->getValueType(0);
46624  assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
46625          (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
46626          N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
46627         "Unexpected vector insertion");
46628
46629 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
46630 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46631 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46632 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
46633 APInt::getAllOnes(NumBitsPerElt), DCI))
46634 return SDValue(N, 0);
46635 }
46636
46637 // Attempt to combine insertion patterns to a shuffle.
46638 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
46639 SDValue Op(N, 0);
46640 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46641 return Res;
46642 }
46643
46644 return SDValue();
46645}
46646
46647/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
46648/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
46649/// OR -> CMPNEQSS.
46650static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
46651 TargetLowering::DAGCombinerInfo &DCI,
46652 const X86Subtarget &Subtarget) {
46653 unsigned opcode;
46654
46655 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
46656 // we're requiring SSE2 for both.
46657 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
46658 SDValue N0 = N->getOperand(0);
46659 SDValue N1 = N->getOperand(1);
46660 SDValue CMP0 = N0.getOperand(1);
46661 SDValue CMP1 = N1.getOperand(1);
46662 SDLoc DL(N);
46663
46664 // The SETCCs should both refer to the same CMP.
46665 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
46666 return SDValue();
46667
46668 SDValue CMP00 = CMP0->getOperand(0);
46669 SDValue CMP01 = CMP0->getOperand(1);
46670 EVT VT = CMP00.getValueType();
46671
46672 if (VT == MVT::f32 || VT == MVT::f64 ||
46673 (VT == MVT::f16 && Subtarget.hasFP16())) {
46674 bool ExpectingFlags = false;
46675 // Check for any users that want flags:
46676 for (const SDNode *U : N->uses()) {
46677 if (ExpectingFlags)
46678 break;
46679
46680 switch (U->getOpcode()) {
46681 default:
46682 case ISD::BR_CC:
46683 case ISD::BRCOND:
46684 case ISD::SELECT:
46685 ExpectingFlags = true;
46686 break;
46687 case ISD::CopyToReg:
46688 case ISD::SIGN_EXTEND:
46689 case ISD::ZERO_EXTEND:
46690 case ISD::ANY_EXTEND:
46691 break;
46692 }
46693 }
46694
46695 if (!ExpectingFlags) {
46696 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
46697 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
46698
46699 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
46700 X86::CondCode tmp = cc0;
46701 cc0 = cc1;
46702 cc1 = tmp;
46703 }
46704
46705 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
46706 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
46707 // FIXME: need symbolic constants for these magic numbers.
46708 // See X86ATTInstPrinter.cpp:printSSECC().
46709 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
46710 if (Subtarget.hasAVX512()) {
46711 SDValue FSetCC =
46712 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
46713 DAG.getTargetConstant(x86cc, DL, MVT::i8));
46714 // Need to fill with zeros to ensure the bitcast will produce zeroes
46715 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
46716 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
46717 DAG.getConstant(0, DL, MVT::v16i1),
46718 FSetCC, DAG.getIntPtrConstant(0, DL));
46719 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
46720 N->getSimpleValueType(0));
46721 }
46722 SDValue OnesOrZeroesF =
46723 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
46724 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
46725
46726 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
46727 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
46728
46729 if (is64BitFP && !Subtarget.is64Bit()) {
46730 // On a 32-bit target, we cannot bitcast the 64-bit float to a
46731 // 64-bit integer, since that's not a legal type. Since
46732 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
46733 // bits, but can do this little dance to extract the lowest 32 bits
46734 // and work with those going forward.
46735 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
46736 OnesOrZeroesF);
46737 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
46738 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
46739 Vector32, DAG.getIntPtrConstant(0, DL));
46740 IntVT = MVT::i32;
46741 }
46742
46743 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
46744 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
46745 DAG.getConstant(1, DL, IntVT));
46746 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
46747 ANDed);
46748 return OneBitOfTruth;
46749 }
46750 }
46751 }
46752 }
46753 return SDValue();
46754}
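A rough intrinsics-level picture of the pattern this combine targets (CMPEQSS producing an all-ones/all-zeros lane whose low bit is then masked off), offered as an illustration only; it is not taken from the analyzed file and assumes SSE2 is available.

#include <emmintrin.h>   // SSE2: _mm_castps_si128, _mm_cvtsi128_si32 (pulls in SSE1 too)

// The scalar compare writes 0xFFFFFFFF or 0x00000000 into the low lane; viewing
// that lane as an integer and AND-ing with 1 yields the boolean without ever
// consulting EFLAGS.
static bool fpEqViaCmpss(float A, float B) {
  __m128 Mask = _mm_cmpeq_ss(_mm_set_ss(A), _mm_set_ss(B));
  int Low = _mm_cvtsi128_si32(_mm_castps_si128(Mask));
  return (Low & 1) != 0;
}

int main() {
  return (fpEqViaCmpss(1.5f, 1.5f) && !fpEqViaCmpss(1.5f, 2.5f)) ? 0 : 1;
}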
46755
46756/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
46757static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
46758  assert(N->getOpcode() == ISD::AND);
46759
46760 MVT VT = N->getSimpleValueType(0);
46761 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
46762 return SDValue();
46763
46764 SDValue X, Y;
46765 SDValue N0 = N->getOperand(0);
46766 SDValue N1 = N->getOperand(1);
46767
46768 auto GetNot = [&VT, &DAG](SDValue V) {
46769 // Basic X = NOT(Y) detection.
46770 if (SDValue Not = IsNOT(V, DAG))
46771 return Not;
46772 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
46773 if (V.getOpcode() == X86ISD::VBROADCAST) {
46774 SDValue Src = V.getOperand(0);
46775 EVT SrcVT = Src.getValueType();
46776 if (!SrcVT.isVector())
46777 return SDValue();
46778 if (SDValue Not = IsNOT(Src, DAG))
46779 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
46780 DAG.getBitcast(SrcVT, Not));
46781 }
46782 return SDValue();
46783 };
46784
46785 if (SDValue Not = GetNot(N0)) {
46786 X = Not;
46787 Y = N1;
46788 } else if (SDValue Not = GetNot(N1)) {
46789 X = Not;
46790 Y = N0;
46791 } else
46792 return SDValue();
46793
46794 X = DAG.getBitcast(VT, X);
46795 Y = DAG.getBitcast(VT, Y);
46796 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
46797}
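The underlying identity is just ~X & Y; a trivial scalar sketch:

#include <cassert>
#include <cstdint>

// (and (xor X, -1), Y) == (~X & Y): a single ANDNP/ANDN-style operation.
static uint32_t viaXor(uint32_t X, uint32_t Y) { return (X ^ 0xFFFFFFFFu) & Y; }
static uint32_t viaAndNot(uint32_t X, uint32_t Y) { return ~X & Y; }

int main() {
  assert(viaXor(0x0F0Fu, 0x00FFu) == viaAndNot(0x0F0Fu, 0x00FFu)); // both 0x00F0
  return 0;
}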
46798
46799// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
46800// logical operations, like in the example below.
46801// or (and (truncate x, truncate y)),
46802// (xor (truncate z, build_vector (constants)))
46803// Given a target type \p VT, we generate
46804// or (and x, y), (xor z, zext(build_vector (constants)))
46805 // where x, y and z are of type \p VT. We can do so if each operand is either
46806 // a truncate from VT, a vector of constants (for the second operand), or can
46807 // itself be recursively promoted.
46808static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
46809 unsigned Depth) {
46810 // Limit recursion to avoid excessive compile times.
46811 if (Depth >= SelectionDAG::MaxRecursionDepth)
46812 return SDValue();
46813
46814 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
46815 N->getOpcode() != ISD::OR)
46816 return SDValue();
46817
46818 SDValue N0 = N->getOperand(0);
46819 SDValue N1 = N->getOperand(1);
46820 SDLoc DL(N);
46821
46822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46823 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
46824 return SDValue();
46825
46826 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
46827 N0 = NN0;
46828 else {
46829 // The Left side has to be a trunc.
46830 if (N0.getOpcode() != ISD::TRUNCATE)
46831 return SDValue();
46832
46833 // The type of the truncated inputs.
46834 if (N0.getOperand(0).getValueType() != VT)
46835 return SDValue();
46836
46837 N0 = N0.getOperand(0);
46838 }
46839
46840 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
46841 N1 = NN1;
46842 else {
46843 // The right side has to be a 'trunc' or a constant vector.
46844 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
46845 N1.getOperand(0).getValueType() == VT;
46846 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
46847 return SDValue();
46848
46849 if (RHSTrunc)
46850 N1 = N1.getOperand(0);
46851 else
46852 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
46853 }
46854
46855 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
46856}
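The promotion is sound because truncation commutes with bitwise logic; a minimal scalar sketch of that property (names invented for the example):

#include <cassert>
#include <cstdint>

// trunc(a) op trunc(b) == trunc(a op b) for and/or/xor, which is what lets the
// casts be pushed outside the logic tree.
static uint16_t narrowThenOp(uint32_t A, uint32_t B) {
  return uint16_t(uint16_t(A) & uint16_t(B));
}
static uint16_t opThenNarrow(uint32_t A, uint32_t B) {
  return uint16_t(A & B);
}

int main() {
  assert(narrowThenOp(0x12345678u, 0x0000FFFFu) == opThenNarrow(0x12345678u, 0x0000FFFFu));
  return 0;
}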
46857
46858// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
46859// register. In most cases we actually compare or select YMM-sized registers
46860// and mixing the two types creates horrible code. This method optimizes
46861// some of the transition sequences.
46862// Even with AVX-512 this is still useful for removing casts around logical
46863// operations on vXi1 mask types.
46864static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
46865 const X86Subtarget &Subtarget) {
46866 EVT VT = N->getValueType(0);
46867  assert(VT.isVector() && "Expected vector type");
46868
46869 SDLoc DL(N);
46870  assert((N->getOpcode() == ISD::ANY_EXTEND ||
46871          N->getOpcode() == ISD::ZERO_EXTEND ||
46872          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
46873
46874 SDValue Narrow = N->getOperand(0);
46875 EVT NarrowVT = Narrow.getValueType();
46876
46877 // Generate the wide operation.
46878 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
46879 if (!Op)
46880 return SDValue();
46881 switch (N->getOpcode()) {
46882  default: llvm_unreachable("Unexpected opcode");
46883 case ISD::ANY_EXTEND:
46884 return Op;
46885 case ISD::ZERO_EXTEND:
46886 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
46887 case ISD::SIGN_EXTEND:
46888 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
46889 Op, DAG.getValueType(NarrowVT));
46890 }
46891}
46892
46893static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
46894 unsigned FPOpcode;
46895 switch (Opcode) {
46896  default: llvm_unreachable("Unexpected input node for FP logic conversion");
46897 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46898 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46899 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46900 }
46901 return FPOpcode;
46902}
46903
46904/// If both input operands of a logic op are being cast from floating-point
46905/// types or FP compares, try to convert this into a floating-point logic node
46906/// to avoid unnecessary moves from SSE to integer registers.
46907static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
46908 TargetLowering::DAGCombinerInfo &DCI,
46909 const X86Subtarget &Subtarget) {
46910 EVT VT = N->getValueType(0);
46911 SDValue N0 = N->getOperand(0);
46912 SDValue N1 = N->getOperand(1);
46913 SDLoc DL(N);
46914
46915 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
46916 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
46917 return SDValue();
46918
46919 SDValue N00 = N0.getOperand(0);
46920 SDValue N10 = N1.getOperand(0);
46921 EVT N00Type = N00.getValueType();
46922 EVT N10Type = N10.getValueType();
46923
46924 // Ensure that both types are the same and are legal scalar fp types.
46925 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
46926 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
46927 (Subtarget.hasFP16() && N00Type == MVT::f16)))
46928 return SDValue();
46929
46930 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
46931 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
46932 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
46933 return DAG.getBitcast(VT, FPLogic);
46934 }
46935
46936 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
46937 !N1.hasOneUse())
46938 return SDValue();
46939
46940 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
46941 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
46942
46943 // The vector ISA for FP predicates is incomplete before AVX, so converting
46944 // COMIS* to CMPS* may not be a win before AVX.
46945 if (!Subtarget.hasAVX() &&
46946 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
46947 return SDValue();
46948
46949 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
46950 // and vector logic:
46951 // logic (setcc N00, N01), (setcc N10, N11) -->
46952 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
46953 unsigned NumElts = 128 / N00Type.getSizeInBits();
46954 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
46955 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46956 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
46957 SDValue N01 = N0.getOperand(1);
46958 SDValue N11 = N1.getOperand(1);
46959 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
46960 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
46961 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
46962 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
46963 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
46964 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
46965 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
46966 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
46967}
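// Illustrative sketch (not part of the original source; names are hypothetical):
// the kind of scalar C++ that produces the BITCAST operand shapes the combine
// above looks for. Without the fold, the float bit patterns would be moved to
// GPRs, AND'ed there, and moved back to an XMM register.
#include <cstdint>
#include <cstring>

static float fp_bitwise_and(float a, float b) {
  uint32_t ia, ib;
  std::memcpy(&ia, &a, sizeof(ia));   // bitcast f32 -> i32
  std::memcpy(&ib, &b, sizeof(ib));   // bitcast f32 -> i32
  uint32_t ir = ia & ib;              // integer AND of the FP bit patterns
  float r;
  std::memcpy(&r, &ir, sizeof(r));    // bitcast i32 -> f32
  return r;                           // with the fold: a single FAND, no GPR trip
}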
46968
46969// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
46970// to reduce XMM->GPR traffic.
46971static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
46972 unsigned Opc = N->getOpcode();
46973  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
46974         "Unexpected bit opcode");
46975
46976 SDValue N0 = N->getOperand(0);
46977 SDValue N1 = N->getOperand(1);
46978
46979 // Both operands must be single use MOVMSK.
46980 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
46981 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
46982 return SDValue();
46983
46984 SDValue Vec0 = N0.getOperand(0);
46985 SDValue Vec1 = N1.getOperand(0);
46986 EVT VecVT0 = Vec0.getValueType();
46987 EVT VecVT1 = Vec1.getValueType();
46988
46989  // Both MOVMSK operands must be from vectors of the same size and the same
46990  // element size, but it's OK for an fp/int difference.
46991 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
46992 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
46993 return SDValue();
46994
46995 SDLoc DL(N);
46996 unsigned VecOpc =
46997 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
46998 SDValue Result =
46999 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
47000 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47001}
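// Illustrative sketch (not part of the original source; names are hypothetical):
// the intrinsics-level effect of the MOVMSK fold above. The rewritten form
// needs only one XMM->GPR transfer instead of two.
#include <immintrin.h>

static int movmsk_and_before(__m128 a, __m128 b) {
  return _mm_movemask_ps(a) & _mm_movemask_ps(b);   // two MOVMSKPS + GPR AND
}
static int movmsk_and_after(__m128 a, __m128 b) {
  return _mm_movemask_ps(_mm_and_ps(a, b));         // one ANDPS + one MOVMSKPS
}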
47002
47003// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
47004// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
47005// handles in InstCombine.
47006static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
47007 unsigned Opc = N->getOpcode();
47008  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47009         "Unexpected bit opcode");
47010
47011 SDValue N0 = N->getOperand(0);
47012 SDValue N1 = N->getOperand(1);
47013 EVT VT = N->getValueType(0);
47014
47015 // Both operands must be single use.
47016 if (!N0.hasOneUse() || !N1.hasOneUse())
47017 return SDValue();
47018
47019 // Search for matching shifts.
47020 SDValue BC0 = peekThroughOneUseBitcasts(N0);
47021 SDValue BC1 = peekThroughOneUseBitcasts(N1);
47022
47023 unsigned BCOpc = BC0.getOpcode();
47024 EVT BCVT = BC0.getValueType();
47025 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
47026 return SDValue();
47027
47028 switch (BCOpc) {
47029 case X86ISD::VSHLI:
47030 case X86ISD::VSRLI:
47031 case X86ISD::VSRAI: {
47032 if (BC0.getOperand(1) != BC1.getOperand(1))
47033 return SDValue();
47034
47035 SDLoc DL(N);
47036 SDValue BitOp =
47037 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
47038 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
47039 return DAG.getBitcast(VT, Shift);
47040 }
47041 }
47042
47043 return SDValue();
47044}
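// Illustrative sketch (not part of the original source; names are hypothetical):
// the scalar analogue of the BITOP(SHIFT(X,Z),SHIFT(Y,Z)) fold, shown on plain
// integers. Both forms compute the same value; the second needs one shift.
static unsigned shift_then_or(unsigned x, unsigned y) {
  return (x >> 3) | (y >> 3);   // two shifts + one OR
}
static unsigned or_then_shift(unsigned x, unsigned y) {
  return (x | y) >> 3;          // one OR + one shift
}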
47045
47046/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
47047/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
47048/// with a shift-right to eliminate loading the vector constant mask value.
47049static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
47050 const X86Subtarget &Subtarget) {
47051 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
47052 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
47053 EVT VT = Op0.getValueType();
47054 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
47055 return SDValue();
47056
47057 // Try to convert an "is positive" signbit masking operation into arithmetic
47058 // shift and "andn". This saves a materialization of a -1 vector constant.
47059 // The "is negative" variant should be handled more generally because it only
47060 // requires "and" rather than "andn":
47061 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
47062 //
47063 // This is limited to the original type to avoid producing even more bitcasts.
47064 // If the bitcasts can't be eliminated, then it is unlikely that this fold
47065 // will be profitable.
47066 if (N->getValueType(0) == VT &&
47067 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
47068 SDValue X, Y;
47069 if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
47070 isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
47071 X = Op1.getOperand(0);
47072 Y = Op0;
47073 } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
47074 isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
47075 X = Op0.getOperand(0);
47076 Y = Op1;
47077 }
47078 if (X && Y) {
47079 SDLoc DL(N);
47080 SDValue Sra =
47081 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
47082 VT.getScalarSizeInBits() - 1, DAG);
47083 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
47084 }
47085 }
47086
47087 APInt SplatVal;
47088 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
47089 !SplatVal.isMask())
47090 return SDValue();
47091
47092 // Don't prevent creation of ANDN.
47093 if (isBitwiseNot(Op0))
47094 return SDValue();
47095
47096 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
47097 return SDValue();
47098
47099 unsigned EltBitWidth = VT.getScalarSizeInBits();
47100 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
47101 return SDValue();
47102
47103 SDLoc DL(N);
47104 unsigned ShiftVal = SplatVal.countTrailingOnes();
47105 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
47106 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
47107 return DAG.getBitcast(N->getValueType(0), Shift);
47108}
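// Illustrative sketch (not part of the original source; names are hypothetical):
// the scalar idea behind combineAndMaskToShift. When a value is known to be
// all-zeros or all-ones (e.g. a sign-extended compare result), masking it with
// a low-bits mask equals a logical shift right, so no constant mask is loaded.
static unsigned mask_with_constant(int cmp) {   // cmp is 0 or -1
  return cmp & 1;                               // needs the constant 1
}
static unsigned mask_with_shift(int cmp) {      // cmp is 0 or -1
  return (unsigned)cmp >> 31;                   // VSRLI-style, same result
}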
47109
47110// Get the index node from the lowered DAG of a GEP IR instruction with one
47111// indexing dimension.
47112static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
47113 if (Ld->isIndexed())
47114 return SDValue();
47115
47116 SDValue Base = Ld->getBasePtr();
47117
47118 if (Base.getOpcode() != ISD::ADD)
47119 return SDValue();
47120
47121 SDValue ShiftedIndex = Base.getOperand(0);
47122
47123 if (ShiftedIndex.getOpcode() != ISD::SHL)
47124 return SDValue();
47125
47126 return ShiftedIndex.getOperand(0);
47127
47128}
47129
47130static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
47131 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
47132 switch (VT.getSizeInBits()) {
47133 default: return false;
47134  case 64: return Subtarget.is64Bit();
47135 case 32: return true;
47136 }
47137 }
47138 return false;
47139}
47140
47141// This function recognizes cases where the X86 bzhi instruction can replace an
47142// 'and-load' sequence.
47143// When an integer value is loaded from an array of constants defined as
47144// follows:
47145//
47146// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
47147//
47148// and a bitwise and is then applied to the result with another input, the
47149// sequence is equivalent to performing bzhi (zero high bits) on that input,
47150// using the same index as the load.
47151static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
47152 const X86Subtarget &Subtarget) {
47153 MVT VT = Node->getSimpleValueType(0);
47154 SDLoc dl(Node);
47155
47156 // Check if subtarget has BZHI instruction for the node's type
47157 if (!hasBZHI(Subtarget, VT))
47158 return SDValue();
47159
47160 // Try matching the pattern for both operands.
47161 for (unsigned i = 0; i < 2; i++) {
47162 SDValue N = Node->getOperand(i);
47163 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
47164
47165    // Bail out if the operand is not a load instruction.
47166 if (!Ld)
47167 return SDValue();
47168
47169 const Value *MemOp = Ld->getMemOperand()->getValue();
47170
47171 if (!MemOp)
47172 return SDValue();
47173
47174 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
47175 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
47176 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
47177
47178 Constant *Init = GV->getInitializer();
47179 Type *Ty = Init->getType();
47180 if (!isa<ConstantDataArray>(Init) ||
47181 !Ty->getArrayElementType()->isIntegerTy() ||
47182 Ty->getArrayElementType()->getScalarSizeInBits() !=
47183 VT.getSizeInBits() ||
47184 Ty->getArrayNumElements() >
47185 Ty->getArrayElementType()->getScalarSizeInBits())
47186 continue;
47187
47188 // Check if the array's constant elements are suitable to our case.
47189 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
47190 bool ConstantsMatch = true;
47191 for (uint64_t j = 0; j < ArrayElementCount; j++) {
47192 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
47193 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
47194 ConstantsMatch = false;
47195 break;
47196 }
47197 }
47198 if (!ConstantsMatch)
47199 continue;
47200
47201 // Do the transformation (For 32-bit type):
47202 // -> (and (load arr[idx]), inp)
47203 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
47204 // that will be replaced with one bzhi instruction.
47205 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
47206 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
47207
47208 // Get the Node which indexes into the array.
47209 SDValue Index = getIndexFromUnindexedLoad(Ld);
47210 if (!Index)
47211 return SDValue();
47212 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
47213
47214 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
47215 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
47216
47217 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
47218 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
47219
47220 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
47221 }
47222 }
47223 }
47224 }
47225 return SDValue();
47226}
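// Illustrative sketch (not part of the original source; names and the table are
// hypothetical): the source-level shape that combineAndLoadToBZHI targets, and
// the equivalent computation, which BMI2 targets can lower to a single BZHI.
#include <cstdint>

static const uint32_t MaskTable[8] = {0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, 0x3F, 0x7F};

static uint32_t and_load(uint32_t inp, unsigned idx) {   // idx < 8
  return inp & MaskTable[idx];                           // load + and
}
static uint32_t bzhi_like(uint32_t inp, unsigned idx) {  // idx < 8
  return inp & ((1u << idx) - 1u);                       // clears bits >= idx
}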
47227
47228// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
47229// where C is a mask containing the same number of bits as the setcc and
47230// where the setcc will freely zero the upper bits of the k-register. We can
47231// replace the undef in the concat with 0s and remove the AND. This mainly
47232// helps with v2i1/v4i1 setcc being cast to scalar.
47233static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
47234 const X86Subtarget &Subtarget) {
47235  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
47236
47237 EVT VT = N->getValueType(0);
47238
47239 // Make sure this is an AND with constant. We will check the value of the
47240 // constant later.
47241 if (!isa<ConstantSDNode>(N->getOperand(1)))
47242 return SDValue();
47243
47244 // This is implied by the ConstantSDNode.
47245  assert(!VT.isVector() && "Expected scalar VT!");
47246
47247 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
47248 !N->getOperand(0).hasOneUse() ||
47249 !N->getOperand(0).getOperand(0).hasOneUse())
47250 return SDValue();
47251
47252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47253 SDValue Src = N->getOperand(0).getOperand(0);
47254 EVT SrcVT = Src.getValueType();
47255 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
47256 !TLI.isTypeLegal(SrcVT))
47257 return SDValue();
47258
47259 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
47260 return SDValue();
47261
47262  // We only care about the first subvector of the concat; we expect the
47263  // other subvectors to be ignored due to the AND if we make the change.
47264 SDValue SubVec = Src.getOperand(0);
47265 EVT SubVecVT = SubVec.getValueType();
47266
47267 // First subvector should be a setcc with a legal result type. The RHS of the
47268 // AND should be a mask with this many bits.
47269 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
47270 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
47271 return SDValue();
47272
47273 EVT SetccVT = SubVec.getOperand(0).getValueType();
47274 if (!TLI.isTypeLegal(SetccVT) ||
47275 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
47276 return SDValue();
47277
47278 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
47279 return SDValue();
47280
47281 // We passed all the checks. Rebuild the concat_vectors with zeroes
47282 // and cast it back to VT.
47283 SDLoc dl(N);
47284 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
47285 DAG.getConstant(0, dl, SubVecVT));
47286 Ops[0] = SubVec;
47287 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
47288 Ops);
47289 return DAG.getBitcast(VT, Concat);
47290}
47291
47292static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
47293 TargetLowering::DAGCombinerInfo &DCI,
47294 const X86Subtarget &Subtarget) {
47295 SDValue N0 = N->getOperand(0);
47296 SDValue N1 = N->getOperand(1);
47297 EVT VT = N->getValueType(0);
47298 SDLoc dl(N);
47299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47300
47301 // If this is SSE1 only convert to FAND to avoid scalarization.
47302 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47303 return DAG.getBitcast(MVT::v4i32,
47304 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
47305 DAG.getBitcast(MVT::v4f32, N0),
47306 DAG.getBitcast(MVT::v4f32, N1)));
47307 }
47308
47309 // Use a 32-bit and+zext if upper bits known zero.
47310 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
47311 APInt HiMask = APInt::getHighBitsSet(64, 32);
47312 if (DAG.MaskedValueIsZero(N1, HiMask) ||
47313 DAG.MaskedValueIsZero(N0, HiMask)) {
47314 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
47315 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
47316 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
47317 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
47318 }
47319 }
47320
47321 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
47322 // TODO: Support multiple SrcOps.
47323 if (VT == MVT::i1) {
47324 SmallVector<SDValue, 2> SrcOps;
47325 SmallVector<APInt, 2> SrcPartials;
47326 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
47327 SrcOps.size() == 1) {
47328 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
47329 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47330 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
47331 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
47332 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
47333 if (Mask) {
47334        assert(SrcPartials[0].getBitWidth() == NumElts &&
47335               "Unexpected partial reduction mask");
47336 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
47337 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
47338 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
47339 }
47340 }
47341 }
47342
47343 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
47344 return V;
47345
47346 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47347 return R;
47348
47349 if (SDValue R = combineBitOpWithShift(N, DAG))
47350 return R;
47351
47352 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
47353 return FPLogic;
47354
47355 if (DCI.isBeforeLegalizeOps())
47356 return SDValue();
47357
47358 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
47359 return R;
47360
47361 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
47362 return R;
47363
47364 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
47365 return ShiftRight;
47366
47367 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
47368 return R;
47369
47370  // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
47371  // This avoids a slow variable shift (moving the shift amount to ECX etc.).
47372 if (isOneConstant(N1) && N0->hasOneUse()) {
47373 SDValue Src = N0;
47374 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
47375 Src.getOpcode() == ISD::TRUNCATE) &&
47376 Src.getOperand(0)->hasOneUse())
47377 Src = Src.getOperand(0);
47378 X86::CondCode X86CC = X86::COND_B;
47379 // Peek through AND(NOT(SRL(X,Y)),1).
47380 if (isBitwiseNot(Src)) {
47381 Src = Src.getOperand(0);
47382 X86CC = X86::COND_AE;
47383 }
47384 if (Src.getOpcode() == ISD::SRL &&
47385 !isa<ConstantSDNode>(Src.getOperand(1))) {
47386 SDValue BitNo = Src.getOperand(1);
47387 Src = Src.getOperand(0);
47388 // Peek through AND(SRL(NOT(X),Y),1).
47389 if (isBitwiseNot(Src)) {
47390 Src = Src.getOperand(0);
47391 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
47392 }
47393 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
47394 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
47395 }
47396 }
47397
47398 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47399 // Attempt to recursively combine a bitmask AND with shuffles.
47400 SDValue Op(N, 0);
47401 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47402 return Res;
47403
47404 // If either operand is a constant mask, then only the elements that aren't
47405 // zero are actually demanded by the other operand.
47406 auto GetDemandedMasks = [&](SDValue Op) {
47407 APInt UndefElts;
47408 SmallVector<APInt> EltBits;
47409 int NumElts = VT.getVectorNumElements();
47410 int EltSizeInBits = VT.getScalarSizeInBits();
47411 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
47412 APInt DemandedElts = APInt::getAllOnes(NumElts);
47413 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
47414 EltBits)) {
47415 DemandedBits.clearAllBits();
47416 DemandedElts.clearAllBits();
47417 for (int I = 0; I != NumElts; ++I)
47418 if (!EltBits[I].isZero()) {
47419 DemandedBits |= EltBits[I];
47420 DemandedElts.setBit(I);
47421 }
47422 }
47423 return std::make_pair(DemandedBits, DemandedElts);
47424 };
47425 std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
47426 std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0);
47427
47428 if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
47429 TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
47430 TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
47431 TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
47432 if (N->getOpcode() != ISD::DELETED_NODE)
47433 DCI.AddToWorklist(N);
47434 return SDValue(N, 0);
47435 }
47436
47437 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Demand0.first,
47438 Demand0.second, DAG);
47439 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Demand1.first,
47440 Demand1.second, DAG);
47441 if (NewN0 || NewN1)
47442 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
47443 NewN1 ? NewN1 : N1);
47444 }
47445
47446 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
47447 if ((VT.getScalarSizeInBits() % 8) == 0 &&
47448 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47449 isa<ConstantSDNode>(N0.getOperand(1))) {
47450 SDValue BitMask = N1;
47451 SDValue SrcVec = N0.getOperand(0);
47452 EVT SrcVecVT = SrcVec.getValueType();
47453
47454 // Check that the constant bitmask masks whole bytes.
47455 APInt UndefElts;
47456 SmallVector<APInt, 64> EltBits;
47457 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
47458 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
47459 llvm::all_of(EltBits, [](const APInt &M) {
47460 return M.isZero() || M.isAllOnes();
47461 })) {
47462 unsigned NumElts = SrcVecVT.getVectorNumElements();
47463 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
47464 unsigned Idx = N0.getConstantOperandVal(1);
47465
47466 // Create a root shuffle mask from the byte mask and the extracted index.
47467 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
47468 for (unsigned i = 0; i != Scale; ++i) {
47469 if (UndefElts[i])
47470 continue;
47471 int VecIdx = Scale * Idx + i;
47472 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
47473 }
47474
47475 if (SDValue Shuffle = combineX86ShufflesRecursively(
47476 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
47477 X86::MaxShuffleCombineDepth,
47478 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
47479 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
47480 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
47481 N0.getOperand(1));
47482 }
47483 }
47484
47485 return SDValue();
47486}
47487
47488// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
47489static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
47490 const X86Subtarget &Subtarget) {
47491  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
47492
47493 MVT VT = N->getSimpleValueType(0);
47494 unsigned EltSizeInBits = VT.getScalarSizeInBits();
47495 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
47496 return SDValue();
47497
47498 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
47499 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
47500 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
47501 return SDValue();
47502
47503 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
47504 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
47505 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
47506 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
47507 return SDValue();
47508
47509 // Attempt to extract constant byte masks.
47510 APInt UndefElts0, UndefElts1;
47511 SmallVector<APInt, 32> EltBits0, EltBits1;
47512 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
47513 false, false))
47514 return SDValue();
47515 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
47516 false, false))
47517 return SDValue();
47518
47519 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
47520 // TODO - add UNDEF elts support.
47521 if (UndefElts0[i] || UndefElts1[i])
47522 return SDValue();
47523 if (EltBits0[i] != ~EltBits1[i])
47524 return SDValue();
47525 }
47526
47527 SDLoc DL(N);
47528
47529 if (useVPTERNLOG(Subtarget, VT)) {
47530 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
47531    // VPTERNLOG is only available for vXi32/vXi64 types.
47532 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
47533 MVT OpVT =
47534 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
47535 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
47536 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
47537 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
47538 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
47539 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
47540 DAG, Subtarget);
47541 return DAG.getBitcast(VT, Res);
47542 }
47543
47544 SDValue X = N->getOperand(0);
47545 SDValue Y =
47546 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
47547 DAG.getBitcast(VT, N1.getOperand(0)));
47548 return DAG.getNode(ISD::OR, DL, VT, X, Y);
47549}
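// Illustrative sketch (not part of the original source; names are hypothetical):
// the scalar form of the bit-select pattern canonicalized above, where the two
// AND masks are complementary constants. In vector form this becomes
// AND+ANDNP+OR, or a single VPTERNLOG (imm 0xCA) when available.
static unsigned bit_select(unsigned x, unsigned y) {
  const unsigned C = 0x00FF00FFu;
  return (x & C) | (y & ~C);
}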
47550
47551// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
47552static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
47553 if (N->getOpcode() != ISD::OR)
47554 return false;
47555
47556 SDValue N0 = N->getOperand(0);
47557 SDValue N1 = N->getOperand(1);
47558
47559 // Canonicalize AND to LHS.
47560 if (N1.getOpcode() == ISD::AND)
47561 std::swap(N0, N1);
47562
47563 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
47564 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
47565 return false;
47566
47567 Mask = N1.getOperand(0);
47568 X = N1.getOperand(1);
47569
47570 // Check to see if the mask appeared in both the AND and ANDNP.
47571 if (N0.getOperand(0) == Mask)
47572 Y = N0.getOperand(1);
47573 else if (N0.getOperand(1) == Mask)
47574 Y = N0.getOperand(0);
47575 else
47576 return false;
47577
47578 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
47579 // ANDNP combine allows other combines to happen that prevent matching.
47580 return true;
47581}
47582
47583// Try to fold:
47584// (or (and (m, y), (pandn m, x)))
47585// into:
47586// (vselect m, x, y)
47587// As a special case, try to fold:
47588// (or (and (m, (sub 0, x)), (pandn m, x)))
47589// into:
47590// (sub (xor X, M), M)
47591static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
47592 const X86Subtarget &Subtarget) {
47593  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
47594
47595 EVT VT = N->getValueType(0);
47596 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
47597 (VT.is256BitVector() && Subtarget.hasInt256())))
47598 return SDValue();
47599
47600 SDValue X, Y, Mask;
47601 if (!matchLogicBlend(N, X, Y, Mask))
47602 return SDValue();
47603
47604 // Validate that X, Y, and Mask are bitcasts, and see through them.
47605 Mask = peekThroughBitcasts(Mask);
47606 X = peekThroughBitcasts(X);
47607 Y = peekThroughBitcasts(Y);
47608
47609 EVT MaskVT = Mask.getValueType();
47610 unsigned EltBits = MaskVT.getScalarSizeInBits();
47611
47612 // TODO: Attempt to handle floating point cases as well?
47613 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
47614 return SDValue();
47615
47616 SDLoc DL(N);
47617
47618 // Attempt to combine to conditional negate: (sub (xor X, M), M)
47619 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
47620 DAG, Subtarget))
47621 return Res;
47622
47623 // PBLENDVB is only available on SSE 4.1.
47624 if (!Subtarget.hasSSE41())
47625 return SDValue();
47626
47627 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
47628 if (Subtarget.hasVLX())
47629 return SDValue();
47630
47631 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
47632
47633 X = DAG.getBitcast(BlendVT, X);
47634 Y = DAG.getBitcast(BlendVT, Y);
47635 Mask = DAG.getBitcast(BlendVT, Mask);
47636 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
47637 return DAG.getBitcast(VT, Mask);
47638}
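// Illustrative sketch (not part of the original source; names are hypothetical):
// the intrinsics-level view of the logic-blend fold, assuming every byte of the
// mask m is either 0x00 or 0xFF (all sign bits, as the combine requires).
#include <immintrin.h>

static __m128i blend_with_logic(__m128i m, __m128i x, __m128i y) {
  return _mm_or_si128(_mm_and_si128(m, y), _mm_andnot_si128(m, x));
}
static __m128i blend_with_pblendvb(__m128i m, __m128i x, __m128i y) {
  return _mm_blendv_epi8(x, y, m);   // y where m's byte sign bit is set, else x
}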
47639
47640// Helper function for combineOrCmpEqZeroToCtlzSrl
47641// Transforms:
47642// seteq(cmp x, 0)
47643// into:
47644// srl(ctlz x), log2(bitsize(x))
47645// Input pattern is checked by caller.
47646static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
47647 SDValue Cmp = Op.getOperand(1);
47648 EVT VT = Cmp.getOperand(0).getValueType();
47649 unsigned Log2b = Log2_32(VT.getSizeInBits());
47650 SDLoc dl(Op);
47651 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
47652 // The result of the shift is true or false, and on X86, the 32-bit
47653 // encoding of shr and lzcnt is more desirable.
47654 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
47655 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
47656 DAG.getConstant(Log2b, dl, MVT::i8));
47657 return Scc;
47658}
47659
47660// Try to transform:
47661// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
47662// into:
47663// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
47664// Will also attempt to match more generic cases, eg:
47665// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
47666// Only applies if the target supports the FastLZCNT feature.
47667static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
47668 TargetLowering::DAGCombinerInfo &DCI,
47669 const X86Subtarget &Subtarget) {
47670 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
47671 return SDValue();
47672
47673 auto isORCandidate = [](SDValue N) {
47674 return (N->getOpcode() == ISD::OR && N->hasOneUse());
47675 };
47676
47677  // Check that the zero extend is extending to 32 bits or more. The code
47678  // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
47679  // require extra instructions to clear the upper bits.
47680 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
47681 !isORCandidate(N->getOperand(0)))
47682 return SDValue();
47683
47684 // Check the node matches: setcc(eq, cmp 0)
47685 auto isSetCCCandidate = [](SDValue N) {
47686 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
47687 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
47688 N->getOperand(1).getOpcode() == X86ISD::CMP &&
47689 isNullConstant(N->getOperand(1).getOperand(1)) &&
47690 N->getOperand(1).getValueType().bitsGE(MVT::i32);
47691 };
47692
47693 SDNode *OR = N->getOperand(0).getNode();
47694 SDValue LHS = OR->getOperand(0);
47695 SDValue RHS = OR->getOperand(1);
47696
47697 // Save nodes matching or(or, setcc(eq, cmp 0)).
47698 SmallVector<SDNode *, 2> ORNodes;
47699 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
47700 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
47701 ORNodes.push_back(OR);
47702 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
47703 LHS = OR->getOperand(0);
47704 RHS = OR->getOperand(1);
47705 }
47706
47707 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
47708 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
47709 !isORCandidate(SDValue(OR, 0)))
47710 return SDValue();
47711
47712  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
47713  // to
47714  // or(srl(ctlz),srl(ctlz)).
47715  // The DAG combiner can then fold it into:
47716  // srl(or(ctlz, ctlz)).
47717 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
47718 SDValue Ret, NewRHS;
47719 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
47720 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
47721
47722 if (!Ret)
47723 return SDValue();
47724
47725 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
47726 while (ORNodes.size() > 0) {
47727 OR = ORNodes.pop_back_val();
47728 LHS = OR->getOperand(0);
47729 RHS = OR->getOperand(1);
47730 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
47731 if (RHS->getOpcode() == ISD::OR)
47732 std::swap(LHS, RHS);
47733 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
47734 if (!NewRHS)
47735 return SDValue();
47736 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
47737 }
47738
47739 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
47740}
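// Illustrative sketch (not part of the original source; names are hypothetical):
// with a fast LZCNT, the "is either value zero" test on the left can lower to
// the lzcnt/shift form on the right. lzcnt returns 32 for a zero input, so bit
// 5 of the result records whether the input was zero.
#include <immintrin.h>

static unsigned either_zero(unsigned x, unsigned y) {
  return (x == 0) | (y == 0);
}
static unsigned either_zero_lzcnt(unsigned x, unsigned y) {
  return (_lzcnt_u32(x) | _lzcnt_u32(y)) >> 5;   // 5 == log2(32)
}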
47741
47742static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
47743 SDValue And1_L, SDValue And1_R, SDLoc DL,
47744 SelectionDAG &DAG) {
47745 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
47746 return SDValue();
47747 SDValue NotOp = And0_L->getOperand(0);
47748 if (NotOp == And1_R)
47749 std::swap(And1_R, And1_L);
47750 if (NotOp != And1_L)
47751 return SDValue();
47752
47753 // (~(NotOp) & And0_R) | (NotOp & And1_R)
47754 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
47755 EVT VT = And1_L->getValueType(0);
47756 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
47757 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
47758 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
47759 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
47760 return Xor1;
47761}
47762
47763/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
47764/// equivalent `((x ^ y) & m) ^ y)` pattern.
47765/// This is typically a better representation for targets without a fused
47766/// "and-not" operation. This function is intended to be called from a
47767/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
47768static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
47769 // Note that masked-merge variants using XOR or ADD expressions are
47770 // normalized to OR by InstCombine so we only check for OR.
47771  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
47772 SDValue N0 = Node->getOperand(0);
47773 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
47774 return SDValue();
47775 SDValue N1 = Node->getOperand(1);
47776 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
47777 return SDValue();
47778
47779 SDLoc DL(Node);
47780 SDValue N00 = N0->getOperand(0);
47781 SDValue N01 = N0->getOperand(1);
47782 SDValue N10 = N1->getOperand(0);
47783 SDValue N11 = N1->getOperand(1);
47784 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
47785 return Result;
47786 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
47787 return Result;
47788 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
47789 return Result;
47790 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
47791 return Result;
47792 return SDValue();
47793}
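// Illustrative sketch (not part of the original source; names are hypothetical):
// the scalar identity behind foldMaskedMerge, useful when an and-not
// instruction is unavailable.
static unsigned masked_merge(unsigned m, unsigned x, unsigned y) {
  return (m & x) | (~m & y);   // NOT + two ANDs + OR (or AND + ANDN + OR)
}
static unsigned masked_merge_folded(unsigned m, unsigned x, unsigned y) {
  return ((x ^ y) & m) ^ y;    // two XORs + one AND, no NOT required
}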
47794
47795static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
47796 TargetLowering::DAGCombinerInfo &DCI,
47797 const X86Subtarget &Subtarget) {
47798 SDValue N0 = N->getOperand(0);
47799 SDValue N1 = N->getOperand(1);
47800 EVT VT = N->getValueType(0);
47801 SDLoc dl(N);
47802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47803
47804 // If this is SSE1 only convert to FOR to avoid scalarization.
47805 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47806 return DAG.getBitcast(MVT::v4i32,
47807 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
47808 DAG.getBitcast(MVT::v4f32, N0),
47809 DAG.getBitcast(MVT::v4f32, N1)));
47810 }
47811
47812 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
47813 // TODO: Support multiple SrcOps.
47814 if (VT == MVT::i1) {
47815 SmallVector<SDValue, 2> SrcOps;
47816 SmallVector<APInt, 2> SrcPartials;
47817 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
47818 SrcOps.size() == 1) {
47819 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
47820 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47821 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
47822 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
47823 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
47824 if (Mask) {
47825        assert(SrcPartials[0].getBitWidth() == NumElts &&
47826               "Unexpected partial reduction mask");
47827 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
47828 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
47829 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
47830 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
47831 }
47832 }
47833 }
47834
47835 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47836 return R;
47837
47838 if (SDValue R = combineBitOpWithShift(N, DAG))
47839 return R;
47840
47841 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
47842 return FPLogic;
47843
47844 if (DCI.isBeforeLegalizeOps())
47845 return SDValue();
47846
47847 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
47848 return R;
47849
47850 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
47851 return R;
47852
47853 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
47854 return R;
47855
47856 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
47857 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
47858 // iff the upper elements of the non-shifted arg are zero.
47859  // KUNPCK requires 16+ bool vector elements.
47860 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
47861 unsigned NumElts = VT.getVectorNumElements();
47862 unsigned HalfElts = NumElts / 2;
47863 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
47864 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
47865 N1.getConstantOperandAPInt(1) == HalfElts &&
47866 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
47867 return DAG.getNode(
47868 ISD::CONCAT_VECTORS, dl, VT,
47869 extractSubVector(N0, 0, DAG, dl, HalfElts),
47870 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
47871 }
47872 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
47873 N0.getConstantOperandAPInt(1) == HalfElts &&
47874 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
47875 return DAG.getNode(
47876 ISD::CONCAT_VECTORS, dl, VT,
47877 extractSubVector(N1, 0, DAG, dl, HalfElts),
47878 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
47879 }
47880 }
47881
47882 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47883 // Attempt to recursively combine an OR of shuffles.
47884 SDValue Op(N, 0);
47885 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47886 return Res;
47887
47888 // If either operand is a constant mask, then only the elements that aren't
47889 // allones are actually demanded by the other operand.
47890 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
47891 APInt UndefElts;
47892 SmallVector<APInt> EltBits;
47893 int NumElts = VT.getVectorNumElements();
47894 int EltSizeInBits = VT.getScalarSizeInBits();
47895 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
47896 return false;
47897
47898 APInt DemandedElts = APInt::getZero(NumElts);
47899 for (int I = 0; I != NumElts; ++I)
47900 if (!EltBits[I].isAllOnes())
47901 DemandedElts.setBit(I);
47902
47903 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
47904 };
47905 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
47906 if (N->getOpcode() != ISD::DELETED_NODE)
47907 DCI.AddToWorklist(N);
47908 return SDValue(N, 0);
47909 }
47910 }
47911
47912 // We should fold "masked merge" patterns when `andn` is not available.
47913 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
47914 if (SDValue R = foldMaskedMerge(N, DAG))
47915 return R;
47916
47917 return SDValue();
47918}
47919
47920/// Try to turn tests against the signbit in the form of:
47921/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
47922/// into:
47923/// SETGT(X, -1)
47924static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
47925 // This is only worth doing if the output type is i8 or i1.
47926 EVT ResultType = N->getValueType(0);
47927 if (ResultType != MVT::i8 && ResultType != MVT::i1)
47928 return SDValue();
47929
47930 SDValue N0 = N->getOperand(0);
47931 SDValue N1 = N->getOperand(1);
47932
47933 // We should be performing an xor against a truncated shift.
47934 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
47935 return SDValue();
47936
47937 // Make sure we are performing an xor against one.
47938 if (!isOneConstant(N1))
47939 return SDValue();
47940
47941 // SetCC on x86 zero extends so only act on this if it's a logical shift.
47942 SDValue Shift = N0.getOperand(0);
47943 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
47944 return SDValue();
47945
47946 // Make sure we are truncating from one of i16, i32 or i64.
47947 EVT ShiftTy = Shift.getValueType();
47948 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
47949 return SDValue();
47950
47951 // Make sure the shift amount extracts the sign bit.
47952 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
47953 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
47954 return SDValue();
47955
47956 // Create a greater-than comparison against -1.
47957  // N.B. Using SETGE against 0 works but we want a canonical-looking
47958  // comparison; using SETGT matches up with what TranslateX86CC produces.
47959 SDLoc DL(N);
47960 SDValue ShiftOp = Shift.getOperand(0);
47961 EVT ShiftOpTy = ShiftOp.getValueType();
47962 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47963 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
47964 *DAG.getContext(), ResultType);
47965 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
47966 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
47967 if (SetCCResultType != ResultType)
47968 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
47969 return Cond;
47970}
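// Illustrative sketch (not part of the original source; names are hypothetical):
// both functions compute "is X non-negative"; the combine prefers the plain
// comparison form.
static unsigned char signbit_xor(int x) {
  return (unsigned char)((unsigned)x >> 31) ^ 1;   // xor(trunc(srl(x, 31)), 1)
}
static unsigned char signbit_cmp(int x) {
  return x > -1;                                   // setgt(x, -1)
}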
47971
47972/// Turn vector tests of the signbit in the form of:
47973/// xor (sra X, elt_size(X)-1), -1
47974/// into:
47975/// pcmpgt X, -1
47976///
47977/// This should be called before type legalization because the pattern may not
47978/// persist after that.
47979static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
47980 const X86Subtarget &Subtarget) {
47981 EVT VT = N->getValueType(0);
47982 if (!VT.isSimple())
47983 return SDValue();
47984
47985 switch (VT.getSimpleVT().SimpleTy) {
47986 default: return SDValue();
47987 case MVT::v16i8:
47988 case MVT::v8i16:
47989 case MVT::v4i32:
47990 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
47991 case MVT::v32i8:
47992 case MVT::v16i16:
47993 case MVT::v8i32:
47994 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
47995 }
47996
47997 // There must be a shift right algebraic before the xor, and the xor must be a
47998 // 'not' operation.
47999 SDValue Shift = N->getOperand(0);
48000 SDValue Ones = N->getOperand(1);
48001 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
48002 !ISD::isBuildVectorAllOnes(Ones.getNode()))
48003 return SDValue();
48004
48005 // The shift should be smearing the sign bit across each vector element.
48006 auto *ShiftAmt =
48007 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
48008 if (!ShiftAmt ||
48009 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
48010 return SDValue();
48011
48012 // Create a greater-than comparison against -1. We don't use the more obvious
48013 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
48014 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
48015}
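// Illustrative sketch (not part of the original source; names are hypothetical):
// the v4i32 vector signbit test before and after the fold, with SSE2 intrinsics.
#include <immintrin.h>

static __m128i signbit_not_sra(__m128i x) {
  __m128i all_ones = _mm_set1_epi32(-1);
  return _mm_xor_si128(_mm_srai_epi32(x, 31), all_ones);   // not(sra(x, 31))
}
static __m128i signbit_pcmpgt(__m128i x) {
  return _mm_cmpgt_epi32(x, _mm_set1_epi32(-1));           // pcmpgt x, -1
}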
48016
48017/// Detect patterns of truncation with unsigned saturation:
48018///
48019/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
48020/// Return the source value x to be truncated or SDValue() if the pattern was
48021/// not matched.
48022///
48023/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
48024/// where C1 >= 0 and C2 is unsigned max of destination type.
48025///
48026/// (truncate (smax (smin (x, C2), C1)) to dest_type)
48027/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
48028///
48029/// These two patterns are equivalent to:
48030/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
48031/// So return the smax(x, C1) value to be truncated or SDValue() if the
48032/// pattern was not matched.
48033static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
48034 const SDLoc &DL) {
48035 EVT InVT = In.getValueType();
48036
48037 // Saturation with truncation. We truncate from InVT to VT.
48038  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
48039         "Unexpected types for truncate operation");
48040
48041 // Match min/max and return limit value as a parameter.
48042 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
48043 if (V.getOpcode() == Opcode &&
48044 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
48045 return V.getOperand(0);
48046 return SDValue();
48047 };
48048
48049 APInt C1, C2;
48050 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
48051    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
48052    // the element size of the destination type.
48053 if (C2.isMask(VT.getScalarSizeInBits()))
48054 return UMin;
48055
48056 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
48057 if (MatchMinMax(SMin, ISD::SMAX, C1))
48058 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
48059 return SMin;
48060
48061 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
48062 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
48063 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
48064 C2.uge(C1)) {
48065 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
48066 }
48067
48068 return SDValue();
48069}
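// Illustrative sketch (not part of the original source; names are hypothetical):
// the scalar shape of pattern 1 above, an unsigned-saturating truncation from
// i32 to i8. The vector form of this clamp+truncate maps onto PACKUS/VPMOVUS*.
#include <algorithm>
#include <cstdint>

static uint8_t usat_trunc(uint32_t x) {
  return (uint8_t)std::min<uint32_t>(x, 255u);   // trunc(umin(x, UINT8_MAX))
}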
48070
48071/// Detect patterns of truncation with signed saturation:
48072/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
48073/// signed_max_of_dest_type)) to dest_type)
48074/// or:
48075/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
48076/// signed_min_of_dest_type)) to dest_type).
48077/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
48078/// Return the source value to be truncated or SDValue() if the pattern was not
48079/// matched.
48080static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
48081 unsigned NumDstBits = VT.getScalarSizeInBits();
48082 unsigned NumSrcBits = In.getScalarValueSizeInBits();
48083  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
48084
48085 auto MatchMinMax = [](SDValue V, unsigned Opcode,
48086 const APInt &Limit) -> SDValue {
48087 APInt C;
48088 if (V.getOpcode() == Opcode &&
48089 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
48090 return V.getOperand(0);
48091 return SDValue();
48092 };
48093
48094 APInt SignedMax, SignedMin;
48095 if (MatchPackUS) {
48096 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
48097 SignedMin = APInt(NumSrcBits, 0);
48098 } else {
48099 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
48100 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
48101 }
48102
48103 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
48104 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
48105 return SMax;
48106
48107 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
48108 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
48109 return SMin;
48110
48111 return SDValue();
48112}
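// Illustrative sketch (not part of the original source; names are hypothetical):
// the scalar shape of the signed-saturating truncation detected above, from
// i32 to i8. The vector form maps onto PACKSS/VPMOVS*.
#include <algorithm>
#include <cstdint>

static int8_t ssat_trunc(int32_t x) {
  return (int8_t)std::min(std::max(x, -128), 127);   // trunc(smin(smax(x, MIN), MAX))
}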
48113
48114static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
48115 SelectionDAG &DAG,
48116 const X86Subtarget &Subtarget) {
48117 if (!Subtarget.hasSSE2() || !VT.isVector())
48118 return SDValue();
48119
48120 EVT SVT = VT.getVectorElementType();
48121 EVT InVT = In.getValueType();
48122 EVT InSVT = InVT.getVectorElementType();
48123
48124  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
48125  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
48126  // and concatenate at the same time. Then we can use a final vpmovuswb to
48127  // clip to 0-255.
48128 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
48129 InVT == MVT::v16i32 && VT == MVT::v16i8) {
48130 if (auto USatVal = detectSSatPattern(In, VT, true)) {
48131 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
48132 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
48133 DL, DAG, Subtarget);
48134      assert(Mid && "Failed to pack!");
48135 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
48136 }
48137 }
48138
48139 // vXi32 truncate instructions are available with AVX512F.
48140 // vXi16 truncate instructions are only available with AVX512BW.
48141 // For 256-bit or smaller vectors, we require VLX.
48142 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
48143  // If the result type is 256 bits or larger and we have disabled 512-bit
48144  // registers, we should go ahead and use the pack instructions if possible.
48145 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
48146 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
48147 (InVT.getSizeInBits() > 128) &&
48148 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
48149 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
48150
48151 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
48152 VT.getSizeInBits() >= 64 &&
48153 (SVT == MVT::i8 || SVT == MVT::i16) &&
48154 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
48155 if (auto USatVal = detectSSatPattern(In, VT, true)) {
48156 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
48157      // Only do this when the result is at least 64 bits or we'll leave
48158      // dangling PACKSSDW nodes.
48159 if (SVT == MVT::i8 && InSVT == MVT::i32) {
48160 EVT MidVT = VT.changeVectorElementType(MVT::i16);
48161 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
48162 DAG, Subtarget);
48163        assert(Mid && "Failed to pack!");
48164 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
48165 Subtarget);
48166        assert(V && "Failed to pack!");
48167 return V;
48168 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
48169 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
48170 Subtarget);
48171 }
48172 if (auto SSatVal = detectSSatPattern(In, VT))
48173 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
48174 Subtarget);
48175 }
48176
48177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48178 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
48179 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
48180 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
48181 unsigned TruncOpc = 0;
48182 SDValue SatVal;
48183 if (auto SSatVal = detectSSatPattern(In, VT)) {
48184 SatVal = SSatVal;
48185 TruncOpc = X86ISD::VTRUNCS;
48186 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
48187 SatVal = USatVal;
48188 TruncOpc = X86ISD::VTRUNCUS;
48189 }
48190 if (SatVal) {
48191 unsigned ResElts = VT.getVectorNumElements();
48192 // If the input type is less than 512 bits and we don't have VLX, we need
48193 // to widen to 512 bits.
48194 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
48195 unsigned NumConcats = 512 / InVT.getSizeInBits();
48196 ResElts *= NumConcats;
48197 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
48198 ConcatOps[0] = SatVal;
48199 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
48200 NumConcats * InVT.getVectorNumElements());
48201 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
48202 }
48203 // Widen the result if its narrower than 128 bits.
48204 if (ResElts * SVT.getSizeInBits() < 128)
48205 ResElts = 128 / SVT.getSizeInBits();
48206 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
48207 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
48208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
48209 DAG.getIntPtrConstant(0, DL));
48210 }
48211 }
48212
48213 return SDValue();
48214}
48215
48216/// This function detects the AVG pattern between vectors of unsigned i8/i16,
48217/// which is c = (a + b + 1) / 2, and replaces this operation with the
48218/// efficient ISD::AVGCEILU (AVG) instruction.
48219static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
48220 const X86Subtarget &Subtarget,
48221 const SDLoc &DL) {
48222 if (!VT.isVector())
48223 return SDValue();
48224 EVT InVT = In.getValueType();
48225 unsigned NumElems = VT.getVectorNumElements();
48226
48227 EVT ScalarVT = VT.getVectorElementType();
48228 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
48229 return SDValue();
48230
48231 // InScalarVT is the intermediate type in the AVG pattern and it should be greater
48232 // than the original input type (i8/i16).
48233 EVT InScalarVT = InVT.getVectorElementType();
48234 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
48235 return SDValue();
48236
48237 if (!Subtarget.hasSSE2())
48238 return SDValue();
48239
48240 // Detect the following pattern:
48241 //
48242 // %1 = zext <N x i8> %a to <N x i32>
48243 // %2 = zext <N x i8> %b to <N x i32>
48244 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
48245 // %4 = add nuw nsw <N x i32> %3, %2
48246 // %5 = lshr <N x i32> %4, <i32 1 x N>
48247 // %6 = trunc <N x i32> %5 to <N x i8>
48248 //
48249 // In AVX512, the last instruction can also be a trunc store.
48250 if (In.getOpcode() != ISD::SRL)
48251 return SDValue();
48252
48253 // A lambda checking whether the given SDValue is a constant vector and each
48254 // element is in the range [Min, Max].
48255 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
48256 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
48257 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
48258 });
48259 };
48260
48261 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
48262 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
48263 return MaxActiveBits <= ScalarVT.getSizeInBits();
48264 };
48265
48266 // Check if each element of the vector is right-shifted by one.
48267 SDValue LHS = In.getOperand(0);
48268 SDValue RHS = In.getOperand(1);
48269 if (!IsConstVectorInRange(RHS, 1, 1))
48270 return SDValue();
48271 if (LHS.getOpcode() != ISD::ADD)
48272 return SDValue();
48273
48274 // Detect a pattern of a + b + 1 where the order doesn't matter.
48275 SDValue Operands[3];
48276 Operands[0] = LHS.getOperand(0);
48277 Operands[1] = LHS.getOperand(1);
48278
48279 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48280 ArrayRef<SDValue> Ops) {
48281 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
48282 };
48283
48284 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
48285 for (SDValue &Op : Ops)
48286 if (Op.getValueType() != VT)
48287 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
48288 // Pad to a power-of-2 vector, split+apply and extract the original vector.
48289 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
48290 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
48291 if (NumElemsPow2 != NumElems) {
48292 for (SDValue &Op : Ops) {
48293 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
48294 for (unsigned i = 0; i != NumElems; ++i) {
48295 SDValue Idx = DAG.getIntPtrConstant(i, DL);
48296 EltsOfOp[i] =
48297 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
48298 }
48299 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
48300 }
48301 }
48302 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
48303 if (NumElemsPow2 == NumElems)
48304 return Res;
48305 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
48306 DAG.getIntPtrConstant(0, DL));
48307 };
48308
48309 // Take care of the case when one of the operands is a constant vector whose
48310 // elements are in the range [1, 256] (i8) or [1, 65536] (i16).
48311 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
48312 IsZExtLike(Operands[0])) {
48313 // The pattern is detected. Subtract one from the constant vector, then
48314 // demote it and emit an X86ISD::AVG instruction.
48315 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
48316 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
48317 return AVGSplitter({Operands[0], Operands[1]});
48318 }
48319
48320 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
48321 // Match the or case only if it's 'add-like' - can be replaced by an add.
48322 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
48323 if (ISD::ADD == V.getOpcode()) {
48324 Op0 = V.getOperand(0);
48325 Op1 = V.getOperand(1);
48326 return true;
48327 }
48328 if (ISD::ZERO_EXTEND != V.getOpcode())
48329 return false;
48330 V = V.getOperand(0);
48331 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
48332 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
48333 return false;
48334 Op0 = V.getOperand(0);
48335 Op1 = V.getOperand(1);
48336 return true;
48337 };
48338
48339 SDValue Op0, Op1;
48340 if (FindAddLike(Operands[0], Op0, Op1))
48341 std::swap(Operands[0], Operands[1]);
48342 else if (!FindAddLike(Operands[1], Op0, Op1))
48343 return SDValue();
48344 Operands[2] = Op0;
48345 Operands[1] = Op1;
48346
48347 // Now we have three operands of two additions. Check that one of them is a
48348 // constant vector with ones, and the other two can be promoted from i8/i16.
48349 for (int i = 0; i < 3; ++i) {
48350 if (!IsConstVectorInRange(Operands[i], 1, 1))
48351 continue;
48352 std::swap(Operands[i], Operands[2]);
48353
48354 // Check if Operands[0] and Operands[1] are results of type promotion.
48355 for (int j = 0; j < 2; ++j)
48356 if (Operands[j].getValueType() != VT)
48357 if (!IsZExtLike(Operands[j]))
48358 return SDValue();
48359
48360 // The pattern is detected; emit X86ISD::AVG instruction(s).
48361 return AVGSplitter({Operands[0], Operands[1]});
48362 }
48363
48364 return SDValue();
48365}
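
The combine above hinges on a simple arithmetic identity: for unsigned i8/i16 inputs widened to a larger integer type, truncating (a + b + 1) >> 1 gives exactly the rounding average that ISD::AVGCEILU (PAVGB/PAVGW) computes, so the widen/add/shift/truncate chain can be collapsed into one node. A minimal standalone C++ sketch of that scalar identity (an editor's illustration, not part of X86ISelLowering.cpp; avg_ceil_u8 is a made-up name):

    #include <cassert>
    #include <cstdint>

    // Widen, add the rounding bias, shift right by one, truncate back to 8 bits --
    // the same shape of computation detectAVGPattern matches in the DAG.
    static uint8_t avg_ceil_u8(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((static_cast<uint16_t>(a) + b + 1) >> 1);
    }

    int main() {
      assert(avg_ceil_u8(1, 2) == 2);       // rounds up: ceil(1.5) == 2
      assert(avg_ceil_u8(255, 255) == 255); // widening prevents overflow
      return 0;
    }
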
48366
48367static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
48368 TargetLowering::DAGCombinerInfo &DCI,
48369 const X86Subtarget &Subtarget) {
48370 LoadSDNode *Ld = cast<LoadSDNode>(N);
48371 EVT RegVT = Ld->getValueType(0);
48372 EVT MemVT = Ld->getMemoryVT();
48373 SDLoc dl(Ld);
48374 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48375
48376 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
48377 // into two 16-byte operations. Also split non-temporal aligned loads on
48378 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
48379 ISD::LoadExtType Ext = Ld->getExtensionType();
48380 bool Fast;
48381 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
48382 Ext == ISD::NON_EXTLOAD &&
48383 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
48384 Ld->getAlignment() >= 16) ||
48385 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
48386 *Ld->getMemOperand(), &Fast) &&
48387 !Fast))) {
48388 unsigned NumElems = RegVT.getVectorNumElements();
48389 if (NumElems < 2)
48390 return SDValue();
48391
48392 unsigned HalfOffset = 16;
48393 SDValue Ptr1 = Ld->getBasePtr();
48394 SDValue Ptr2 =
48395 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
48396 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
48397 NumElems / 2);
48398 SDValue Load1 =
48399 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
48400 Ld->getOriginalAlign(),
48401 Ld->getMemOperand()->getFlags());
48402 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
48403 Ld->getPointerInfo().getWithOffset(HalfOffset),
48404 Ld->getOriginalAlign(),
48405 Ld->getMemOperand()->getFlags());
48406 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
48407 Load1.getValue(1), Load2.getValue(1));
48408
48409 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
48410 return DCI.CombineTo(N, NewVec, TF, true);
48411 }
48412
48413 // Bool vector load - attempt to cast to an integer, as we have good
48414 // (vXiY *ext(vXi1 bitcast(iX))) handling.
48415 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
48416 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
48417 unsigned NumElts = RegVT.getVectorNumElements();
48418 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
48419 if (TLI.isTypeLegal(IntVT)) {
48420 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
48421 Ld->getPointerInfo(),
48422 Ld->getOriginalAlign(),
48423 Ld->getMemOperand()->getFlags());
48424 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
48425 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
48426 }
48427 }
48428
48429 // If we also broadcast this as a subvector to a wider type, then just extract
48430 // the lowest subvector.
48431 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
48432 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
48433 SDValue Ptr = Ld->getBasePtr();
48434 SDValue Chain = Ld->getChain();
48435 for (SDNode *User : Ptr->uses()) {
48436 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
48437 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
48438 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
48439 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
48440 MemVT.getSizeInBits() &&
48441 !User->hasAnyUseOfValue(1) &&
48442 User->getValueSizeInBits(0).getFixedSize() >
48443 RegVT.getFixedSizeInBits()) {
48444 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
48445 RegVT.getSizeInBits());
48446 Extract = DAG.getBitcast(RegVT, Extract);
48447 return DCI.CombineTo(N, Extract, SDValue(User, 1));
48448 }
48449 }
48450 }
48451
48452 // Cast ptr32 and ptr64 pointers to the default address space before a load.
48453 unsigned AddrSpace = Ld->getAddressSpace();
48454 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
48455 AddrSpace == X86AS::PTR32_UPTR) {
48456 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
48457 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
48458 SDValue Cast =
48459 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
48460 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
48461 Ld->getOriginalAlign(),
48462 Ld->getMemOperand()->getFlags());
48463 }
48464 }
48465
48466 return SDValue();
48467}
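
At the memory level, the 32-byte load split at the top of combineLoad is just two adjacent 16-byte accesses whose results are concatenated (Load1, Load2, then CONCAT_VECTORS), with a TokenFactor joining the chains. A hedged C++ analogy of the data movement only (copy32_as_two_16 is an illustrative name, not an LLVM helper):

    #include <cstring>

    // Copy a 32-byte object as two independent 16-byte halves, mirroring the
    // Ptr1/Ptr2 + HalfVT structure built above.
    void copy32_as_two_16(void *Dst, const void *Src) {
      unsigned char Lo[16], Hi[16];
      std::memcpy(Lo, Src, 16);                                          // Load1: bytes [0,16)
      std::memcpy(Hi, static_cast<const unsigned char *>(Src) + 16, 16); // Load2: bytes [16,32)
      std::memcpy(Dst, Lo, 16);
      std::memcpy(static_cast<unsigned char *>(Dst) + 16, Hi, 16);
    }
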
48468
48469/// If V is a build vector of boolean constants and exactly one of those
48470/// constants is true, return the operand index of that true element.
48471/// Otherwise, return -1.
48472static int getOneTrueElt(SDValue V) {
48473 // This needs to be a build vector of booleans.
48474 // TODO: Checking for the i1 type matches the IR definition for the mask,
48475 // but the mask check could be loosened to i8 or other types. That might
48476 // also require checking more than 'allOnesValue'; e.g., the x86 HW
48477 // instructions only require that the MSB is set for each mask element.
48478 // The ISD::MSTORE comments/definition do not specify how the mask operand
48479 // is formatted.
48480 auto *BV = dyn_cast<BuildVectorSDNode>(V);
48481 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
48482 return -1;
48483
48484 int TrueIndex = -1;
48485 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
48486 for (unsigned i = 0; i < NumElts; ++i) {
48487 const SDValue &Op = BV->getOperand(i);
48488 if (Op.isUndef())
48489 continue;
48490 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
48491 if (!ConstNode)
48492 return -1;
48493 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
48494 // If we already found a one, this is too many.
48495 if (TrueIndex >= 0)
48496 return -1;
48497 TrueIndex = i;
48498 }
48499 }
48500 return TrueIndex;
48501}
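
getOneTrueElt is the SelectionDAG form of a scan for exactly one set element. The same logic over a plain bool array, for reference (an editor's sketch; getOneTrueIndex is not an LLVM function):

    #include <cstddef>

    // Returns the index of the single true element, or -1 if there are zero
    // or more than one true elements -- mirroring getOneTrueElt above.
    int getOneTrueIndex(const bool *Mask, std::size_t N) {
      int TrueIndex = -1;
      for (std::size_t i = 0; i < N; ++i) {
        if (!Mask[i])
          continue;
        if (TrueIndex >= 0)
          return -1; // a second true element disqualifies the pattern
        TrueIndex = static_cast<int>(i);
      }
      return TrueIndex;
    }
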
48502
48503/// Given a masked memory load/store operation, return true if it has one mask
48504/// bit set. If it has one mask bit set, then also return the memory address of
48505/// the scalar element to load/store, the vector index to insert/extract that
48506/// scalar element, and the alignment for the scalar memory access.
48507static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
48508 SelectionDAG &DAG, SDValue &Addr,
48509 SDValue &Index, Align &Alignment,
48510 unsigned &Offset) {
48511 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
48512 if (TrueMaskElt < 0)
48513 return false;
48514
48515 // Get the address of the one scalar element that is specified by the mask
48516 // using the appropriate offset from the base pointer.
48517 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
48518 Offset = 0;
48519 Addr = MaskedOp->getBasePtr();
48520 if (TrueMaskElt != 0) {
48521 Offset = TrueMaskElt * EltVT.getStoreSize();
48522 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
48523 SDLoc(MaskedOp));
48524 }
48525
48526 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
48527 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
48528 EltVT.getStoreSize());
48529 return true;
48530}
48531
48532/// If exactly one element of the mask is set for a non-extending masked load,
48533/// it is a scalar load and vector insert.
48534/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48535/// mask have already been optimized in IR, so we don't bother with those here.
48536static SDValue
48537reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48538 TargetLowering::DAGCombinerInfo &DCI,
48539 const X86Subtarget &Subtarget) {
48540 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48541 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48542 // However, some target hooks may need to be added to know when the transform
48543 // is profitable. Endianness would also have to be considered.
48544
48545 SDValue Addr, VecIndex;
48546 Align Alignment;
48547 unsigned Offset;
48548 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
48549 return SDValue();
48550
48551 // Load the one scalar element that is specified by the mask using the
48552 // appropriate offset from the base pointer.
48553 SDLoc DL(ML);
48554 EVT VT = ML->getValueType(0);
48555 EVT EltVT = VT.getVectorElementType();
48556
48557 EVT CastVT = VT;
48558 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48559 EltVT = MVT::f64;
48560 CastVT = VT.changeVectorElementType(EltVT);
48561 }
48562
48563 SDValue Load =
48564 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
48565 ML->getPointerInfo().getWithOffset(Offset),
48566 Alignment, ML->getMemOperand()->getFlags());
48567
48568 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
48569
48570 // Insert the loaded element into the appropriate place in the vector.
48571 SDValue Insert =
48572 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
48573 Insert = DAG.getBitcast(VT, Insert);
48574 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
48575}
48576
48577static SDValue
48578combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48579 TargetLowering::DAGCombinerInfo &DCI) {
48580 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48581 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
48582 return SDValue();
48583
48584 SDLoc DL(ML);
48585 EVT VT = ML->getValueType(0);
48586
48587 // If we are loading the first and last elements of a vector, it is safe and
48588 // always faster to load the whole vector. Replace the masked load with a
48589 // vector load and select.
48590 unsigned NumElts = VT.getVectorNumElements();
48591 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
48592 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
48593 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
48594 if (LoadFirstElt && LoadLastElt) {
48595 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
48596 ML->getMemOperand());
48597 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
48598 ML->getPassThru());
48599 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
48600 }
48601
48602 // Convert a masked load with a constant mask into a masked load and a select.
48603 // This allows the select operation to use a faster kind of select instruction
48604 // (for example, vblendvps -> vblendps).
48605
48606 // Don't try this if the pass-through operand is already undefined. That would
48607 // cause an infinite loop because that's what we're about to create.
48608 if (ML->getPassThru().isUndef())
48609 return SDValue();
48610
48611 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
48612 return SDValue();
48613
48614 // The new masked load has an undef pass-through operand. The select uses the
48615 // original pass-through operand.
48616 SDValue NewML = DAG.getMaskedLoad(
48617 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
48618 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
48619 ML->getAddressingMode(), ML->getExtensionType());
48620 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
48621 ML->getPassThru());
48622
48623 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
48624}
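
The constant-mask rewrite above is only taken when both the first and the last lanes are loaded, since then the full-width load touches no bytes the masked load would have skipped and cannot introduce a new fault. Setting that memory-safety point aside, the value-level semantics of "vector load + select" are a per-lane choice, sketched here in plain C++ (illustrative names, not LLVM API):

    #include <array>

    // Equivalent result of a masked load with pass-through: lane i comes from
    // memory when Mask[i] is set, otherwise from PassThru[i].
    std::array<int, 4> maskedLoadAsSelect(const std::array<int, 4> &Mem,
                                          const std::array<bool, 4> &Mask,
                                          const std::array<int, 4> &PassThru) {
      std::array<int, 4> Out;
      for (unsigned i = 0; i != 4; ++i)
        Out[i] = Mask[i] ? Mem[i] : PassThru[i]; // vector load + per-lane select
      return Out;
    }
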
48625
48626static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
48627 TargetLowering::DAGCombinerInfo &DCI,
48628 const X86Subtarget &Subtarget) {
48629 auto *Mld = cast<MaskedLoadSDNode>(N);
48630
48631 // TODO: Expanding load with constant mask may be optimized as well.
48632 if (Mld->isExpandingLoad())
48633 return SDValue();
48634
48635 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
48636 if (SDValue ScalarLoad =
48637 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
48638 return ScalarLoad;
48639
48640 // TODO: Do some AVX512 subsets benefit from this transform?
48641 if (!Subtarget.hasAVX512())
48642 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
48643 return Blend;
48644 }
48645
48646 // If the mask value has been legalized to a non-boolean vector, try to
48647 // simplify ops leading up to it. We only demand the MSB of each lane.
48648 SDValue Mask = Mld->getMask();
48649 if (Mask.getScalarValueSizeInBits() != 1) {
48650 EVT VT = Mld->getValueType(0);
48651 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48652 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48653 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48654 if (N->getOpcode() != ISD::DELETED_NODE)
48655 DCI.AddToWorklist(N);
48656 return SDValue(N, 0);
48657 }
48658 if (SDValue NewMask =
48659 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48660 return DAG.getMaskedLoad(
48661 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
48662 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
48663 Mld->getAddressingMode(), Mld->getExtensionType());
48664 }
48665
48666 return SDValue();
48667}
48668
48669/// If exactly one element of the mask is set for a non-truncating masked store,
48670/// it is a vector extract and scalar store.
48671/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48672/// mask have already been optimized in IR, so we don't bother with those here.
48673static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
48674 SelectionDAG &DAG,
48675 const X86Subtarget &Subtarget) {
48676 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48677 // However, some target hooks may need to be added to know when the transform
48678 // is profitable. Endianness would also have to be considered.
48679
48680 SDValue Addr, VecIndex;
48681 Align Alignment;
48682 unsigned Offset;
48683 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
48684 return SDValue();
48685
48686 // Extract the one scalar element that is actually being stored.
48687 SDLoc DL(MS);
48688 SDValue Value = MS->getValue();
48689 EVT VT = Value.getValueType();
48690 EVT EltVT = VT.getVectorElementType();
48691 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48692 EltVT = MVT::f64;
48693 EVT CastVT = VT.changeVectorElementType(EltVT);
48694 Value = DAG.getBitcast(CastVT, Value);
48695 }
48696 SDValue Extract =
48697 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
48698
48699 // Store that element at the appropriate offset from the base pointer.
48700 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
48701 MS->getPointerInfo().getWithOffset(Offset),
48702 Alignment, MS->getMemOperand()->getFlags());
48703}
48704
48705static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
48706 TargetLowering::DAGCombinerInfo &DCI,
48707 const X86Subtarget &Subtarget) {
48708 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
48709 if (Mst->isCompressingStore())
48710 return SDValue();
48711
48712 EVT VT = Mst->getValue().getValueType();
48713 SDLoc dl(Mst);
48714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48715
48716 if (Mst->isTruncatingStore())
48717 return SDValue();
48718
48719 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
48720 return ScalarStore;
48721
48722 // If the mask value has been legalized to a non-boolean vector, try to
48723 // simplify ops leading up to it. We only demand the MSB of each lane.
48724 SDValue Mask = Mst->getMask();
48725 if (Mask.getScalarValueSizeInBits() != 1) {
48726 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48727 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48728 if (N->getOpcode() != ISD::DELETED_NODE)
48729 DCI.AddToWorklist(N);
48730 return SDValue(N, 0);
48731 }
48732 if (SDValue NewMask =
48733 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48734 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
48735 Mst->getBasePtr(), Mst->getOffset(), NewMask,
48736 Mst->getMemoryVT(), Mst->getMemOperand(),
48737 Mst->getAddressingMode());
48738 }
48739
48740 SDValue Value = Mst->getValue();
48741 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
48742 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
48743 Mst->getMemoryVT())) {
48744 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
48745 Mst->getBasePtr(), Mst->getOffset(), Mask,
48746 Mst->getMemoryVT(), Mst->getMemOperand(),
48747 Mst->getAddressingMode(), true);
48748 }
48749
48750 return SDValue();
48751}
48752
48753static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
48754 TargetLowering::DAGCombinerInfo &DCI,
48755 const X86Subtarget &Subtarget) {
48756 StoreSDNode *St = cast<StoreSDNode>(N);
48757 EVT StVT = St->getMemoryVT();
48758 SDLoc dl(St);
48759 SDValue StoredVal = St->getValue();
48760 EVT VT = StoredVal.getValueType();
48761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48762
48763 // Convert a store of vXi1 into a store of iX and a bitcast.
48764 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
48765 VT.getVectorElementType() == MVT::i1) {
48766
48767 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48768 StoredVal = DAG.getBitcast(NewVT, StoredVal);
48769
48770 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48771 St->getPointerInfo(), St->getOriginalAlign(),
48772 St->getMemOperand()->getFlags());
48773 }
48774
48775 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
48776 // This will avoid a copy to k-register.
48777 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
48778 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
48779 StoredVal.getOperand(0).getValueType() == MVT::i8) {
48780 SDValue Val = StoredVal.getOperand(0);
48781 // We must store zeros to the unused bits.
48782 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
48783 return DAG.getStore(St->getChain(), dl, Val,
48784 St->getBasePtr(), St->getPointerInfo(),
48785 St->getOriginalAlign(),
48786 St->getMemOperand()->getFlags());
48787 }
48788
48789 // Widen v2i1/v4i1 stores to v8i1.
48790 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
48791 Subtarget.hasAVX512()) {
48792 unsigned NumConcats = 8 / VT.getVectorNumElements();
48793 // We must store zeros to the unused bits.
48794 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
48795 Ops[0] = StoredVal;
48796 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
48797 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48798 St->getPointerInfo(), St->getOriginalAlign(),
48799 St->getMemOperand()->getFlags());
48800 }
48801
48802 // Turn vXi1 stores of constants into a scalar store.
48803 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
48804 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
48805 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
48806 // If it's a v64i1 store without 64-bit support, we need two stores.
48807 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
48808 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
48809 StoredVal->ops().slice(0, 32));
48810 Lo = combinevXi1ConstantToInteger(Lo, DAG);
48811 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
48812 StoredVal->ops().slice(32, 32));
48813 Hi = combinevXi1ConstantToInteger(Hi, DAG);
48814
48815 SDValue Ptr0 = St->getBasePtr();
48816 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
48817
48818 SDValue Ch0 =
48819 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
48820 St->getOriginalAlign(),
48821 St->getMemOperand()->getFlags());
48822 SDValue Ch1 =
48823 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
48824 St->getPointerInfo().getWithOffset(4),
48825 St->getOriginalAlign(),
48826 St->getMemOperand()->getFlags());
48827 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
48828 }
48829
48830 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
48831 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48832 St->getPointerInfo(), St->getOriginalAlign(),
48833 St->getMemOperand()->getFlags());
48834 }
48835
48836 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
48837 // Sandy Bridge, perform two 16-byte stores.
48838 bool Fast;
48839 if (VT.is256BitVector() && StVT == VT &&
48840 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
48841 *St->getMemOperand(), &Fast) &&
48842 !Fast) {
48843 unsigned NumElems = VT.getVectorNumElements();
48844 if (NumElems < 2)
48845 return SDValue();
48846
48847 return splitVectorStore(St, DAG);
48848 }
48849
48850 // Split under-aligned vector non-temporal stores.
48851 if (St->isNonTemporal() && StVT == VT &&
48852 St->getAlignment() < VT.getStoreSize()) {
48853 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
48854 // vectors or the legalizer can scalarize it to use MOVNTI.
48855 if (VT.is256BitVector() || VT.is512BitVector()) {
48856 unsigned NumElems = VT.getVectorNumElements();
48857 if (NumElems < 2)
48858 return SDValue();
48859 return splitVectorStore(St, DAG);
48860 }
48861
48862 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
48863 // to use MOVNTI.
48864 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
48865 MVT NTVT = Subtarget.hasSSE4A()
48866 ? MVT::v2f64
48867 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
48868 return scalarizeVectorStore(St, NTVT, DAG);
48869 }
48870 }
48871
48872 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
48873 // supported, but avx512f is, by extending to v16i32 and truncating.
48874 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
48875 St->getValue().getOpcode() == ISD::TRUNCATE &&
48876 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
48877 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
48878 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
48879 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
48880 St->getValue().getOperand(0));
48881 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
48882 MVT::v16i8, St->getMemOperand());
48883 }
48884
48885 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
48886 if (!St->isTruncatingStore() &&
48887 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
48888 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
48889 StoredVal.hasOneUse() &&
48890 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
48891 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
48892 return EmitTruncSStore(IsSigned, St->getChain(),
48893 dl, StoredVal.getOperand(0), St->getBasePtr(),
48894 VT, St->getMemOperand(), DAG);
48895 }
48896
48897 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
48898 if (!St->isTruncatingStore()) {
48899 auto IsExtractedElement = [](SDValue V) {
48900 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
48901 V = V.getOperand(0);
48902 unsigned Opc = V.getOpcode();
48903 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
48904 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
48905 V.getOperand(0).hasOneUse())
48906 return V.getOperand(0);
48907 return SDValue();
48908 };
48909 if (SDValue Extract = IsExtractedElement(StoredVal)) {
48910 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
48911 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
48912 SDValue Src = Trunc.getOperand(0);
48913 MVT DstVT = Trunc.getSimpleValueType();
48914 MVT SrcVT = Src.getSimpleValueType();
48915 unsigned NumSrcElts = SrcVT.getVectorNumElements();
48916 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
48917 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
48918 if (NumTruncBits == VT.getSizeInBits() &&
48919 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
48920 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
48921 TruncVT, St->getMemOperand());
48922 }
48923 }
48924 }
48925 }
48926
48927 // Optimize trunc store (of multiple scalars) to shuffle and store.
48928 // First, pack all of the elements in one place. Next, store to memory
48929 // in fewer chunks.
48930 if (St->isTruncatingStore() && VT.isVector()) {
48931 // Check if we can detect an AVG pattern from the truncation. If yes,
48932 // replace the trunc store by a normal store with the result of X86ISD::AVG
48933 // instruction.
48934 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
48935 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
48936 Subtarget, dl))
48937 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
48938 St->getPointerInfo(), St->getOriginalAlign(),
48939 St->getMemOperand()->getFlags());
48940
48941 if (TLI.isTruncStoreLegal(VT, StVT)) {
48942 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
48943 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
48944 dl, Val, St->getBasePtr(),
48945 St->getMemoryVT(), St->getMemOperand(), DAG);
48946 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
48947 DAG, dl))
48948 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
48949 dl, Val, St->getBasePtr(),
48950 St->getMemoryVT(), St->getMemOperand(), DAG);
48951 }
48952
48953 return SDValue();
48954 }
48955
48956 // Cast ptr32 and ptr64 pointers to the default address space before a store.
48957 unsigned AddrSpace = St->getAddressSpace();
48958 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
48959 AddrSpace == X86AS::PTR32_UPTR) {
48960 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
48961 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
48962 SDValue Cast =
48963 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
48964 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
48965 St->getPointerInfo(), St->getOriginalAlign(),
48966 St->getMemOperand()->getFlags(), St->getAAInfo());
48967 }
48968 }
48969
48970 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
48971 // the FP state in cases where an emms may be missing.
48972 // A preferable solution to the general problem is to figure out the right
48973 // places to insert EMMS. This qualifies as a quick hack.
48974
48975 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
48976 if (VT.getSizeInBits() != 64)
48977 return SDValue();
48978
48979 const Function &F = DAG.getMachineFunction().getFunction();
48980 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
48981 bool F64IsLegal =
48982 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
48983 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
48984 isa<LoadSDNode>(St->getValue()) &&
48985 cast<LoadSDNode>(St->getValue())->isSimple() &&
48986 St->getChain().hasOneUse() && St->isSimple()) {
48987 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
48988
48989 if (!ISD::isNormalLoad(Ld))
48990 return SDValue();
48991
48992 // Avoid the transformation if there are multiple uses of the loaded value.
48993 if (!Ld->hasNUsesOfValue(1, 0))
48994 return SDValue();
48995
48996 SDLoc LdDL(Ld);
48997 SDLoc StDL(N);
48998 // Lower to a single movq load/store pair.
48999 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
49000 Ld->getBasePtr(), Ld->getMemOperand());
49001
49002 // Make sure new load is placed in same chain order.
49003 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
49004 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
49005 St->getMemOperand());
49006 }
49007
49008 // This is similar to the above case, but here we handle a scalar 64-bit
49009 // integer store that is extracted from a vector on a 32-bit target.
49010 // If we have SSE2, then we can treat it like a floating-point double
49011 // to get past legalization. The execution dependencies fixup pass will
49012 // choose the optimal machine instruction for the store if this really is
49013 // an integer or v2f32 rather than an f64.
49014 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
49015 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
49016 SDValue OldExtract = St->getOperand(1);
49017 SDValue ExtOp0 = OldExtract.getOperand(0);
49018 unsigned VecSize = ExtOp0.getValueSizeInBits();
49019 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
49020 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
49021 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
49022 BitCast, OldExtract.getOperand(1));
49023 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
49024 St->getPointerInfo(), St->getOriginalAlign(),
49025 St->getMemOperand()->getFlags());
49026 }
49027
49028 return SDValue();
49029}
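
One of the rewrites above turns a store of a constant vXi1 vector into a scalar integer store via combinevXi1ConstantToInteger. The packing it relies on is simply bit i of the integer holding element i of the vector, as in this small hedged sketch (packBoolsToByte is an illustrative name, not an LLVM helper):

    #include <array>
    #include <cstdint>

    // Pack eight booleans into one byte: element i -> bit i. Storing this byte
    // is equivalent to storing the v8i1 value.
    uint8_t packBoolsToByte(const std::array<bool, 8> &V) {
      uint8_t Bits = 0;
      for (unsigned i = 0; i != 8; ++i)
        Bits |= static_cast<uint8_t>(V[i]) << i;
      return Bits;
    }
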
49030
49031static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
49032 TargetLowering::DAGCombinerInfo &DCI,
49033 const X86Subtarget &Subtarget) {
49034 auto *St = cast<MemIntrinsicSDNode>(N);
49035
49036 SDValue StoredVal = N->getOperand(1);
49037 MVT VT = StoredVal.getSimpleValueType();
49038 EVT MemVT = St->getMemoryVT();
49039
49040 // Figure out which elements we demand.
49041 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
49042 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
49043
49044 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49045 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
49046 if (N->getOpcode() != ISD::DELETED_NODE)
49047 DCI.AddToWorklist(N);
49048 return SDValue(N, 0);
49049 }
49050
49051 return SDValue();
49052}
49053
49054/// Return 'true' if this vector operation is "horizontal"
49055/// and return the operands for the horizontal operation in LHS and RHS. A
49056/// horizontal operation performs the binary operation on successive elements
49057/// of its first operand, then on successive elements of its second operand,
49058/// returning the resulting values in a vector. For example, if
49059/// A = < float a0, float a1, float a2, float a3 >
49060/// and
49061/// B = < float b0, float b1, float b2, float b3 >
49062/// then the result of doing a horizontal operation on A and B is
49063/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
49064/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
49065/// A horizontal-op B, for some already available A and B, and if so then LHS is
49066/// set to A, RHS to B, and the routine returns 'true'.
49067static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
49068 SelectionDAG &DAG, const X86Subtarget &Subtarget,
49069 bool IsCommutative,
49070 SmallVectorImpl<int> &PostShuffleMask) {
49071 // If either operand is undef, bail out. The binop should be simplified.
49072 if (LHS.isUndef() || RHS.isUndef())
49073 return false;
49074
49075 // Look for the following pattern:
49076 // A = < float a0, float a1, float a2, float a3 >
49077 // B = < float b0, float b1, float b2, float b3 >
49078 // and
49079 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
49080 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
49081 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
49082 // which is A horizontal-op B.
49083
49084 MVT VT = LHS.getSimpleValueType();
49085 assert((VT.is128BitVector() || VT.is256BitVector()) &&
49086        "Unsupported vector type for horizontal add/sub");
49087 unsigned NumElts = VT.getVectorNumElements();
49088
49089 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
49090 SmallVectorImpl<int> &ShuffleMask) {
49091 bool UseSubVector = false;
49092 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49093 Op.getOperand(0).getValueType().is256BitVector() &&
49094 llvm::isNullConstant(Op.getOperand(1))) {
49095 Op = Op.getOperand(0);
49096 UseSubVector = true;
49097 }
49098 SmallVector<SDValue, 2> SrcOps;
49099 SmallVector<int, 16> SrcMask, ScaledMask;
49100 SDValue BC = peekThroughBitcasts(Op);
49101 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
49102 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
49103 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
49104 })) {
49105 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
49106 if (!UseSubVector && SrcOps.size() <= 2 &&
49107 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
49108 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
49109 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
49110 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
49111 }
49112 if (UseSubVector && SrcOps.size() == 1 &&
49113 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
49114 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
49115 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
49116 ShuffleMask.assign(Mask.begin(), Mask.end());
49117 }
49118 }
49119 };
49120
49121 // View LHS in the form
49122 // LHS = VECTOR_SHUFFLE A, B, LMask
49123 // If LHS is not a shuffle, then pretend it is the identity shuffle:
49124 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
49125 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
49126 SDValue A, B;
49127 SmallVector<int, 16> LMask;
49128 GetShuffle(LHS, A, B, LMask);
49129
49130 // Likewise, view RHS in the form
49131 // RHS = VECTOR_SHUFFLE C, D, RMask
49132 SDValue C, D;
49133 SmallVector<int, 16> RMask;
49134 GetShuffle(RHS, C, D, RMask);
49135
49136 // At least one of the operands should be a vector shuffle.
49137 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
49138 if (NumShuffles == 0)
49139 return false;
49140
49141 if (LMask.empty()) {
49142 A = LHS;
49143 for (unsigned i = 0; i != NumElts; ++i)
49144 LMask.push_back(i);
49145 }
49146
49147 if (RMask.empty()) {
49148 C = RHS;
49149 for (unsigned i = 0; i != NumElts; ++i)
49150 RMask.push_back(i);
49151 }
49152
49153 // If we have a unary mask, ensure the other op is set to null.
49154 if (isUndefOrInRange(LMask, 0, NumElts))
49155 B = SDValue();
49156 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
49157 A = SDValue();
49158
49159 if (isUndefOrInRange(RMask, 0, NumElts))
49160 D = SDValue();
49161 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
49162 C = SDValue();
49163
49164 // If A and B occur in reverse order in RHS, then canonicalize by commuting
49165 // RHS operands and shuffle mask.
49166 if (A != C) {
49167 std::swap(C, D);
49168 ShuffleVectorSDNode::commuteMask(RMask);
49169 }
49170 // Check that the shuffles are both shuffling the same vectors.
49171 if (!(A == C && B == D))
49172 return false;
49173
49174 PostShuffleMask.clear();
49175 PostShuffleMask.append(NumElts, SM_SentinelUndef);
49176
49177 // LHS and RHS are now:
49178 // LHS = shuffle A, B, LMask
49179 // RHS = shuffle A, B, RMask
49180 // Check that the masks correspond to performing a horizontal operation.
49181 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
49182 // so we just repeat the inner loop if this is a 256-bit op.
49183 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
49184 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
49185 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
49186 assert((NumEltsPer128BitChunk % 2 == 0) &&
49187        "Vector type should have an even number of elements in each lane");
49188 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
49189 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
49190 // Ignore undefined components.
49191 int LIdx = LMask[i + j], RIdx = RMask[i + j];
49192 if (LIdx < 0 || RIdx < 0 ||
49193 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
49194 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
49195 continue;
49196
49197 // Check that successive odd/even elements are being operated on. If not,
49198 // this is not a horizontal operation.
49199 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
49200 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
49201 return false;
49202
49203 // Compute the post-shuffle mask index based on where the element
49204 // is stored in the HOP result, and where it needs to be moved to.
49205 int Base = LIdx & ~1u;
49206 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
49207 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
49208
49209 // The low half of the 128-bit result must choose from A.
49210 // The high half of the 128-bit result must choose from B,
49211 // unless B is undef. In that case, we are always choosing from A.
49212 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
49213 Index += NumEltsPer64BitChunk;
49214 PostShuffleMask[i + j] = Index;
49215 }
49216 }
49217
49218 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
49219 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
49220
49221 bool IsIdentityPostShuffle =
49222 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
49223 if (IsIdentityPostShuffle)
49224 PostShuffleMask.clear();
49225
49226 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
49227 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
49228 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
49229 return false;
49230
49231 // If the source nodes are already used in HorizOps then always accept this.
49232 // Shuffle folding should merge these back together.
49233 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
49234 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
49235 });
49236 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
49237 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
49238 });
49239 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
49240
49241 // Assume a SingleSource HOP if we only shuffle one input and don't need to
49242 // shuffle the result.
49243 if (!ForceHorizOp &&
49244 !shouldUseHorizontalOp(NewLHS == NewRHS &&
49245 (NumShuffles < 2 || !IsIdentityPostShuffle),
49246 DAG, Subtarget))
49247 return false;
49248
49249 LHS = DAG.getBitcast(VT, NewLHS);
49250 RHS = DAG.getBitcast(VT, NewRHS);
49251 return true;
49252}
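
As the comment before isHorizontalBinOp describes, a horizontal op combines successive element pairs of each operand. For a concrete reference, the 4 x f32 add case (the result layout HADDPS produces) can be written out directly; this is an editor's illustration in plain C++, not LLVM code:

    #include <array>

    // A horizontal-add B = < a0+a1, a2+a3, b0+b1, b2+b3 >
    std::array<float, 4> haddps(const std::array<float, 4> &A,
                                const std::array<float, 4> &B) {
      return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
    }
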
49253
49254// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
49255static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
49256 const X86Subtarget &Subtarget) {
49257 EVT VT = N->getValueType(0);
49258 unsigned Opcode = N->getOpcode();
49259 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
49260 SmallVector<int, 8> PostShuffleMask;
49261
49262 switch (Opcode) {
49263 case ISD::FADD:
49264 case ISD::FSUB:
49265 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
49266 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
49267 SDValue LHS = N->getOperand(0);
49268 SDValue RHS = N->getOperand(1);
49269 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
49270 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
49271 PostShuffleMask)) {
49272 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
49273 if (!PostShuffleMask.empty())
49274 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
49275 DAG.getUNDEF(VT), PostShuffleMask);
49276 return HorizBinOp;
49277 }
49278 }
49279 break;
49280 case ISD::ADD:
49281 case ISD::SUB:
49282 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
49283 VT == MVT::v16i16 || VT == MVT::v8i32)) {
49284 SDValue LHS = N->getOperand(0);
49285 SDValue RHS = N->getOperand(1);
49286 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
49287 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
49288 PostShuffleMask)) {
49289 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
49290 ArrayRef<SDValue> Ops) {
49291 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
49292 };
49293 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
49294 {LHS, RHS}, HOpBuilder);
49295 if (!PostShuffleMask.empty())
49296 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
49297 DAG.getUNDEF(VT), PostShuffleMask);
49298 return HorizBinOp;
49299 }
49300 }
49301 break;
49302 }
49303
49304 return SDValue();
49305}
49306
49307// Try to combine the following nodes
49308// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
49309// <i32 -2147483648[float -0.000000e+00]> 0
49310// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
49311// <(load 4 from constant-pool)> t0, t29
49312// [t30: v16i32 = bitcast t27]
49313// t6: v16i32 = xor t7, t27[t30]
49314// t11: v16f32 = bitcast t6
49315// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
49316// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
49317// t22: v16f32 = bitcast t7
49318// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
49319// t24: v32f16 = bitcast t23
49320static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
49321 const X86Subtarget &Subtarget) {
49322 EVT VT = N->getValueType(0);
49323 SDValue LHS = N->getOperand(0);
49324 SDValue RHS = N->getOperand(1);
49325 int CombineOpcode =
49326 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
49327 auto isConjugationConstant = [](const Constant *c) {
49328 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
49329 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
49330 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
49331 switch (CI->getBitWidth()) {
49332 case 16:
49333 return false;
49334 case 32:
49335 return CI->getValue() == ConjugationInt32;
49336 case 64:
49337 return CI->getValue() == ConjugationInt64;
49338 default:
49339 llvm_unreachable("Unexpected bit width");
49340 }
49341 }
49342 if (const auto *CF = dyn_cast<ConstantFP>(c))
49343 return CF->isNegativeZeroValue();
49344 return false;
49345 };
49346 auto combineConjugation = [&](SDValue &r) {
49347 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
49348 SDValue XOR = LHS.getOperand(0);
49349 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
49350 SDValue XORRHS = XOR.getOperand(1);
49351 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
49352 XORRHS = XORRHS.getOperand(0);
49353 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
49354 XORRHS.getOperand(1).getNumOperands()) {
49355 ConstantPoolSDNode *CP =
49356 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
49357 if (CP && isConjugationConstant(CP->getConstVal())) {
49358 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
49359 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
49360 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
49361 r = DAG.getBitcast(VT, FCMulC);
49362 return true;
49363 }
49364 }
49365 }
49366 }
49367 return false;
49368 };
49369 SDValue Res;
49370 if (combineConjugation(Res))
49371 return Res;
49372 std::swap(LHS, RHS);
49373 if (combineConjugation(Res))
49374 return Res;
49375 return Res;
49376}
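
The conjugation constants matched by isConjugationConstant (0x80000000 per 32-bit lane, 0x8000000080000000 per 64-bit lane) work because XOR-ing the IEEE sign bit negates a floating-point value, and negating the imaginary component of a lane pair is exactly complex conjugation. A minimal standalone demonstration of the sign-bit flip (an editor's sketch on a single f32, not LLVM code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Flip the sign of a float by XOR-ing its sign bit -- the per-lane effect of
    // the XOR-with-constant node folded away by combineFMulcFCMulc.
    float flip_sign_via_xor(float X) {
      uint32_t Bits;
      std::memcpy(&Bits, &X, sizeof(Bits));
      Bits ^= 0x80000000u;
      std::memcpy(&X, &Bits, sizeof(Bits));
      return X;
    }

    int main() {
      assert(flip_sign_via_xor(1.5f) == -1.5f);
      return 0;
    }
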
49377
49378// Try to combine the following nodes:
49379// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
49380static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
49381 const X86Subtarget &Subtarget) {
49382 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
49383 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
49384 Flags.hasAllowContract();
49385 };
49386
49387 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
49388 return DAG.getTarget().Options.NoSignedZerosFPMath ||
49389 Flags.hasNoSignedZeros();
49390 };
49391 auto IsVectorAllNegativeZero = [](const SDNode *N) {
49392 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
49393 return false;
49394 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
49395        "Unexpected vector type!");
49396 if (ConstantPoolSDNode *CP =
49397 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
49398 APInt AI = APInt(32, 0x80008000, true);
49399 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
49400 return CI->getValue() == AI;
49401 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
49402 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
49403 }
49404 return false;
49405 };
49406
49407 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
49408 !AllowContract(N->getFlags()))
49409 return SDValue();
49410
49411 EVT VT = N->getValueType(0);
49412 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
49413 return SDValue();
49414
49415 SDValue LHS = N->getOperand(0);
49416 SDValue RHS = N->getOperand(1);
49417 bool IsConj;
49418 SDValue FAddOp1, MulOp0, MulOp1;
49419 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
49420 &IsVectorAllNegativeZero,
49421 &HasNoSignedZero](SDValue N) -> bool {
49422 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
49423 return false;
49424 SDValue Op0 = N.getOperand(0);
49425 unsigned Opcode = Op0.getOpcode();
49426 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
49427 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
49428 MulOp0 = Op0.getOperand(0);
49429 MulOp1 = Op0.getOperand(1);
49430 IsConj = Opcode == X86ISD::VFCMULC;
49431 return true;
49432 }
49433 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
49434 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
49435 HasNoSignedZero(Op0->getFlags())) ||
49436 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
49437 MulOp0 = Op0.getOperand(0);
49438 MulOp1 = Op0.getOperand(1);
49439 IsConj = Opcode == X86ISD::VFCMADDC;
49440 return true;
49441 }
49442 }
49443 return false;
49444 };
49445
49446 if (GetCFmulFrom(LHS))
49447 FAddOp1 = RHS;
49448 else if (GetCFmulFrom(RHS))
49449 FAddOp1 = LHS;
49450 else
49451 return SDValue();
49452
49453 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
49454 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
49455 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
49456 // FIXME: How do we handle when fast math flags of FADD are different from
49457 // CFMUL's?
49458 SDValue CFmul =
49459 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
49460 return DAG.getBitcast(VT, CFmul);
49461}
49462
49463/// Do target-specific dag combines on floating-point adds/subs.
49464static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
49465 const X86Subtarget &Subtarget) {
49466 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
49467 return HOp;
49468
49469 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
49470 return COp;
49471
49472 return SDValue();
49473}
49474
49475/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
49476/// the codegen.
49477/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
49478/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
49479/// anything that is guaranteed to be transformed by DAGCombiner.
49480static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
49481 const X86Subtarget &Subtarget,
49482 const SDLoc &DL) {
49483 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
49484 SDValue Src = N->getOperand(0);
49485 unsigned SrcOpcode = Src.getOpcode();
49486 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49487
49488 EVT VT = N->getValueType(0);
49489 EVT SrcVT = Src.getValueType();
49490
49491 auto IsFreeTruncation = [VT](SDValue Op) {
49492 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
49493
49494 // See if this has been extended from a smaller/equal size to
49495 // the truncation size, allowing a truncation to combine with the extend.
49496 unsigned Opcode = Op.getOpcode();
49497 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
49498 Opcode == ISD::ZERO_EXTEND) &&
49499 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
49500 return true;
49501
49502 // See if this is a single use constant which can be constant folded.
49503 // NOTE: We don't peek through bitcasts here because there is currently
49504 // no support for constant folding truncate+bitcast+vector_of_constants. So
49505 // we'd just end up with a truncate on both operands which will
49506 // get turned back into (truncate (binop)), causing an infinite loop.
49507 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
49508 };
49509
49510 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
49511 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
49512 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
49513 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
49514 };
49515
49516 // Don't combine if the operation has other uses.
49517 if (!Src.hasOneUse())
49518 return SDValue();
49519
49520 // Only support vector truncation for now.
49521 // TODO: i64 scalar math would benefit as well.
49522 if (!VT.isVector())
49523 return SDValue();
49524
49525   // In most cases it's only worth pre-truncating if we're only facing the cost
49526 // of one truncation.
49527 // i.e. if one of the inputs will constant fold or the input is repeated.
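    // For instance, trunc(mul(x, splat(C))) should only need one real truncate,
    // as the truncate of the constant splat operand can be constant folded away.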
49528 switch (SrcOpcode) {
49529 case ISD::MUL:
49530     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
49531 // better to truncate if we have the chance.
49532 if (SrcVT.getScalarType() == MVT::i64 &&
49533 TLI.isOperationLegal(SrcOpcode, VT) &&
49534 !TLI.isOperationLegal(SrcOpcode, SrcVT))
49535 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
49536     LLVM_FALLTHROUGH;
49537 case ISD::AND:
49538 case ISD::XOR:
49539 case ISD::OR:
49540 case ISD::ADD:
49541 case ISD::SUB: {
49542 SDValue Op0 = Src.getOperand(0);
49543 SDValue Op1 = Src.getOperand(1);
49544 if (TLI.isOperationLegal(SrcOpcode, VT) &&
49545 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
49546 return TruncateArithmetic(Op0, Op1);
49547 break;
49548 }
49549 }
49550
49551 return SDValue();
49552}
49553
49554/// Truncate using ISD::AND mask and X86ISD::PACKUS.
49555/// e.g. trunc <8 x i32> X to <8 x i16> -->
49556/// MaskX = X & 0xffff (clear high bits to prevent saturation)
49557/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
49558static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
49559 const X86Subtarget &Subtarget,
49560 SelectionDAG &DAG) {
49561 SDValue In = N->getOperand(0);
49562 EVT InVT = In.getValueType();
49563 EVT OutVT = N->getValueType(0);
49564
49565 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
49566 OutVT.getScalarSizeInBits());
49567 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
49568 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
49569}
49570
49571/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
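/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// SExtX = sign_extend_inreg X, i16 (replicate the i16 sign bit so PACKSS
///         does not saturate)
/// packss (extract_subv SExtX, 0), (extract_subv SExtX, 1)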
49572static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
49573 const X86Subtarget &Subtarget,
49574 SelectionDAG &DAG) {
49575 SDValue In = N->getOperand(0);
49576 EVT InVT = In.getValueType();
49577 EVT OutVT = N->getValueType(0);
49578 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
49579 DAG.getValueType(OutVT));
49580 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
49581}
49582
49583/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
49584/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
49585/// legalization the truncation will be translated into a BUILD_VECTOR with each
49586/// element that is extracted from a vector and then truncated, and it is
49587/// difficult to do this optimization based on them.
49588static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
49589 const X86Subtarget &Subtarget) {
49590 EVT OutVT = N->getValueType(0);
49591 if (!OutVT.isVector())
49592 return SDValue();
49593
49594 SDValue In = N->getOperand(0);
49595 if (!In.getValueType().isSimple())
49596 return SDValue();
49597
49598 EVT InVT = In.getValueType();
49599 unsigned NumElems = OutVT.getVectorNumElements();
49600
49601 // AVX512 provides fast truncate ops.
49602 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
49603 return SDValue();
49604
49605 EVT OutSVT = OutVT.getVectorElementType();
49606 EVT InSVT = InVT.getVectorElementType();
49607 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
49608 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
49609 NumElems >= 8))
49610 return SDValue();
49611
49612   // SSSE3's pshufb results in fewer instructions in the cases below.
49613 if (Subtarget.hasSSSE3() && NumElems == 8) {
49614 if (InSVT == MVT::i16)
49615 return SDValue();
49616 if (InSVT == MVT::i32 &&
49617 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
49618 return SDValue();
49619 }
49620
49621 SDLoc DL(N);
49622 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
49623 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
49624 // truncate 2 x v4i32 to v8i16.
49625 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
49626 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
49627 if (InSVT == MVT::i32)
49628 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
49629
49630 return SDValue();
49631}
49632
49633/// This function transforms vector truncation of 'extended sign-bits' or
49634/// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
49635/// into X86ISD::PACKSS/PACKUS operations.
49636static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
49637 SelectionDAG &DAG,
49638 const X86Subtarget &Subtarget) {
49639 // Requires SSE2.
49640 if (!Subtarget.hasSSE2())
49641 return SDValue();
49642
49643 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
49644 return SDValue();
49645
49646 SDValue In = N->getOperand(0);
49647 if (!In.getValueType().isSimple())
49648 return SDValue();
49649
49650 MVT VT = N->getValueType(0).getSimpleVT();
49651 MVT SVT = VT.getScalarType();
49652
49653 MVT InVT = In.getValueType().getSimpleVT();
49654 MVT InSVT = InVT.getScalarType();
49655
49656 // Check we have a truncation suited for PACKSS/PACKUS.
49657 if (!isPowerOf2_32(VT.getVectorNumElements()))
49658 return SDValue();
49659 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
49660 return SDValue();
49661 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
49662 return SDValue();
49663
49664 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
49665 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
49666 return SDValue();
49667
49668 // AVX512 has fast truncate, but if the input is already going to be split,
49669 // there's no harm in trying pack.
49670 if (Subtarget.hasAVX512() &&
49671 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
49672 InVT.is512BitVector())) {
49673 // PACK should still be worth it for 128-bit vectors if the sources were
49674 // originally concatenated from subvectors.
49675 SmallVector<SDValue> ConcatOps;
49676 if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
49677 return SDValue();
49678 }
49679
49680 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
49681 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
49682
49683 // Use PACKUS if the input has zero-bits that extend all the way to the
49684 // packed/truncated value. e.g. masks, zext_in_reg, etc.
49685 KnownBits Known = DAG.computeKnownBits(In);
49686 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
49687 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
49688 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
49689
49690 // Use PACKSS if the input has sign-bits that extend all the way to the
49691 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
49692 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
49693
49694 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
49695 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
49696 // on and combines/simplifications can't then use it.
49697 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
49698 return SDValue();
49699
49700 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
49701 if (NumSignBits > MinSignBits)
49702 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
49703
49704 // If we have a srl that only generates signbits that we will discard in
49705 // the truncation then we can use PACKSS by converting the srl to a sra.
49706 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
49707 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
49708 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
49709 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
49710 if (*ShAmt == MinSignBits) {
49711 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
49712 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
49713 Subtarget);
49714 }
49715 }
49716
49717 return SDValue();
49718}
49719
49720// Try to form a MULHU or MULHS node by looking for
49721// (trunc (srl (mul ext, ext), 16))
49722// TODO: This is X86 specific because we want to be able to handle wide types
49723// before type legalization. But we can only do it if the vector will be
49724// legalized via widening/splitting. Type legalization can't handle promotion
49725// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49726// combiner.
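//
// A sketch of the matched pattern, assuming hypothetical inputs A,B : v8i16:
//   (v8i16 (trunc (srl (mul (zext A to v8i32), (zext B to v8i32)), 16)))
//     --> (v8i16 (mulhu A, B))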
49727static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
49728 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
49729 // First instruction should be a right shift of a multiply.
49730 if (Src.getOpcode() != ISD::SRL ||
49731 Src.getOperand(0).getOpcode() != ISD::MUL)
49732 return SDValue();
49733
49734 if (!Subtarget.hasSSE2())
49735 return SDValue();
49736
49737 // Only handle vXi16 types that are at least 128-bits unless they will be
49738 // widened.
49739 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
49740 return SDValue();
49741
49742 // Input type should be at least vXi32.
49743 EVT InVT = Src.getValueType();
49744 if (InVT.getVectorElementType().getSizeInBits() < 32)
49745 return SDValue();
49746
49747 // Need a shift by 16.
49748 APInt ShiftAmt;
49749 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
49750 ShiftAmt != 16)
49751 return SDValue();
49752
49753 SDValue LHS = Src.getOperand(0).getOperand(0);
49754 SDValue RHS = Src.getOperand(0).getOperand(1);
49755
49756 // Count leading sign/zero bits on both inputs - if there are enough then
49757 // truncation back to vXi16 will be cheap - either as a pack/shuffle
49758 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
49759 // truncations may actually be free by peeking through to the ext source.
49760 auto IsSext = [&DAG](SDValue V) {
49761 return DAG.ComputeMaxSignificantBits(V) <= 16;
49762 };
49763 auto IsZext = [&DAG](SDValue V) {
49764 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
49765 };
49766
49767 bool IsSigned = IsSext(LHS) && IsSext(RHS);
49768 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
49769 if (!IsSigned && !IsUnsigned)
49770 return SDValue();
49771
49772 // Check if both inputs are extensions, which will be removed by truncation.
49773 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
49774 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
49775 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
49776 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
49777 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
49778 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
49779
49780 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
49781 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
49782 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
49783 // will have to split anyway.
49784 unsigned InSizeInBits = InVT.getSizeInBits();
49785 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
49786 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
49787 (InSizeInBits % 16) == 0) {
49788 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49789 InVT.getSizeInBits() / 16);
49790 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
49791 DAG.getBitcast(BCVT, RHS));
49792 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
49793 }
49794
49795 // Truncate back to source type.
49796 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
49797 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
49798
49799 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
49800 return DAG.getNode(Opc, DL, VT, LHS, RHS);
49801}
49802
49803// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
49804// from one vector with signed bytes from another vector, adds together
49805// adjacent pairs of 16-bit products, and saturates the result before
49806// truncating to 16-bits.
49807//
49808// Which looks something like this:
49809// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
49810// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
49811static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
49812 const X86Subtarget &Subtarget,
49813 const SDLoc &DL) {
49814 if (!VT.isVector() || !Subtarget.hasSSSE3())
49815 return SDValue();
49816
49817 unsigned NumElems = VT.getVectorNumElements();
49818 EVT ScalarVT = VT.getVectorElementType();
49819 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
49820 return SDValue();
49821
49822 SDValue SSatVal = detectSSatPattern(In, VT);
49823 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
49824 return SDValue();
49825
49826 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
49827 // of multiplies from even/odd elements.
49828 SDValue N0 = SSatVal.getOperand(0);
49829 SDValue N1 = SSatVal.getOperand(1);
49830
49831 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49832 return SDValue();
49833
49834 SDValue N00 = N0.getOperand(0);
49835 SDValue N01 = N0.getOperand(1);
49836 SDValue N10 = N1.getOperand(0);
49837 SDValue N11 = N1.getOperand(1);
49838
49839 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
49840 // Canonicalize zero_extend to LHS.
49841 if (N01.getOpcode() == ISD::ZERO_EXTEND)
49842 std::swap(N00, N01);
49843 if (N11.getOpcode() == ISD::ZERO_EXTEND)
49844 std::swap(N10, N11);
49845
49846 // Ensure we have a zero_extend and a sign_extend.
49847 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
49848 N01.getOpcode() != ISD::SIGN_EXTEND ||
49849 N10.getOpcode() != ISD::ZERO_EXTEND ||
49850 N11.getOpcode() != ISD::SIGN_EXTEND)
49851 return SDValue();
49852
49853 // Peek through the extends.
49854 N00 = N00.getOperand(0);
49855 N01 = N01.getOperand(0);
49856 N10 = N10.getOperand(0);
49857 N11 = N11.getOperand(0);
49858
49859 // Ensure the extend is from vXi8.
49860 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
49861 N01.getValueType().getVectorElementType() != MVT::i8 ||
49862 N10.getValueType().getVectorElementType() != MVT::i8 ||
49863 N11.getValueType().getVectorElementType() != MVT::i8)
49864 return SDValue();
49865
49866 // All inputs should be build_vectors.
49867 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49868 N01.getOpcode() != ISD::BUILD_VECTOR ||
49869 N10.getOpcode() != ISD::BUILD_VECTOR ||
49870 N11.getOpcode() != ISD::BUILD_VECTOR)
49871 return SDValue();
49872
49873 // N00/N10 are zero extended. N01/N11 are sign extended.
49874
49875 // For each element, we need to ensure we have an odd element from one vector
49876 // multiplied by the odd element of another vector and the even element from
49877 // one of the same vectors being multiplied by the even element from the
49878 // other vector. So we need to make sure for each element i, this operator
49879 // is being performed:
49880 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49881 SDValue ZExtIn, SExtIn;
49882 for (unsigned i = 0; i != NumElems; ++i) {
49883 SDValue N00Elt = N00.getOperand(i);
49884 SDValue N01Elt = N01.getOperand(i);
49885 SDValue N10Elt = N10.getOperand(i);
49886 SDValue N11Elt = N11.getOperand(i);
49887 // TODO: Be more tolerant to undefs.
49888 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49889 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49890 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49891 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49892 return SDValue();
49893 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49894 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49895 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49896 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49897 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49898 return SDValue();
49899 unsigned IdxN00 = ConstN00Elt->getZExtValue();
49900 unsigned IdxN01 = ConstN01Elt->getZExtValue();
49901 unsigned IdxN10 = ConstN10Elt->getZExtValue();
49902 unsigned IdxN11 = ConstN11Elt->getZExtValue();
49903 // Add is commutative so indices can be reordered.
49904 if (IdxN00 > IdxN10) {
49905 std::swap(IdxN00, IdxN10);
49906 std::swap(IdxN01, IdxN11);
49907 }
49908     // N0 indices must be the even element. N1 indices must be the next odd element.
49909 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49910 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49911 return SDValue();
49912 SDValue N00In = N00Elt.getOperand(0);
49913 SDValue N01In = N01Elt.getOperand(0);
49914 SDValue N10In = N10Elt.getOperand(0);
49915 SDValue N11In = N11Elt.getOperand(0);
49916     // The first time we find an input, capture it.
49917 if (!ZExtIn) {
49918 ZExtIn = N00In;
49919 SExtIn = N01In;
49920 }
49921 if (ZExtIn != N00In || SExtIn != N01In ||
49922 ZExtIn != N10In || SExtIn != N11In)
49923 return SDValue();
49924 }
49925
49926 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49927 ArrayRef<SDValue> Ops) {
49928 // Shrink by adding truncate nodes and let DAGCombine fold with the
49929 // sources.
49930 EVT InVT = Ops[0].getValueType();
49931     assert(InVT.getScalarType() == MVT::i8 &&
49932            "Unexpected scalar element type");
49933     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49934 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49935 InVT.getVectorNumElements() / 2);
49936 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
49937 };
49938 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
49939 PMADDBuilder);
49940}
49941
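/// Do target-specific dag combines on ISD::TRUNCATE nodes.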
49942static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
49943 const X86Subtarget &Subtarget) {
49944 EVT VT = N->getValueType(0);
49945 SDValue Src = N->getOperand(0);
49946 SDLoc DL(N);
49947
49948 // Attempt to pre-truncate inputs to arithmetic ops instead.
49949 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
49950 return V;
49951
49952 // Try to detect AVG pattern first.
49953 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
49954 return Avg;
49955
49956 // Try to detect PMADD
49957 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
49958 return PMAdd;
49959
49960 // Try to combine truncation with signed/unsigned saturation.
49961 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
49962 return Val;
49963
49964 // Try to combine PMULHUW/PMULHW for vXi16.
49965 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
49966 return V;
49967
49968 // The bitcast source is a direct mmx result.
49969 // Detect bitcasts between i32 to x86mmx
49970 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
49971 SDValue BCSrc = Src.getOperand(0);
49972 if (BCSrc.getValueType() == MVT::x86mmx)
49973 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
49974 }
49975
49976 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
49977 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
49978 return V;
49979
49980 return combineVectorTruncation(N, DAG, Subtarget);
49981}
49982
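/// Do target-specific dag combines on X86ISD::VTRUNC nodes, e.g. forming
/// saturating truncations (VTRUNCS/VTRUNCUS) when a saturation pattern feeds
/// the truncate.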
49983static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
49984 TargetLowering::DAGCombinerInfo &DCI) {
49985 EVT VT = N->getValueType(0);
49986 SDValue In = N->getOperand(0);
49987 SDLoc DL(N);
49988
49989 if (auto SSatVal = detectSSatPattern(In, VT))
49990 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
49991 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
49992 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
49993
49994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49995 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
49996 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
49997 return SDValue(N, 0);
49998
49999 return SDValue();
50000}
50001
50002/// Returns the negated value if the node \p N flips sign of FP value.
50003///
50004/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
50005/// or FSUB(0, x).
50006/// AVX512F does not have FXOR, so FNEG is lowered as
50007/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
50008/// In this case we go through all bitcasts.
50009/// This also recognizes splat of a negated value and returns the splat of that
50010/// value.
50011static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
50012 if (N->getOpcode() == ISD::FNEG)
50013 return N->getOperand(0);
50014
50015 // Don't recurse exponentially.
50016 if (Depth > SelectionDAG::MaxRecursionDepth)
50017 return SDValue();
50018
50019 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
50020
50021 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
50022 EVT VT = Op->getValueType(0);
50023
50024 // Make sure the element size doesn't change.
50025 if (VT.getScalarSizeInBits() != ScalarSize)
50026 return SDValue();
50027
50028 unsigned Opc = Op.getOpcode();
50029 switch (Opc) {
50030 case ISD::VECTOR_SHUFFLE: {
50031 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
50032 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
50033 if (!Op.getOperand(1).isUndef())
50034 return SDValue();
50035 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
50036 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
50037 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
50038 cast<ShuffleVectorSDNode>(Op)->getMask());
50039 break;
50040 }
50041 case ISD::INSERT_VECTOR_ELT: {
50042 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
50043 // -V, INDEX).
50044 SDValue InsVector = Op.getOperand(0);
50045 SDValue InsVal = Op.getOperand(1);
50046 if (!InsVector.isUndef())
50047 return SDValue();
50048 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
50049 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
50050 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
50051 NegInsVal, Op.getOperand(2));
50052 break;
50053 }
50054 case ISD::FSUB:
50055 case ISD::XOR:
50056 case X86ISD::FXOR: {
50057 SDValue Op1 = Op.getOperand(1);
50058 SDValue Op0 = Op.getOperand(0);
50059
50060 // For XOR and FXOR, we want to check if constant
50061 // bits of Op1 are sign bit masks. For FSUB, we
50062 // have to check if constant bits of Op0 are sign
50063 // bit masks and hence we swap the operands.
50064 if (Opc == ISD::FSUB)
50065 std::swap(Op0, Op1);
50066
50067 APInt UndefElts;
50068 SmallVector<APInt, 16> EltBits;
50069 // Extract constant bits and see if they are all
50070 // sign bit masks. Ignore the undef elements.
50071 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
50072 /* AllowWholeUndefs */ true,
50073 /* AllowPartialUndefs */ false)) {
50074 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
50075 if (!UndefElts[I] && !EltBits[I].isSignMask())
50076 return SDValue();
50077
50078 return peekThroughBitcasts(Op0);
50079 }
50080 }
50081 }
50082
50083 return SDValue();
50084}
50085
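/// Map an FMA-family opcode to the variant with the multiplicands negated
/// (NegMul), the accumulator negated (NegAcc) and/or the whole result negated
/// (NegRes). e.g. with only NegAcc set, ISD::FMA (a*b+c) becomes X86ISD::FMSUB
/// (a*b-c).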
50086static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
50087 bool NegRes) {
50088 if (NegMul) {
50089 switch (Opcode) {
50090     default: llvm_unreachable("Unexpected opcode");
50091 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
50092 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
50093 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
50094 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
50095 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
50096 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
50097 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
50098 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
50099 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
50100 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
50101 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
50102 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
50103 }
50104 }
50105
50106 if (NegAcc) {
50107 switch (Opcode) {
50108     default: llvm_unreachable("Unexpected opcode");
50109 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
50110 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
50111 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
50112 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
50113 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
50114 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
50115 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
50116 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
50117 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
50118 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
50119 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
50120 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
50121 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
50122 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
50123 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
50124 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
50125 }
50126 }
50127
50128 if (NegRes) {
50129 switch (Opcode) {
50130     // For accuracy reasons, we never combine fneg and fma under strict FP.
50131     default: llvm_unreachable("Unexpected opcode");
50132 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
50133 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
50134 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
50135 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
50136 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
50137 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
50138 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
50139 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
50140 }
50141 }
50142
50143 return Opcode;
50144}
50145
50146/// Do target-specific dag combines on floating point negations.
50147static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
50148 TargetLowering::DAGCombinerInfo &DCI,
50149 const X86Subtarget &Subtarget) {
50150 EVT OrigVT = N->getValueType(0);
50151 SDValue Arg = isFNEG(DAG, N);
50152 if (!Arg)
50153 return SDValue();
50154
50155 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50156 EVT VT = Arg.getValueType();
50157 EVT SVT = VT.getScalarType();
50158 SDLoc DL(N);
50159
50160 // Let legalize expand this if it isn't a legal type yet.
50161 if (!TLI.isTypeLegal(VT))
50162 return SDValue();
50163
50164 // If we're negating a FMUL node on a target with FMA, then we can avoid the
50165 // use of a constant by performing (-0 - A*B) instead.
50166 // FIXME: Check rounding control flags as well once it becomes available.
50167 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
50168 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
50169 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
50170 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
50171 Arg.getOperand(1), Zero);
50172 return DAG.getBitcast(OrigVT, NewNode);
50173 }
50174
50175 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
50176 bool LegalOperations = !DCI.isBeforeLegalizeOps();
50177 if (SDValue NegArg =
50178 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
50179 return DAG.getBitcast(OrigVT, NegArg);
50180
50181 return SDValue();
50182}
50183
50184SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
50185 bool LegalOperations,
50186 bool ForCodeSize,
50187 NegatibleCost &Cost,
50188 unsigned Depth) const {
50189 // fneg patterns are removable even if they have multiple uses.
50190 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
50191 Cost = NegatibleCost::Cheaper;
50192 return DAG.getBitcast(Op.getValueType(), Arg);
50193 }
50194
50195 EVT VT = Op.getValueType();
50196 EVT SVT = VT.getScalarType();
50197 unsigned Opc = Op.getOpcode();
50198 SDNodeFlags Flags = Op.getNode()->getFlags();
50199 switch (Opc) {
50200 case ISD::FMA:
50201 case X86ISD::FMSUB:
50202 case X86ISD::FNMADD:
50203 case X86ISD::FNMSUB:
50204 case X86ISD::FMADD_RND:
50205 case X86ISD::FMSUB_RND:
50206 case X86ISD::FNMADD_RND:
50207 case X86ISD::FNMSUB_RND: {
50208 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
50209 !(SVT == MVT::f32 || SVT == MVT::f64) ||
50210 !isOperationLegal(ISD::FMA, VT))
50211 break;
50212
50213 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
50214 // if it may have signed zeros.
50215 if (!Flags.hasNoSignedZeros())
50216 break;
50217
50218 // This is always negatible for free but we might be able to remove some
50219 // extra operand negations as well.
50220 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
50221 for (int i = 0; i != 3; ++i)
50222 NewOps[i] = getCheaperNegatedExpression(
50223 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
50224
50225 bool NegA = !!NewOps[0];
50226 bool NegB = !!NewOps[1];
50227 bool NegC = !!NewOps[2];
50228 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
50229
50230 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
50231 : NegatibleCost::Neutral;
50232
50233 // Fill in the non-negated ops with the original values.
50234 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
50235 if (!NewOps[i])
50236 NewOps[i] = Op.getOperand(i);
50237 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
50238 }
50239 case X86ISD::FRCP:
50240 if (SDValue NegOp0 =
50241 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
50242 ForCodeSize, Cost, Depth + 1))
50243 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
50244 break;
50245 }
50246
50247 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
50248 ForCodeSize, Cost, Depth);
50249}
50250
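/// Lower X86 vector FP logic ops (FAND/FANDN/FOR/FXOR) to the equivalent
/// integer logic ops on bitcast operands when SSE2 integer vectors are
/// available.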
50251static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
50252 const X86Subtarget &Subtarget) {
50253 MVT VT = N->getSimpleValueType(0);
50254 // If we have integer vector types available, use the integer opcodes.
50255 if (!VT.isVector() || !Subtarget.hasSSE2())
50256 return SDValue();
50257
50258 SDLoc dl(N);
50259
50260 unsigned IntBits = VT.getScalarSizeInBits();
50261 MVT IntSVT = MVT::getIntegerVT(IntBits);
50262 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
50263
50264 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
50265 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
50266 unsigned IntOpcode;
50267 switch (N->getOpcode()) {
50268   default: llvm_unreachable("Unexpected FP logic op");
50269 case X86ISD::FOR: IntOpcode = ISD::OR; break;
50270 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
50271 case X86ISD::FAND: IntOpcode = ISD::AND; break;
50272 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
50273 }
50274 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
50275 return DAG.getBitcast(VT, IntOp);
50276}
50277
50278
50279/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
50280static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
50281 if (N->getOpcode() != ISD::XOR)
50282 return SDValue();
50283
50284 SDValue LHS = N->getOperand(0);
50285 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
50286 return SDValue();
50287
50288 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
50289 X86::CondCode(LHS->getConstantOperandVal(0)));
50290 SDLoc DL(N);
50291 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
50292}
50293
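/// Do target-specific dag combines on ISD::XOR nodes.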
50294static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
50295 TargetLowering::DAGCombinerInfo &DCI,
50296 const X86Subtarget &Subtarget) {
50297 SDValue N0 = N->getOperand(0);
50298 SDValue N1 = N->getOperand(1);
50299 EVT VT = N->getValueType(0);
50300
50301 // If this is SSE1 only convert to FXOR to avoid scalarization.
50302 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50303 return DAG.getBitcast(MVT::v4i32,
50304 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
50305 DAG.getBitcast(MVT::v4f32, N0),
50306 DAG.getBitcast(MVT::v4f32, N1)));
50307 }
50308
50309 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
50310 return Cmp;
50311
50312 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50313 return R;
50314
50315 if (SDValue R = combineBitOpWithShift(N, DAG))
50316 return R;
50317
50318 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50319 return FPLogic;
50320
50321 if (DCI.isBeforeLegalizeOps())
50322 return SDValue();
50323
50324 if (SDValue SetCC = foldXor1SetCC(N, DAG))
50325 return SetCC;
50326
50327 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
50328 return RV;
50329
50330 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
50331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50332 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
50333 N0.getOperand(0).getValueType().isVector() &&
50334 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50335 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
50336 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
50337 N0.getOperand(0).getValueType()));
50338 }
50339
50340 // Handle AVX512 mask widening.
50341 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
50342 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
50343 VT.getVectorElementType() == MVT::i1 &&
50344 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
50345 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
50346 return DAG.getNode(
50347 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
50348 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
50349 N0.getOperand(2));
50350 }
50351
50352 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
50353 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
50354 // TODO: Under what circumstances could this be performed in DAGCombine?
50355 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
50356 N0.getOperand(0).getOpcode() == N->getOpcode()) {
50357 SDValue TruncExtSrc = N0.getOperand(0);
50358 auto *N1C = dyn_cast<ConstantSDNode>(N1);
50359 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
50360 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
50361 SDLoc DL(N);
50362 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
50363 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
50364 return DAG.getNode(ISD::XOR, DL, VT, LHS,
50365 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
50366 }
50367 }
50368
50369 return combineFneg(N, DAG, DCI, Subtarget);
50370}
50371
50372static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
50373 TargetLowering::DAGCombinerInfo &DCI,
50374 const X86Subtarget &Subtarget) {
50375 EVT VT = N->getValueType(0);
50376 unsigned NumBits = VT.getSizeInBits();
50377
50378 // TODO - Constant Folding.
50379
50380 // Simplify the inputs.
50381 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50382 APInt DemandedMask(APInt::getAllOnes(NumBits));
50383 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
50384 return SDValue(N, 0);
50385
50386 return SDValue();
50387}
50388
50389static bool isNullFPScalarOrVectorConst(SDValue V) {
50390 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
50391}
50392
50393/// If a value is a scalar FP zero or a vector FP zero (potentially including
50394/// undefined elements), return a zero constant that may be used to fold away
50395/// that value. In the case of a vector, the returned constant will not contain
50396/// undefined elements even if the input parameter does. This makes it suitable
50397/// to be used as a replacement operand with operations (e.g., bitwise-and) where
50398/// an undef should not propagate.
50399static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
50400 const X86Subtarget &Subtarget) {
50401 if (!isNullFPScalarOrVectorConst(V))
50402 return SDValue();
50403
50404 if (V.getValueType().isVector())
50405 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
50406
50407 return V;
50408}
50409
50410static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
50411 const X86Subtarget &Subtarget) {
50412 SDValue N0 = N->getOperand(0);
50413 SDValue N1 = N->getOperand(1);
50414 EVT VT = N->getValueType(0);
50415 SDLoc DL(N);
50416
50417 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
50418 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
50419 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
50420 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
50421 return SDValue();
50422
50423 auto isAllOnesConstantFP = [](SDValue V) {
50424 if (V.getSimpleValueType().isVector())
50425 return ISD::isBuildVectorAllOnes(V.getNode());
50426 auto *C = dyn_cast<ConstantFPSDNode>(V);
50427 return C && C->getConstantFPValue()->isAllOnesValue();
50428 };
50429
50430 // fand (fxor X, -1), Y --> fandn X, Y
50431 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
50432 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
50433
50434 // fand X, (fxor Y, -1) --> fandn Y, X
50435 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
50436 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
50437
50438 return SDValue();
50439}
50440
50441/// Do target-specific dag combines on X86ISD::FAND nodes.
50442static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
50443 const X86Subtarget &Subtarget) {
50444 // FAND(0.0, x) -> 0.0
50445 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
50446 return V;
50447
50448 // FAND(x, 0.0) -> 0.0
50449 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
50450 return V;
50451
50452 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
50453 return V;
50454
50455 return lowerX86FPLogicOp(N, DAG, Subtarget);
50456}
50457
50458/// Do target-specific dag combines on X86ISD::FANDN nodes.
50459static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
50460 const X86Subtarget &Subtarget) {
50461 // FANDN(0.0, x) -> x
50462 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
50463 return N->getOperand(1);
50464
50465 // FANDN(x, 0.0) -> 0.0
50466 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
50467 return V;
50468
50469 return lowerX86FPLogicOp(N, DAG, Subtarget);
50470}
50471
50472/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
50473static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
50474 TargetLowering::DAGCombinerInfo &DCI,
50475 const X86Subtarget &Subtarget) {
50476   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
50477
50478 // F[X]OR(0.0, x) -> x
50479 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
50480 return N->getOperand(1);
50481
50482 // F[X]OR(x, 0.0) -> x
50483 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
50484 return N->getOperand(0);
50485
50486 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
50487 return NewVal;
50488
50489 return lowerX86FPLogicOp(N, DAG, Subtarget);
50490}
50491
50492/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
50493static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
50494   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
50495
50496 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
50497 if (!DAG.getTarget().Options.NoNaNsFPMath ||
50498 !DAG.getTarget().Options.NoSignedZerosFPMath)
50499 return SDValue();
50500
50501   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
50502   // into FMAXC and FMINC, which are commutative operations.
50503 unsigned NewOp = 0;
50504 switch (N->getOpcode()) {
50505   default: llvm_unreachable("unknown opcode");
50506 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
50507 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
50508 }
50509
50510 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
50511 N->getOperand(0), N->getOperand(1));
50512}
50513
50514static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
50515 const X86Subtarget &Subtarget) {
50516 if (Subtarget.useSoftFloat())
50517 return SDValue();
50518
50519 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50520
50521 EVT VT = N->getValueType(0);
50522 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
50523 (Subtarget.hasSSE2() && VT == MVT::f64) ||
50524 (Subtarget.hasFP16() && VT == MVT::f16) ||
50525 (VT.isVector() && TLI.isTypeLegal(VT))))
50526 return SDValue();
50527
50528 SDValue Op0 = N->getOperand(0);
50529 SDValue Op1 = N->getOperand(1);
50530 SDLoc DL(N);
50531 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
50532
50533 // If we don't have to respect NaN inputs, this is a direct translation to x86
50534 // min/max instructions.
50535 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
50536 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50537
50538 // If one of the operands is known non-NaN use the native min/max instructions
50539 // with the non-NaN input as second operand.
50540 if (DAG.isKnownNeverNaN(Op1))
50541 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50542 if (DAG.isKnownNeverNaN(Op0))
50543 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
50544
50545 // If we have to respect NaN inputs, this takes at least 3 instructions.
50546 // Favor a library call when operating on a scalar and minimizing code size.
50547 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
50548 return SDValue();
50549
50550 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
50551 VT);
50552
50553 // There are 4 possibilities involving NaN inputs, and these are the required
50554 // outputs:
50555 // Op1
50556 // Num NaN
50557 // ----------------
50558 // Num | Max | Op0 |
50559 // Op0 ----------------
50560 // NaN | Op1 | NaN |
50561 // ----------------
50562 //
50563 // The SSE FP max/min instructions were not designed for this case, but rather
50564 // to implement:
50565 // Min = Op1 < Op0 ? Op1 : Op0
50566 // Max = Op1 > Op0 ? Op1 : Op0
50567 //
50568 // So they always return Op0 if either input is a NaN. However, we can still
50569 // use those instructions for fmaxnum by selecting away a NaN input.
50570
50571 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
50572 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
50573 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
50574
50575 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
50576 // are NaN, the NaN value of Op1 is the result.
50577 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
50578}
50579
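/// Do target-specific dag combines on X86 vector int-to-fp conversion nodes.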
50580static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
50581 TargetLowering::DAGCombinerInfo &DCI) {
50582 EVT VT = N->getValueType(0);
50583 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50584
50585 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50586 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50587 return SDValue(N, 0);
50588
50589 // Convert a full vector load into vzload when not all bits are needed.
50590 SDValue In = N->getOperand(0);
50591 MVT InVT = In.getSimpleValueType();
50592 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50593 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50594     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50595 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
50596 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50597 MVT MemVT = MVT::getIntegerVT(NumBits);
50598 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50599 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50600 SDLoc dl(N);
50601 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
50602 DAG.getBitcast(InVT, VZLoad));
50603 DCI.CombineTo(N, Convert);
50604 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50605 DCI.recursivelyDeleteUnusedNodes(LN);
50606 return SDValue(N, 0);
50607 }
50608 }
50609
50610 return SDValue();
50611}
50612
50613static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
50614 TargetLowering::DAGCombinerInfo &DCI) {
50615 bool IsStrict = N->isTargetStrictFPOpcode();
50616 EVT VT = N->getValueType(0);
50617
50618 // Convert a full vector load into vzload when not all bits are needed.
50619 SDValue In = N->getOperand(IsStrict ? 1 : 0);
50620 MVT InVT = In.getSimpleValueType();
50621 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50622 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50623     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50624 LoadSDNode *LN = cast<LoadSDNode>(In);
50625 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50626 MVT MemVT = MVT::getFloatingPointVT(NumBits);
50627 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50628 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50629 SDLoc dl(N);
50630 if (IsStrict) {
50631 SDValue Convert =
50632 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
50633 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
50634 DCI.CombineTo(N, Convert, Convert.getValue(1));
50635 } else {
50636 SDValue Convert =
50637 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
50638 DCI.CombineTo(N, Convert);
50639 }
50640 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50641 DCI.recursivelyDeleteUnusedNodes(LN);
50642 return SDValue(N, 0);
50643 }
50644 }
50645
50646 return SDValue();
50647}
50648
50649/// Do target-specific dag combines on X86ISD::ANDNP nodes.
50650static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
50651 TargetLowering::DAGCombinerInfo &DCI,
50652 const X86Subtarget &Subtarget) {
50653 SDValue N0 = N->getOperand(0);
50654 SDValue N1 = N->getOperand(1);
50655 MVT VT = N->getSimpleValueType(0);
50656
50657 // ANDNP(0, x) -> x
50658 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50659 return N1;
50660
50661 // ANDNP(x, 0) -> 0
50662 if (ISD::isBuildVectorAllZeros(N1.getNode()))
50663 return DAG.getConstant(0, SDLoc(N), VT);
50664
50665 // Turn ANDNP back to AND if input is inverted.
50666 if (SDValue Not = IsNOT(N0, DAG))
50667 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
50668
50669 // Attempt to recursively combine a bitmask ANDNP with shuffles.
50670 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50671 SDValue Op(N, 0);
50672 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50673 return Res;
50674
50675 // If either operand is a constant mask, then only the elements that aren't
50676 // zero are actually demanded by the other operand.
50677 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
50678 APInt UndefElts;
50679 SmallVector<APInt> EltBits;
50680 int NumElts = VT.getVectorNumElements();
50681 int EltSizeInBits = VT.getScalarSizeInBits();
50682 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50683 APInt DemandedElts = APInt::getAllOnes(NumElts);
50684 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50685 EltBits)) {
50686 DemandedBits.clearAllBits();
50687 DemandedElts.clearAllBits();
50688 for (int I = 0; I != NumElts; ++I)
50689 if ((Invert && !EltBits[I].isAllOnes()) ||
50690 (!Invert && !EltBits[I].isZero())) {
50691 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
50692 DemandedElts.setBit(I);
50693 }
50694 }
50695 return std::make_pair(DemandedBits, DemandedElts);
50696 };
50697 std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
50698 std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0, true);
50699
50700 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50701 if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
50702 TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
50703 TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
50704 TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
50705 if (N->getOpcode() != ISD::DELETED_NODE)
50706 DCI.AddToWorklist(N);
50707 return SDValue(N, 0);
50708 }
50709 }
50710
50711 return SDValue();
50712}
50713
50714static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
50715 TargetLowering::DAGCombinerInfo &DCI) {
50716 SDValue N1 = N->getOperand(1);
50717
50718 // BT ignores high bits in the bit index operand.
50719 unsigned BitWidth = N1.getValueSizeInBits();
50720 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
50721 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
50722 if (N->getOpcode() != ISD::DELETED_NODE)
50723 DCI.AddToWorklist(N);
50724 return SDValue(N, 0);
50725 }
50726
50727 return SDValue();
50728}
50729
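/// Do target-specific dag combines on (STRICT_)CVTPH2PS nodes.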
50730static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
50731 TargetLowering::DAGCombinerInfo &DCI) {
50732 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
50733 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50734
50735 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
50736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50737 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
50738 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
50739 if (N->getOpcode() != ISD::DELETED_NODE)
50740 DCI.AddToWorklist(N);
50741 return SDValue(N, 0);
50742 }
50743
50744 // Convert a full vector load into vzload when not all bits are needed.
50745 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
50746 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
50747 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
50748 SDLoc dl(N);
50749 if (IsStrict) {
50750 SDValue Convert = DAG.getNode(
50751 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
50752 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
50753 DCI.CombineTo(N, Convert, Convert.getValue(1));
50754 } else {
50755 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
50756 DAG.getBitcast(MVT::v8i16, VZLoad));
50757 DCI.CombineTo(N, Convert);
50758 }
50759
50760 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50761 DCI.recursivelyDeleteUnusedNodes(LN);
50762 return SDValue(N, 0);
50763 }
50764 }
50765 }
50766
50767 return SDValue();
50768}
50769
50770// Try to combine sext_in_reg of a cmov of constants by extending the constants.
50771static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
50772   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50773
50774 EVT DstVT = N->getValueType(0);
50775
50776 SDValue N0 = N->getOperand(0);
50777 SDValue N1 = N->getOperand(1);
50778 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50779
50780 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
50781 return SDValue();
50782
50783 // Look through single use any_extends / truncs.
50784 SDValue IntermediateBitwidthOp;
50785 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
50786 N0.hasOneUse()) {
50787 IntermediateBitwidthOp = N0;
50788 N0 = N0.getOperand(0);
50789 }
50790
50791 // See if we have a single use cmov.
50792 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
50793 return SDValue();
50794
50795 SDValue CMovOp0 = N0.getOperand(0);
50796 SDValue CMovOp1 = N0.getOperand(1);
50797
50798 // Make sure both operands are constants.
50799 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50800 !isa<ConstantSDNode>(CMovOp1.getNode()))
50801 return SDValue();
50802
50803 SDLoc DL(N);
50804
50805   // If we looked through an any_extend/trunc above, apply the same operation
50806   // to the constants.
50806 if (IntermediateBitwidthOp) {
50807 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
50808 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
50809 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
50810 }
50811
50812 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
50813 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
50814
50815 EVT CMovVT = DstVT;
50816 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
50817 if (DstVT == MVT::i16) {
50818 CMovVT = MVT::i32;
50819 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
50820 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
50821 }
50822
50823 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
50824 N0.getOperand(2), N0.getOperand(3));
50825
50826 if (CMovVT != DstVT)
50827 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
50828
50829 return CMov;
50830}
50831
50832static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
50833 const X86Subtarget &Subtarget) {
50834   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50835
50836 if (SDValue V = combineSextInRegCmov(N, DAG))
50837 return V;
50838
50839 EVT VT = N->getValueType(0);
50840 SDValue N0 = N->getOperand(0);
50841 SDValue N1 = N->getOperand(1);
50842 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50843 SDLoc dl(N);
50844
50845 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
50846 // both SSE and AVX2 since there is no sign-extended shift right
50847 // operation on a vector with 64-bit elements.
50848 // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
50849 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
50850 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
50851 N0.getOpcode() == ISD::SIGN_EXTEND)) {
50852 SDValue N00 = N0.getOperand(0);
50853
50854 // EXTLOAD has a better solution on AVX2,
50855 // it may be replaced with X86ISD::VSEXT node.
50856 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
50857 if (!ISD::isNormalLoad(N00.getNode()))
50858 return SDValue();
50859
50860 // Attempt to promote any comparison mask ops before moving the
50861 // SIGN_EXTEND_INREG in the way.
50862 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
50863 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
50864
50865 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
50866 SDValue Tmp =
50867 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
50868 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
50869 }
50870 }
50871 return SDValue();
50872}
50873
50874/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
50875/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
50876/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
50877/// opportunities to combine math ops, use an LEA, or use a complex addressing
50878/// mode. This can eliminate extend, add, and shift instructions.
50879static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
50880 const X86Subtarget &Subtarget) {
50881 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
50882 Ext->getOpcode() != ISD::ZERO_EXTEND)
50883 return SDValue();
50884
50885 // TODO: This should be valid for other integer types.
50886 EVT VT = Ext->getValueType(0);
50887 if (VT != MVT::i64)
50888 return SDValue();
50889
50890 SDValue Add = Ext->getOperand(0);
50891 if (Add.getOpcode() != ISD::ADD)
50892 return SDValue();
50893
50894 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
50895 bool NSW = Add->getFlags().hasNoSignedWrap();
50896 bool NUW = Add->getFlags().hasNoUnsignedWrap();
50897
50898 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
50899 // into the 'zext'
50900 if ((Sext && !NSW) || (!Sext && !NUW))
50901 return SDValue();
50902
50903 // Having a constant operand to the 'add' ensures that we are not increasing
50904 // the instruction count because the constant is extended for free below.
50905 // A constant operand can also become the displacement field of an LEA.
50906 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
50907 if (!AddOp1)
50908 return SDValue();
50909
50910 // Don't make the 'add' bigger if there's no hope of combining it with some
50911 // other 'add' or 'shl' instruction.
50912 // TODO: It may be profitable to generate simpler LEA instructions in place
50913 // of single 'add' instructions, but the cost model for selecting an LEA
50914 // currently has a high threshold.
50915 bool HasLEAPotential = false;
50916 for (auto *User : Ext->uses()) {
50917 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
50918 HasLEAPotential = true;
50919 break;
50920 }
50921 }
50922 if (!HasLEAPotential)
50923 return SDValue();
50924
50925 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
50926 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
50927 SDValue AddOp0 = Add.getOperand(0);
50928 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
50929 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
50930
50931 // The wider add is guaranteed to not wrap because both operands are
50932 // sign-extended.
50933 SDNodeFlags Flags;
50934 Flags.setNoSignedWrap(NSW);
50935 Flags.setNoUnsignedWrap(NUW);
50936 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
50937}
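
// Editorial sketch (not part of this file): the no-wrap arithmetic fact that
// promoteExtBeforeAdd relies on. If the narrow add cannot wrap, extending
// first and adding the extended constant produces the same wide value, so the
// extend can be hoisted above the add (e.g. to feed an LEA). Helper name is
// hypothetical.
#include <cassert>
#include <cstdint>

static void PromoteSExtOverNSWAdd(int32_t X, int32_t C) {
  int64_t WideSum = (int64_t)X + (int64_t)C;
  if (WideSum != (int32_t)WideSum)
    return;                                    // the i32 add would wrap (no nsw)
  int64_t Before = (int64_t)(int32_t)WideSum;  // sext(add_nsw(x, C))
  int64_t After = (int64_t)X + (int64_t)C;     // add(sext(x), C_sext)
  assert(Before == After);
}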
50938
50939// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
50940// operands and the result of CMOV is not used anywhere else - promote CMOV
50941// itself instead of promoting its result. This could be beneficial, because:
50942// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
50943// (or more) pseudo-CMOVs only when they go one-after-another and
50944// getting rid of result extension code after CMOV will help that.
50945// 2) Promotion of constant CMOV arguments is free, hence the
50946// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
50947 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
50948 // promotion is also good in terms of code-size.
50949 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
50950 // promotion).
50951static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
50952 SDValue CMovN = Extend->getOperand(0);
50953 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
50954 return SDValue();
50955
50956 EVT TargetVT = Extend->getValueType(0);
50957 unsigned ExtendOpcode = Extend->getOpcode();
50958 SDLoc DL(Extend);
50959
50960 EVT VT = CMovN.getValueType();
50961 SDValue CMovOp0 = CMovN.getOperand(0);
50962 SDValue CMovOp1 = CMovN.getOperand(1);
50963
50964 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50965 !isa<ConstantSDNode>(CMovOp1.getNode()))
50966 return SDValue();
50967
50968 // Only extend to i32 or i64.
50969 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
50970 return SDValue();
50971
50972 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
50973 // are free.
50974 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
50975 return SDValue();
50976
50977 // If this is a zero extend to i64, we should only extend to i32 and use a free
50978 // zero extend to finish.
50979 EVT ExtendVT = TargetVT;
50980 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
50981 ExtendVT = MVT::i32;
50982
50983 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
50984 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
50985
50986 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
50987 CMovN.getOperand(2), CMovN.getOperand(3));
50988
50989 // Finish extending if needed.
50990 if (ExtendVT != TargetVT)
50991 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
50992
50993 return Res;
50994}
50995
50996// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
50997// result type.
50998static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
50999 const X86Subtarget &Subtarget) {
51000 SDValue N0 = N->getOperand(0);
51001 EVT VT = N->getValueType(0);
51002 SDLoc dl(N);
51003
51004 // Only do this combine with AVX512 for vector extends.
51005 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
51006 return SDValue();
51007
51008 // Only combine legal element types.
51009 EVT SVT = VT.getVectorElementType();
51010 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
51011 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
51012 return SDValue();
51013
51014 // We don't have a CMPP instruction for vXf16.
51015 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
51016 return SDValue();
51017 // We can only do this if the vector size is 256 bits or less.
51018 unsigned Size = VT.getSizeInBits();
51019 if (Size > 256 && Subtarget.useAVX512Regs())
51020 return SDValue();
51021
51022 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
51023 // those are the only integer compares we have.
51024 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51025 if (ISD::isUnsignedIntSetCC(CC))
51026 return SDValue();
51027
51028 // Only do this combine if the extension will be fully consumed by the setcc.
51029 EVT N00VT = N0.getOperand(0).getValueType();
51030 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
51031 if (Size != MatchingVecType.getSizeInBits())
51032 return SDValue();
51033
51034 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
51035
51036 if (N->getOpcode() == ISD::ZERO_EXTEND)
51037 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
51038
51039 return Res;
51040}
51041
51042static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
51043 TargetLowering::DAGCombinerInfo &DCI,
51044 const X86Subtarget &Subtarget) {
51045 SDValue N0 = N->getOperand(0);
51046 EVT VT = N->getValueType(0);
51047 SDLoc DL(N);
51048
51049 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
51050 if (!DCI.isBeforeLegalizeOps() &&
51051 N0.getOpcode() == X86ISD::SETCC_CARRY) {
51052 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
51053 N0->getOperand(1));
51054 bool ReplaceOtherUses = !N0.hasOneUse();
51055 DCI.CombineTo(N, Setcc);
51056 // Replace other uses with a truncate of the widened setcc_carry.
51057 if (ReplaceOtherUses) {
51058 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
51059 N0.getValueType(), Setcc);
51060 DCI.CombineTo(N0.getNode(), Trunc);
51061 }
51062
51063 return SDValue(N, 0);
51064 }
51065
51066 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
51067 return NewCMov;
51068
51069 if (!DCI.isBeforeLegalizeOps())
51070 return SDValue();
51071
51072 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
51073 return V;
51074
51075 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
51076 DAG, DCI, Subtarget))
51077 return V;
51078
51079 if (VT.isVector()) {
51080 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
51081 return R;
51082
51083 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
51084 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
51085 }
51086
51087 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
51088 return NewAdd;
51089
51090 return SDValue();
51091}
51092
51093static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
51094 TargetLowering::DAGCombinerInfo &DCI,
51095 const X86Subtarget &Subtarget) {
51096 SDLoc dl(N);
51097 EVT VT = N->getValueType(0);
51098 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
51099
51100 // Let legalize expand this if it isn't a legal type yet.
51101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51102 if (!TLI.isTypeLegal(VT))
51103 return SDValue();
51104
51105 SDValue A = N->getOperand(IsStrict ? 1 : 0);
51106 SDValue B = N->getOperand(IsStrict ? 2 : 1);
51107 SDValue C = N->getOperand(IsStrict ? 3 : 2);
51108
51109 // If the operation allows fast-math and the target does not support FMA,
51110 // split this into mul+add to avoid libcall(s).
51111 SDNodeFlags Flags = N->getFlags();
51112 if (!IsStrict && Flags.hasAllowReassociation() &&
51113 TLI.isOperationExpand(ISD::FMA, VT)) {
51114 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
51115 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
51116 }
51117
51118 EVT ScalarVT = VT.getScalarType();
51119 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
51120 !Subtarget.hasAnyFMA()) &&
51121 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
51122 return SDValue();
51123
51124 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
51125 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51126 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51127 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
51128 CodeSize)) {
51129 V = NegV;
51130 return true;
51131 }
51132 // Look through extract_vector_elts. If it comes from an FNEG, create a
51133 // new extract from the FNEG input.
51134 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51135 isNullConstant(V.getOperand(1))) {
51136 SDValue Vec = V.getOperand(0);
51137 if (SDValue NegV = TLI.getCheaperNegatedExpression(
51138 Vec, DAG, LegalOperations, CodeSize)) {
51139 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
51140 NegV, V.getOperand(1));
51141 return true;
51142 }
51143 }
51144
51145 return false;
51146 };
51147
51148 // Do not convert the passthru input of scalar intrinsics.
51149 // FIXME: We could allow negations of the lower element only.
51150 bool NegA = invertIfNegative(A);
51151 bool NegB = invertIfNegative(B);
51152 bool NegC = invertIfNegative(C);
51153
51154 if (!NegA && !NegB && !NegC)
51155 return SDValue();
51156
51157 unsigned NewOpcode =
51158 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
51159
51160 // Propagate fast-math-flags to new FMA node.
51161 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
51162 if (IsStrict) {
51163 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
51164 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
51165 {N->getOperand(0), A, B, C});
51166 } else {
51167 if (N->getNumOperands() == 4)
51168 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
51169 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
51170 }
51171}
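
// Editorial sketch (not part of this file): the sign algebra combineFMA uses
// when it strips FNEGs from the operands. The product of two negated factors
// is unchanged, and a single negation can sit on either factor, so only the
// parity of NegA/NegB and the state of NegC need to reach negateFMAOpcode.
// Shown with scalar std::fma; helper name is hypothetical.
#include <cassert>
#include <cmath>

static void FMANegationFolds(double A, double B, double C) {
  if (!std::isfinite(A) || !std::isfinite(B) || !std::isfinite(C))
    return;                      // keep the comparisons away from NaN/Inf cases
  assert(std::fma(-A, -B, C) == std::fma(A, B, C));  // NegA and NegB cancel
  assert(std::fma(-A, B, C) == std::fma(A, -B, C));  // the sign may sit on either factor
}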
51172
51173// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
51174// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
51175static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
51176 TargetLowering::DAGCombinerInfo &DCI) {
51177 SDLoc dl(N);
51178 EVT VT = N->getValueType(0);
51179 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51180 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51181 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51182
51183 SDValue N2 = N->getOperand(2);
51184
51185 SDValue NegN2 =
51186 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
51187 if (!NegN2)
51188 return SDValue();
51189 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
51190
51191 if (N->getNumOperands() == 4)
51192 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
51193 NegN2, N->getOperand(3));
51194 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
51195 NegN2);
51196}
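
// Editorial sketch (not part of this file): why negating the addend converts
// FMADDSUB into FMSUBADD. Negating C swaps "add the addend" and "subtract the
// addend" in every lane, shown here on two lanes and assuming the convention
// that even lanes of FMADDSUB subtract while odd lanes add (the identity does
// not depend on which lanes do which). Helper name is hypothetical.
#include <cassert>
#include <cmath>

static void FMAddSubOfNegatedC(double A0, double B0, double C0,
                               double A1, double B1, double C1) {
  if (!std::isfinite(A0) || !std::isfinite(B0) || !std::isfinite(C0) ||
      !std::isfinite(A1) || !std::isfinite(B1) || !std::isfinite(C1))
    return;
  double NegC0 = -C0, NegC1 = -C1;
  // FMADDSUB(A, B, FNEG(C)): even lane subtracts, odd lane adds the negated C.
  double AddSub0 = std::fma(A0, B0, -NegC0);
  double AddSub1 = std::fma(A1, B1, NegC1);
  // FMSUBADD(A, B, C): even lane adds, odd lane subtracts the original C.
  double SubAdd0 = std::fma(A0, B0, C0);
  double SubAdd1 = std::fma(A1, B1, -C1);
  assert(AddSub0 == SubAdd0 && AddSub1 == SubAdd1);
}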
51197
51198static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
51199 TargetLowering::DAGCombinerInfo &DCI,
51200 const X86Subtarget &Subtarget) {
51201 SDLoc dl(N);
51202 SDValue N0 = N->getOperand(0);
51203 EVT VT = N->getValueType(0);
51204
51205 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
51206 // FIXME: Is this needed? We don't seem to have any tests for it.
51207 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
51208 N0.getOpcode() == X86ISD::SETCC_CARRY) {
51209 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
51210 N0->getOperand(1));
51211 bool ReplaceOtherUses = !N0.hasOneUse();
51212 DCI.CombineTo(N, Setcc);
51213 // Replace other uses with a truncate of the widened setcc_carry.
51214 if (ReplaceOtherUses) {
51215 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
51216 N0.getValueType(), Setcc);
51217 DCI.CombineTo(N0.getNode(), Trunc);
51218 }
51219
51220 return SDValue(N, 0);
51221 }
51222
51223 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
51224 return NewCMov;
51225
51226 if (DCI.isBeforeLegalizeOps())
51227 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
51228 return V;
51229
51230 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
51231 DAG, DCI, Subtarget))
51232 return V;
51233
51234 if (VT.isVector())
51235 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
51236 return R;
51237
51238 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
51239 return NewAdd;
51240
51241 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
51242 return R;
51243
51244 // TODO: Combine with any target/faux shuffle.
51245 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
51246 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
51247 SDValue N00 = N0.getOperand(0);
51248 SDValue N01 = N0.getOperand(1);
51249 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
51250 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
51251 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
51252 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
51253 return concatSubVectors(N00, N01, DAG, dl);
51254 }
51255 }
51256
51257 return SDValue();
51258}
51259
51260/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
51261/// recognizable memcmp expansion.
51262static bool isOrXorXorTree(SDValue X, bool Root = true) {
51263 if (X.getOpcode() == ISD::OR)
51264 return isOrXorXorTree(X.getOperand(0), false) &&
51265 isOrXorXorTree(X.getOperand(1), false);
51266 if (Root)
51267 return false;
51268 return X.getOpcode() == ISD::XOR;
51269}
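
// Editorial sketch (not part of this file): the multi-word equality test that
// isOrXorXorTree recognizes. OR-ing the XORs of the word pairs yields zero
// exactly when every pair is equal, which is how the memcmp expansion encodes
// an oversized compare. Helper name is hypothetical.
#include <cassert>
#include <cstdint>

static void OrXorXorIdentity(uint64_t A0, uint64_t B0, uint64_t A1, uint64_t B1) {
  bool TreeIsZero = ((A0 ^ B0) | (A1 ^ B1)) == 0;
  bool AllEqual = (A0 == B0) && (A1 == B1);
  assert(TreeIsZero == AllEqual);
}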
51270
51271/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
51272/// expansion.
51273template<typename F>
51274static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
51275 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
51276 SDValue Op0 = X.getOperand(0);
51277 SDValue Op1 = X.getOperand(1);
51278 if (X.getOpcode() == ISD::OR) {
51279 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
51280 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
51281 if (VecVT != CmpVT)
51282 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
51283 if (HasPT)
51284 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
51285 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
51286 } else if (X.getOpcode() == ISD::XOR) {
51287 SDValue A = SToV(Op0);
51288 SDValue B = SToV(Op1);
51289 if (VecVT != CmpVT)
51290 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
51291 if (HasPT)
51292 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
51293 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
51294 }
51295 llvm_unreachable("Impossible");
51296}
51297
51298/// Try to map a 128-bit or larger integer comparison to vector instructions
51299/// before type legalization splits it up into chunks.
51300static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
51301 const X86Subtarget &Subtarget) {
51302 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
51303 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
51304
51305 // We're looking for an oversized integer equality comparison.
51306 SDValue X = SetCC->getOperand(0);
51307 SDValue Y = SetCC->getOperand(1);
51308 EVT OpVT = X.getValueType();
51309 unsigned OpSize = OpVT.getSizeInBits();
51310 if (!OpVT.isScalarInteger() || OpSize < 128)
51311 return SDValue();
51312
51313 // Ignore a comparison with zero because that gets special treatment in
51314 // EmitTest(). But make an exception for the special case of a pair of
51315 // logically-combined vector-sized operands compared to zero. This pattern may
51316 // be generated by the memcmp expansion pass with oversized integer compares
51317 // (see PR33325).
51318 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
51319 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
51320 return SDValue();
51321
51322 // Don't perform this combine if constructing the vector will be expensive.
51323 auto IsVectorBitCastCheap = [](SDValue X) {
51324 X = peekThroughBitcasts(X);
51325 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
51326 X.getOpcode() == ISD::LOAD;
51327 };
51328 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
51329 !IsOrXorXorTreeCCZero)
51330 return SDValue();
51331
51332 EVT VT = SetCC->getValueType(0);
51333 SDLoc DL(SetCC);
51334
51335 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
51336 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
51337 // Otherwise use PCMPEQ (plus AND) and mask testing.
51338 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
51339 (OpSize == 256 && Subtarget.hasAVX()) ||
51340 (OpSize == 512 && Subtarget.useAVX512Regs())) {
51341 bool HasPT = Subtarget.hasSSE41();
51342
51343 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
51344 // vector registers are essentially free. (Technically, widening registers
51345 // prevents load folding, but the tradeoff is worth it.)
51346 bool PreferKOT = Subtarget.preferMaskRegisters();
51347 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
51348
51349 EVT VecVT = MVT::v16i8;
51350 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
51351 if (OpSize == 256) {
51352 VecVT = MVT::v32i8;
51353 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
51354 }
51355 EVT CastVT = VecVT;
51356 bool NeedsAVX512FCast = false;
51357 if (OpSize == 512 || NeedZExt) {
51358 if (Subtarget.hasBWI()) {
51359 VecVT = MVT::v64i8;
51360 CmpVT = MVT::v64i1;
51361 if (OpSize == 512)
51362 CastVT = VecVT;
51363 } else {
51364 VecVT = MVT::v16i32;
51365 CmpVT = MVT::v16i1;
51366 CastVT = OpSize == 512 ? VecVT :
51367 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
51368 NeedsAVX512FCast = true;
51369 }
51370 }
51371
51372 auto ScalarToVector = [&](SDValue X) -> SDValue {
51373 bool TmpZext = false;
51374 EVT TmpCastVT = CastVT;
51375 if (X.getOpcode() == ISD::ZERO_EXTEND) {
51376 SDValue OrigX = X.getOperand(0);
51377 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
51378 if (OrigSize < OpSize) {
51379 if (OrigSize == 128) {
51380 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
51381 X = OrigX;
51382 TmpZext = true;
51383 } else if (OrigSize == 256) {
51384 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
51385 X = OrigX;
51386 TmpZext = true;
51387 }
51388 }
51389 }
51390 X = DAG.getBitcast(TmpCastVT, X);
51391 if (!NeedZExt && !TmpZext)
51392 return X;
51393 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
51394 DAG.getConstant(0, DL, VecVT), X,
51395 DAG.getVectorIdxConstant(0, DL));
51396 };
51397
51398 SDValue Cmp;
51399 if (IsOrXorXorTreeCCZero) {
51400 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
51401 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
51402 // Use 2 vector equality compares and 'and' the results before doing a
51403 // MOVMSK.
51404 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
51405 } else {
51406 SDValue VecX = ScalarToVector(X);
51407 SDValue VecY = ScalarToVector(Y);
51408 if (VecVT != CmpVT) {
51409 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
51410 } else if (HasPT) {
51411 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
51412 } else {
51413 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
51414 }
51415 }
51416 // AVX512 should emit a setcc that will lower to kortest.
51417 if (VecVT != CmpVT) {
51418 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
51419 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
51420 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
51421 DAG.getConstant(0, DL, KRegVT), CC);
51422 }
51423 if (HasPT) {
51424 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
51425 Cmp);
51426 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
51427 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
51428 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
51429 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
51430 }
51431 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
51432 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
51433 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
51434 assert(Cmp.getValueType() == MVT::v16i8 &&
51435 "Non 128-bit vector on pre-SSE41 target");
51436 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
51437 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
51438 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
51439 }
51440
51441 return SDValue();
51442}
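
// Editorial sketch (not part of this file): the pre-SSE4.1 lowering produced
// above for a 128-bit equality, written with SSE2 intrinsics -- PCMPEQB gives
// 0xFF per matching byte and PMOVMSKB collects the sign bits, so all sixteen
// bytes match exactly when the mask is 0xFFFF. Helper name is hypothetical.
#include <emmintrin.h>   // SSE2

static bool Equal16Bytes(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128((const __m128i *)A);
  __m128i VB = _mm_loadu_si128((const __m128i *)B);
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);         // 0xFF in every matching byte
  return _mm_movemask_epi8(Eq) == 0xFFFF;      // all 16 bytes matched
}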
51443
51444static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
51445 TargetLowering::DAGCombinerInfo &DCI,
51446 const X86Subtarget &Subtarget) {
51447 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
51448 const SDValue LHS = N->getOperand(0);
51449 const SDValue RHS = N->getOperand(1);
51450 EVT VT = N->getValueType(0);
51451 EVT OpVT = LHS.getValueType();
51452 SDLoc DL(N);
51453
51454 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
51455 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
51456 return V;
51457
51458 if (VT == MVT::i1 && isNullConstant(RHS)) {
51459 SDValue X86CC;
51460 if (SDValue V =
51461 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
51462 return DAG.getNode(ISD::TRUNCATE, DL, VT,
51463 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
51464 }
51465
51466 if (OpVT.isScalarInteger()) {
51467 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
51468 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
51469 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
51470 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
51471 if (N0.getOperand(0) == N1)
51472 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51473 N0.getOperand(1));
51474 if (N0.getOperand(1) == N1)
51475 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51476 N0.getOperand(0));
51477 }
51478 return SDValue();
51479 };
51480 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
51481 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51482 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
51483 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51484
51485 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
51486 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
51487 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
51488 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
51489 if (N0.getOperand(0) == N1)
51490 return DAG.getNode(ISD::AND, DL, OpVT, N1,
51491 DAG.getNOT(DL, N0.getOperand(1), OpVT));
51492 if (N0.getOperand(1) == N1)
51493 return DAG.getNode(ISD::AND, DL, OpVT, N1,
51494 DAG.getNOT(DL, N0.getOperand(0), OpVT));
51495 }
51496 return SDValue();
51497 };
51498 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
51499 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51500 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
51501 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51502
51503 // cmpeq(trunc(x),0) --> cmpeq(x,0)
51504 // cmpne(trunc(x),0) --> cmpne(x,0)
51505 // iff the upper bits of x are zero.
51506 // TODO: Add support for RHS to be truncate as well?
51507 if (LHS.getOpcode() == ISD::TRUNCATE &&
51508 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
51509 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
51510 EVT SrcVT = LHS.getOperand(0).getValueType();
51511 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
51512 OpVT.getScalarSizeInBits());
51513 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51514 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
51515 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
51516 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
51517 DAG.getConstant(0, DL, SrcVT), CC);
51518 }
51519 }
51520 }
51521
51522 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
51523 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
51524 // Using temporaries to avoid messing up operand ordering for later
51525 // transformations if this doesn't work.
51526 SDValue Op0 = LHS;
51527 SDValue Op1 = RHS;
51528 ISD::CondCode TmpCC = CC;
51529 // Put build_vector on the right.
51530 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
51531 std::swap(Op0, Op1);
51532 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
51533 }
51534
51535 bool IsSEXT0 =
51536 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
51537 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
51538 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
51539
51540 if (IsSEXT0 && IsVZero1) {
51541 assert(VT == Op0.getOperand(0).getValueType() &&
51542 "Unexpected operand type");
51543 if (TmpCC == ISD::SETGT)
51544 return DAG.getConstant(0, DL, VT);
51545 if (TmpCC == ISD::SETLE)
51546 return DAG.getConstant(1, DL, VT);
51547 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
51548 return DAG.getNOT(DL, Op0.getOperand(0), VT);
51549
51550 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
51551 "Unexpected condition code!");
51552 return Op0.getOperand(0);
51553 }
51554 }
51555
51556 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
51557 // pre-promote its result type since vXi1 vectors don't get promoted
51558 // during type legalization.
51559 // NOTE: The element count check is to ignore operand types that need to
51560 // go through type promotion to a 128-bit vector.
51561 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
51562 VT.getVectorElementType() == MVT::i1 &&
51563 (OpVT.getVectorElementType() == MVT::i8 ||
51564 OpVT.getVectorElementType() == MVT::i16)) {
51565 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
51566 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
51567 }
51568
51569 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
51570 // to avoid scalarization via legalization because v4i32 is not a legal type.
51571 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
51572 LHS.getValueType() == MVT::v4f32)
51573 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
51574
51575 // X pred 0.0 --> X pred -X
51576 // If the negation of X already exists, use it in the comparison. This removes
51577 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
51578 // instructions in patterns with a 'select' node.
51579 if (isNullFPScalarOrVectorConst(RHS)) {
51580 SDVTList FNegVT = DAG.getVTList(OpVT);
51581 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
51582 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
51583 }
51584
51585 return SDValue();
51586}
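
// Editorial sketch (not part of this file): the scalar identities behind the
// MatchOrCmpEq/MatchAndCmpEq folds in combineSetCC above. Helper name is
// hypothetical.
#include <cassert>
#include <cstdint>

static void OrAndCmpEqIdentities(uint64_t X, uint64_t Y) {
  assert(((X | Y) == X) == ((~X & Y) == 0));  // cmpeq(or(X,Y),X) <-> cmpeq(and(~X,Y),0)
  assert(((X & Y) == Y) == ((~X & Y) == 0));  // cmpeq(and(X,Y),Y) <-> cmpeq(and(~X,Y),0)
}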
51587
51588static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
51589 TargetLowering::DAGCombinerInfo &DCI,
51590 const X86Subtarget &Subtarget) {
51591 SDValue Src = N->getOperand(0);
51592 MVT SrcVT = Src.getSimpleValueType();
51593 MVT VT = N->getSimpleValueType(0);
51594 unsigned NumBits = VT.getScalarSizeInBits();
51595 unsigned NumElts = SrcVT.getVectorNumElements();
51596
51597 // Perform constant folding.
51598 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
51599 assert(VT == MVT::i32 && "Unexpected result type");
51600 APInt Imm(32, 0);
51601 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
51602 if (!Src.getOperand(Idx).isUndef() &&
51603 Src.getConstantOperandAPInt(Idx).isNegative())
51604 Imm.setBit(Idx);
51605 }
51606 return DAG.getConstant(Imm, SDLoc(N), VT);
51607 }
51608
51609 // Look through int->fp bitcasts that don't change the element width.
51610 unsigned EltWidth = SrcVT.getScalarSizeInBits();
51611 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
51612 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
51613 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
51614
51615 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
51616 // with scalar comparisons.
51617 if (SDValue NotSrc = IsNOT(Src, DAG)) {
51618 SDLoc DL(N);
51619 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51620 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
51621 return DAG.getNode(ISD::XOR, DL, VT,
51622 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
51623 DAG.getConstant(NotMask, DL, VT));
51624 }
51625
51626 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
51627 // results with scalar comparisons.
51628 if (Src.getOpcode() == X86ISD::PCMPGT &&
51629 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
51630 SDLoc DL(N);
51631 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51632 return DAG.getNode(ISD::XOR, DL, VT,
51633 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
51634 DAG.getConstant(NotMask, DL, VT));
51635 }
51636
51637 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
51638 // iff pow2splat(c1).
51639 if (Src.getOpcode() == X86ISD::PCMPEQ &&
51640 Src.getOperand(0).getOpcode() == ISD::AND &&
51641 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
51642 SDValue LHS = Src.getOperand(0).getOperand(0);
51643 SDValue RHS = Src.getOperand(0).getOperand(1);
51644 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
51645 if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
51646 SDLoc DL(N);
51647 MVT ShiftVT = SrcVT;
51648 if (ShiftVT.getScalarType() == MVT::i8) {
51649 // vXi8 shifts - we only care about the sign bit, so we can use PSLLW.
51650 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
51651 LHS = DAG.getBitcast(ShiftVT, LHS);
51652 }
51653 unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros();
51654 LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
51655 ShiftAmt, DAG);
51656 LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
51657 return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
51658 }
51659 }
51660
51661 // Simplify the inputs.
51662 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51663 APInt DemandedMask(APInt::getAllOnes(NumBits));
51664 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51665 return SDValue(N, 0);
51666
51667 return SDValue();
51668}
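
// Editorial sketch (not part of this file): the movmsk(not(x)) fold above,
// written with SSE2 intrinsics. NOT flips every sign bit, so the collected
// mask is just the XOR of the original mask with the low NumElts bits.
// Helper name is hypothetical.
#include <cassert>
#include <emmintrin.h>   // SSE2

static void MovmskOfNot(__m128i X) {
  __m128i NotX = _mm_xor_si128(X, _mm_set1_epi8((char)0xFF));  // not(x)
  int Before = _mm_movemask_epi8(NotX);                        // movmsk(not(x))
  int After = _mm_movemask_epi8(X) ^ 0xFFFF;                   // xor(movmsk(x), mask)
  assert(Before == After);
}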
51669
51670static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
51671 TargetLowering::DAGCombinerInfo &DCI,
51672 const X86Subtarget &Subtarget) {
51673 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
51674 SDValue BasePtr = MemOp->getBasePtr();
51675 SDValue Index = MemOp->getIndex();
51676 SDValue Scale = MemOp->getScale();
51677 SDValue Mask = MemOp->getMask();
51678
51679 // Attempt to fold an index scale into the scale value directly.
51680 // For smaller indices, implicit sext is performed BEFORE scale, preventing
51681 // this fold under most circumstances.
51682 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
51683 if ((Index.getOpcode() == X86ISD::VSHLI ||
51684 (Index.getOpcode() == ISD::ADD &&
51685 Index.getOperand(0) == Index.getOperand(1))) &&
51686 isa<ConstantSDNode>(Scale) &&
51687 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
51688 unsigned ShiftAmt =
51689 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
51690 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51691 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
51692 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
51693 SDValue NewIndex = Index.getOperand(0);
51694 SDValue NewScale =
51695 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
51696 if (N->getOpcode() == X86ISD::MGATHER)
51697 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
51698 MemOp->getOperand(1), Mask,
51699 MemOp->getBasePtr(), NewIndex, NewScale,
51700 MemOp->getChain(), Subtarget);
51701 if (N->getOpcode() == X86ISD::MSCATTER)
51702 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
51703 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
51704 NewIndex, NewScale, MemOp->getChain(), Subtarget);
51705 }
51706 }
51707
51708 // With vector masks we only demand the upper bit of the mask.
51709 if (Mask.getScalarValueSizeInBits() != 1) {
51710 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51711 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51712 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51713 if (N->getOpcode() != ISD::DELETED_NODE)
51714 DCI.AddToWorklist(N);
51715 return SDValue(N, 0);
51716 }
51717 }
51718
51719 return SDValue();
51720}
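
// Editorial sketch (not part of this file): the index-scale fold above. A
// left-shift (or x+x) on the index can be folded into the gather/scatter
// scale because (index << s) * scale == index * (scale << s), provided the
// combined scale is still a power of two no larger than 8. Helper name is
// hypothetical; address arithmetic is modelled modulo 2^64.
#include <cassert>
#include <cstdint>

static void FoldShiftIntoScale(uint64_t Index, unsigned ShAmt, uint64_t Scale) {
  if (ShAmt > 3)
    return;                                  // combined scale could not be <= 8
  uint64_t NewScale = Scale * (1ULL << ShAmt);
  if (NewScale > 8 || (NewScale & (NewScale - 1)) != 0)
    return;                                  // not encodable; the combine bails out
  assert((Index << ShAmt) * Scale == Index * NewScale);
}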
51721
51722static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
51723 SDValue Index, SDValue Base, SDValue Scale,
51724 SelectionDAG &DAG) {
51725 SDLoc DL(GorS);
51726
51727 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
51728 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
51729 Gather->getMask(), Base, Index, Scale } ;
51730 return DAG.getMaskedGather(Gather->getVTList(),
51731 Gather->getMemoryVT(), DL, Ops,
51732 Gather->getMemOperand(),
51733 Gather->getIndexType(),
51734 Gather->getExtensionType());
51735 }
51736 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
51737 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
51738 Scatter->getMask(), Base, Index, Scale };
51739 return DAG.getMaskedScatter(Scatter->getVTList(),
51740 Scatter->getMemoryVT(), DL,
51741 Ops, Scatter->getMemOperand(),
51742 Scatter->getIndexType(),
51743 Scatter->isTruncatingStore());
51744}
51745
51746static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
51747 TargetLowering::DAGCombinerInfo &DCI) {
51748 SDLoc DL(N);
51749 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
51750 SDValue Index = GorS->getIndex();
51751 SDValue Base = GorS->getBasePtr();
51752 SDValue Scale = GorS->getScale();
51753
51754 if (DCI.isBeforeLegalize()) {
51755 unsigned IndexWidth = Index.getScalarValueSizeInBits();
51756
51757 // Shrink constant indices if they are larger than 32-bits.
51758 // Only do this before legalize types since v2i64 could become v2i32.
51759 // FIXME: We could check that the type is legal if we're after legalize
51760 // types, but then we would need to construct test cases where that happens.
51761 // FIXME: We could support more than just constant vectors, but we need to
51762 // be careful with costing. A truncate that can be optimized out would be fine.
51763 // Otherwise we might only want to create a truncate if it avoids a split.
51764 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
51765 if (BV->isConstant() && IndexWidth > 32 &&
51766 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51767 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51768 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51769 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51770 }
51771 }
51772
51773 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
51774 // there are sufficient sign bits. Only do this before legalize types to
51775 // avoid creating illegal types in truncate.
51776 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
51777 Index.getOpcode() == ISD::ZERO_EXTEND) &&
51778 IndexWidth > 32 &&
51779 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
51780 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51781 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51782 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51783 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51784 }
51785 }
51786
51787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51788 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51789 // Try to move splat constant adders from the index operand to the base
51790 // pointer operand, taking care to multiply by the scale. We can only do
51791 // this when the index element type is the same as the pointer type.
51792 // Otherwise we need to be sure the math doesn't wrap before the scale.
51793 if (Index.getOpcode() == ISD::ADD &&
51794 Index.getValueType().getVectorElementType() == PtrVT &&
51795 isa<ConstantSDNode>(Scale)) {
51796 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51797 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
51798 BitVector UndefElts;
51799 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
51800 // FIXME: Allow non-constant?
51801 if (UndefElts.none()) {
51802 // Apply the scale.
51803 APInt Adder = C->getAPIntValue() * ScaleAmt;
51804 // Add it to the existing base.
51805 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
51806 DAG.getConstant(Adder, DL, PtrVT));
51807 Index = Index.getOperand(0);
51808 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51809 }
51810 }
51811
51812 // It's also possible base is just a constant. In that case, just
51813 // replace it with 0 and move the displacement into the index.
51814 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
51815 isOneConstant(Scale)) {
51816 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
51817 // Combine the constant build_vector and the constant base.
51818 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51819 Index.getOperand(1), Splat);
51820 // Add to the LHS of the original Index add.
51821 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51822 Index.getOperand(0), Splat);
51823 Base = DAG.getConstant(0, DL, Base.getValueType());
51824 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51825 }
51826 }
51827 }
51828
51829 if (DCI.isBeforeLegalizeOps()) {
51830 unsigned IndexWidth = Index.getScalarValueSizeInBits();
51831
51832 // Make sure the index is either i32 or i64
51833 if (IndexWidth != 32 && IndexWidth != 64) {
51834 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
51835 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
51836 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
51837 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51838 }
51839 }
51840
51841 // With vector masks we only demand the upper bit of the mask.
51842 SDValue Mask = GorS->getMask();
51843 if (Mask.getScalarValueSizeInBits() != 1) {
51844 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51845 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51846 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51847 if (N->getOpcode() != ISD::DELETED_NODE)
51848 DCI.AddToWorklist(N);
51849 return SDValue(N, 0);
51850 }
51851 }
51852
51853 return SDValue();
51854}
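
// Editorial sketch (not part of this file): the splat-adder fold above. A
// constant added to every index lane can be pre-scaled and moved into the
// base pointer, since each lane's address is base + index * scale. Helper
// name is hypothetical; address arithmetic is modelled modulo 2^64.
#include <cassert>
#include <cstdint>

static void FoldSplatAdderIntoBase(uint64_t Base, uint64_t IndexLane,
                                   uint64_t C, uint64_t Scale) {
  uint64_t Before = Base + (IndexLane + C) * Scale;        // adder kept in the index
  uint64_t After = (Base + C * Scale) + IndexLane * Scale; // adder folded into base
  assert(Before == After);
}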
51855
51856// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
51857static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
51858 const X86Subtarget &Subtarget) {
51859 SDLoc DL(N);
51860 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
51861 SDValue EFLAGS = N->getOperand(1);
51862
51863 // Try to simplify the EFLAGS and condition code operands.
51864 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
51865 return getSETCC(CC, Flags, DL, DAG);
51866
51867 return SDValue();
51868}
51869
51870/// Optimize branch condition evaluation.
51871static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
51872 const X86Subtarget &Subtarget) {
51873 SDLoc DL(N);
51874 SDValue EFLAGS = N->getOperand(3);
51875 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
51876
51877 // Try to simplify the EFLAGS and condition code operands.
51878 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
51879 // RAUW them under us.
51880 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
51881 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
51882 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
51883 N->getOperand(1), Cond, Flags);
51884 }
51885
51886 return SDValue();
51887}
51888
51889// TODO: Could we move this to DAGCombine?
51890static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
51891 SelectionDAG &DAG) {
51892 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
51893 // to optimize away operation when it's from a constant.
51894 //
51895 // The general transformation is:
51896 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
51897 // AND(VECTOR_CMP(x,y), constant2)
51898 // constant2 = UNARYOP(constant)
51899
51900 // Early exit if this isn't a vector operation, the operand of the
51901 // unary operation isn't a bitwise AND, or if the sizes of the operations
51902 // aren't the same.
51903 EVT VT = N->getValueType(0);
51904 bool IsStrict = N->isStrictFPOpcode();
51905 unsigned NumEltBits = VT.getScalarSizeInBits();
51906 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51907 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
51908 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
51909 VT.getSizeInBits() != Op0.getValueSizeInBits())
51910 return SDValue();
51911
51912 // Now check that the other operand of the AND is a constant. We could
51913 // make the transformation for non-constant splats as well, but it's unclear
51914 // that would be a benefit as it would not eliminate any operations, just
51915 // perform one more step in scalar code before moving to the vector unit.
51916 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
51917 // Bail out if the vector isn't a constant.
51918 if (!BV->isConstant())
51919 return SDValue();
51920
51921 // Everything checks out. Build up the new and improved node.
51922 SDLoc DL(N);
51923 EVT IntVT = BV->getValueType(0);
51924 // Create a new constant of the appropriate type for the transformed
51925 // DAG.
51926 SDValue SourceConst;
51927 if (IsStrict)
51928 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
51929 {N->getOperand(0), SDValue(BV, 0)});
51930 else
51931 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
51932 // The AND node needs bitcasts to/from an integer vector type around it.
51933 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
51934 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
51935 MaskConst);
51936 SDValue Res = DAG.getBitcast(VT, NewAnd);
51937 if (IsStrict)
51938 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
51939 return Res;
51940 }
51941
51942 return SDValue();
51943}
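
// Editorial sketch (not part of this file): why the AND mask commutes with
// the conversion in combineVectorCompareAndMaskUnaryOp. Each compare lane is
// all-zeros or all-ones; the all-ones lane passes the constant through either
// way, and the all-zeros lane works because sint_to_fp(0) is also the
// all-zero bit pattern. Helper name is hypothetical.
#include <cassert>
#include <cstdint>
#include <cstring>

static void ConvertThroughCmpMask(bool LaneTrue, int32_t C) {
  int32_t Mask = LaneTrue ? -1 : 0;            // one lane of VECTOR_CMP
  float Before = (float)(Mask & C);            // sint_to_fp(and(cmp, C))

  float FC = (float)C;                         // constant2 = sint_to_fp(C)
  int32_t FCBits, AfterBits;
  std::memcpy(&FCBits, &FC, sizeof(FCBits));
  AfterBits = Mask & FCBits;                   // and(cmp, bitcast(constant2))
  float After;
  std::memcpy(&After, &AfterBits, sizeof(After));

  assert(std::memcmp(&Before, &After, sizeof(float)) == 0);
}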
51944
51945/// If we are converting a value to floating-point, try to replace scalar
51946/// truncate of an extracted vector element with a bitcast. This tries to keep
51947/// the sequence on XMM registers rather than moving between vector and GPRs.
51948static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
51949 // TODO: This is currently only used by combineSIntToFP, but it is generalized
51950 // to allow being called by any similar cast opcode.
51951 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
51952 SDValue Trunc = N->getOperand(0);
51953 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
51954 return SDValue();
51955
51956 SDValue ExtElt = Trunc.getOperand(0);
51957 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51958 !isNullConstant(ExtElt.getOperand(1)))
51959 return SDValue();
51960
51961 EVT TruncVT = Trunc.getValueType();
51962 EVT SrcVT = ExtElt.getValueType();
51963 unsigned DestWidth = TruncVT.getSizeInBits();
51964 unsigned SrcWidth = SrcVT.getSizeInBits();
51965 if (SrcWidth % DestWidth != 0)
51966 return SDValue();
51967
51968 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
51969 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
51970 unsigned VecWidth = SrcVecVT.getSizeInBits();
51971 unsigned NumElts = VecWidth / DestWidth;
51972 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
51973 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
51974 SDLoc DL(N);
51975 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
51976 BitcastVec, ExtElt.getOperand(1));
51977 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
51978}
51979
51980static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
51981 const X86Subtarget &Subtarget) {
51982 bool IsStrict = N->isStrictFPOpcode();
51983 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51984 EVT VT = N->getValueType(0);
51985 EVT InVT = Op0.getValueType();
51986
51987 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
51988 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
51989 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
51990 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
51991 unsigned ScalarSize = InVT.getScalarSizeInBits();
51992 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
51993 return SDValue();
51994 SDLoc dl(N);
51995 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
51996 ScalarSize < 16 ? MVT::i16
51997 : ScalarSize < 32 ? MVT::i32
51998 : MVT::i64,
51999 InVT.getVectorNumElements());
52000 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
52001 if (IsStrict)
52002 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
52003 {N->getOperand(0), P});
52004 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
52005 }
52006
52007 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
52008 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
52009 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
52010 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
52011 VT.getScalarType() != MVT::f16) {
52012 SDLoc dl(N);
52013 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
52014 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
52015
52016 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
52017 if (IsStrict)
52018 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52019 {N->getOperand(0), P});
52020 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52021 }
52022
52023 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
52024 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
52025 // the optimization here.
52026 if (DAG.SignBitIsZero(Op0)) {
52027 if (IsStrict)
52028 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
52029 {N->getOperand(0), Op0});
52030 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
52031 }
52032
52033 return SDValue();
52034}
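
// Editorial sketch (not part of this file): the last fold in combineUIntToFP.
// When the sign bit of the input is known to be zero, unsigned and signed
// conversion agree, so the cheaper SINT_TO_FP can be used. Helper name is
// hypothetical.
#include <cassert>
#include <cstdint>

static void UIntToFPWithClearSignBit(uint32_t X) {
  if (X >> 31)
    return;                                  // sign bit set: the fold does not apply
  assert((double)X == (double)(int32_t)X);   // uint_to_fp == sint_to_fp
}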
52035
52036static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
52037 TargetLowering::DAGCombinerInfo &DCI,
52038 const X86Subtarget &Subtarget) {
52039 // First try to optimize away the conversion entirely when it's
52040 // conditionally from a constant. Vectors only.
52041 bool IsStrict = N->isStrictFPOpcode();
52042 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
52043 return Res;
52044
52045 // Now move on to more general possibilities.
52046 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
52047 EVT VT = N->getValueType(0);
52048 EVT InVT = Op0.getValueType();
52049
52050 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
52051 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
52052 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
52053 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
52054 unsigned ScalarSize = InVT.getScalarSizeInBits();
52055 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
52056 return SDValue();
52057 SDLoc dl(N);
52058 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
52059 ScalarSize < 16 ? MVT::i16
52060 : ScalarSize < 32 ? MVT::i32
52061 : MVT::i64,
52062 InVT.getVectorNumElements());
52063 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
52064 if (IsStrict)
52065 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52066 {N->getOperand(0), P});
52067 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52068 }
52069
52070 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
52071 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
52072 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
52073 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
52074 VT.getScalarType() != MVT::f16) {
52075 SDLoc dl(N);
52076 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
52077 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
52078 if (IsStrict)
52079 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52080 {N->getOperand(0), P});
52081 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52082 }
52083
52084 // Without AVX512DQ we only support i64 to float scalar conversion. For both
52085 // vectors and scalars, see if we know that the upper bits are all the sign
52086 // bit, in which case we can truncate the input to i32 and convert from that.
52087 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
52088 unsigned BitWidth = InVT.getScalarSizeInBits();
52089 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
52090 if (NumSignBits >= (BitWidth - 31)) {
52091 EVT TruncVT = MVT::i32;
52092 if (InVT.isVector())
52093 TruncVT = InVT.changeVectorElementType(TruncVT);
52094 SDLoc dl(N);
52095 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
52096 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
52097 if (IsStrict)
52098 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52099 {N->getOperand(0), Trunc});
52100 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
52101 }
52102 // If we're after legalize and the type is v2i32 we need to shuffle and
52103 // use CVTSI2P.
52104 assert(InVT == MVT::v2i64 && "Unexpected VT!");
52105 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
52106 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
52107 { 0, 2, -1, -1 });
52108 if (IsStrict)
52109 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
52110 {N->getOperand(0), Shuf});
52111 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
52112 }
52113 }
52114
52115 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
52116 // a 32-bit target where SSE doesn't support i64->FP operations.
52117 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
52118 Op0.getOpcode() == ISD::LOAD) {
52119 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
52120
52121 // This transformation is not supported if the result type is f16 or f128.
52122 if (VT == MVT::f16 || VT == MVT::f128)
52123 return SDValue();
52124
52125 // If we have AVX512DQ we can use packed conversion instructions unless
52126 // the VT is f80.
52127 if (Subtarget.hasDQI() && VT != MVT::f80)
52128 return SDValue();
52129
52130 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
52131 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
52132 std::pair<SDValue, SDValue> Tmp =
52133 Subtarget.getTargetLowering()->BuildFILD(
52134 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
52135 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
52136 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
52137 return Tmp.first;
52138 }
52139 }
52140
52141 if (IsStrict)
52142 return SDValue();
52143
52144 if (SDValue V = combineToFPTruncExtElt(N, DAG))
52145 return V;
52146
52147 return SDValue();
52148}
52149
52150static bool needCarryOrOverflowFlag(SDValue Flags) {
52151 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
52152
52153 for (const SDNode *User : Flags->uses()) {
52154 X86::CondCode CC;
52155 switch (User->getOpcode()) {
52156 default:
52157 // Be conservative.
52158 return true;
52159 case X86ISD::SETCC:
52160 case X86ISD::SETCC_CARRY:
52161 CC = (X86::CondCode)User->getConstantOperandVal(0);
52162 break;
52163 case X86ISD::BRCOND:
52164 CC = (X86::CondCode)User->getConstantOperandVal(2);
52165 break;
52166 case X86ISD::CMOV:
52167 CC = (X86::CondCode)User->getConstantOperandVal(2);
52168 break;
52169 }
52170
52171 switch (CC) {
52172 default: break;
52173 case X86::COND_A: case X86::COND_AE:
52174 case X86::COND_B: case X86::COND_BE:
52175 case X86::COND_O: case X86::COND_NO:
52176 case X86::COND_G: case X86::COND_GE:
52177 case X86::COND_L: case X86::COND_LE:
52178 return true;
52179 }
52180 }
52181
52182 return false;
52183}
52184
52185static bool onlyZeroFlagUsed(SDValue Flags) {
52186 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
52187
52188 for (const SDNode *User : Flags->uses()) {
52189 unsigned CCOpNo;
52190 switch (User->getOpcode()) {
52191 default:
52192 // Be conservative.
52193 return false;
52194 case X86ISD::SETCC: CCOpNo = 0; break;
52195 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
52196 case X86ISD::BRCOND: CCOpNo = 2; break;
52197 case X86ISD::CMOV: CCOpNo = 2; break;
52198 }
52199
52200 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
52201 if (CC != X86::COND_E && CC != X86::COND_NE)
52202 return false;
52203 }
52204
52205 return true;
52206}
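// Usage sketch for the two helpers above (flag consumers assumed): a
// (X86ISD::SETCC COND_E, flags) or (X86ISD::BRCOND ..., COND_NE, flags) user
// keeps onlyZeroFlagUsed() true and needCarryOrOverflowFlag() false, while a
// (X86ISD::SETCC COND_B, flags) user does the opposite, blocking the
// narrowing combines below that may not preserve CF/OF.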
52207
52208static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
52209 // Only handle test patterns.
52210 if (!isNullConstant(N->getOperand(1)))
52211 return SDValue();
52212
52213 // If we have a CMP of a truncated binop, see if we can make a smaller binop
52214 // and use its flags directly.
52215 // TODO: Maybe we should try promoting compares that only use the zero flag
52216 // first if we can prove the upper bits with computeKnownBits?
52217 SDLoc dl(N);
52218 SDValue Op = N->getOperand(0);
52219 EVT VT = Op.getValueType();
52220
52221 // If we have a constant logical shift that's only used in a comparison
52222 // against zero turn it into an equivalent AND. This allows turning it into
52223 // a TEST instruction later.
52224 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
52225 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
52226 onlyZeroFlagUsed(SDValue(N, 0))) {
52227 unsigned BitWidth = VT.getSizeInBits();
52228 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
52229 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
52230 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
52231 APInt Mask = Op.getOpcode() == ISD::SRL
52232 ? APInt::getHighBitsSet(BitWidth, MaskBits)
52233 : APInt::getLowBitsSet(BitWidth, MaskBits);
52234 if (Mask.isSignedIntN(32)) {
52235 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
52236 DAG.getConstant(Mask, dl, VT));
52237 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52238 DAG.getConstant(0, dl, VT));
52239 }
52240 }
52241 }
52242
52243 // Peek through any zero-extend if we're only testing for a zero result.
52244 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
52245 SDValue Src = Op.getOperand(0);
52246 EVT SrcVT = Src.getValueType();
52247 if (SrcVT.getScalarSizeInBits() >= 8 &&
52248 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
52249 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
52250 DAG.getConstant(0, dl, SrcVT));
52251 }
52252
52253 // Look for a truncate.
52254 if (Op.getOpcode() != ISD::TRUNCATE)
52255 return SDValue();
52256
52257 SDValue Trunc = Op;
52258 Op = Op.getOperand(0);
52259
52260 // See if we can compare with zero against the truncation source,
52261 // which should help using the Z flag from many ops. Only do this for
52262 // i32 truncated op to prevent partial-reg compares of promoted ops.
52263 EVT OpVT = Op.getValueType();
52264 APInt UpperBits =
52265 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
52266 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
52267 onlyZeroFlagUsed(SDValue(N, 0))) {
52268 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52269 DAG.getConstant(0, dl, OpVT));
52270 }
52271
52272 // After this the truncate and arithmetic op must have a single use.
52273 if (!Trunc.hasOneUse() || !Op.hasOneUse())
52274 return SDValue();
52275
52276 unsigned NewOpc;
52277 switch (Op.getOpcode()) {
52278 default: return SDValue();
52279 case ISD::AND:
52280 // Skip AND with a constant operand. We have special handling for AND with an
52281 // immediate during isel to generate TEST instructions.
52282 if (isa<ConstantSDNode>(Op.getOperand(1)))
52283 return SDValue();
52284 NewOpc = X86ISD::AND;
52285 break;
52286 case ISD::OR: NewOpc = X86ISD::OR; break;
52287 case ISD::XOR: NewOpc = X86ISD::XOR; break;
52288 case ISD::ADD:
52289 // If the carry or overflow flag is used, we can't truncate.
52290 if (needCarryOrOverflowFlag(SDValue(N, 0)))
52291 return SDValue();
52292 NewOpc = X86ISD::ADD;
52293 break;
52294 case ISD::SUB:
52295 // If the carry or overflow flag is used, we can't truncate.
52296 if (needCarryOrOverflowFlag(SDValue(N, 0)))
52297 return SDValue();
52298 NewOpc = X86ISD::SUB;
52299 break;
52300 }
52301
52302 // We found an op we can narrow. Truncate its inputs.
52303 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
52304 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
52305
52306 // Use an X86-specific opcode to avoid DAG combine messing with it.
52307 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52308 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
52309
52310 // For AND, keep a CMP so that we can match the test pattern.
52311 if (NewOpc == X86ISD::AND)
52312 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52313 DAG.getConstant(0, dl, VT));
52314
52315 // Return the flags.
52316 return Op.getValue(1);
52317}
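// Worked example for combineCMP (illustrative; the i8/i32 types are assumed):
//   (X86ISD::CMP (i8 (trunc (i32 xor X, Y))), 0)
// with single-use trunc/xor becomes an 8-bit (X86ISD::XOR (trunc X), (trunc Y))
// and its flag result (value 1) is returned directly, so the wide xor and the
// explicit compare disappear. For AND the combine keeps a CMP against zero so
// instruction selection can still form a TEST.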
52318
52319static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
52320 TargetLowering::DAGCombinerInfo &DCI) {
52321 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
52322 "Expected X86ISD::ADD or X86ISD::SUB");
52323
52324 SDLoc DL(N);
52325 SDValue LHS = N->getOperand(0);
52326 SDValue RHS = N->getOperand(1);
52327 MVT VT = LHS.getSimpleValueType();
52328 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
52329
52330 // If we don't use the flag result, simplify back to a generic ADD/SUB.
52331 if (!N->hasAnyUseOfValue(1)) {
52332 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
52333 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
52334 }
52335
52336 // Fold any similar generic ADD/SUB opcodes to reuse this node.
52337 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
52338 SDValue Ops[] = {N0, N1};
52339 SDVTList VTs = DAG.getVTList(N->getValueType(0));
52340 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
52341 SDValue Op(N, 0);
52342 if (Negate)
52343 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
52344 DCI.CombineTo(GenericAddSub, Op);
52345 }
52346 };
52347 MatchGeneric(LHS, RHS, false);
52348 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
52349
52350 return SDValue();
52351}
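// Worked example (illustrative): if only the arithmetic result of
// (X86ISD::SUB A, B) is used, it decays back to (ISD::SUB A, B) plus a dummy
// zero flag value. Conversely, an existing generic (ISD::SUB A, B) is combined
// onto value 0 of this node, and an existing (ISD::SUB B, A) is replaced with
// 0 - (A - B), so one flag-producing subtract serves all users.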
52352
52353static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
52354 SDValue LHS = N->getOperand(0);
52355 SDValue RHS = N->getOperand(1);
52356 SDValue BorrowIn = N->getOperand(2);
52357
52358 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
52359 MVT VT = N->getSimpleValueType(0);
52360 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52361 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
52362 }
52363
52364 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
52365 // iff the flag result is dead.
52366 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
52367 !N->hasAnyUseOfValue(1))
52368 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
52369 LHS.getOperand(1), BorrowIn);
52370
52371 return SDValue();
52372}
52373
52374// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
52375static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
52376 TargetLowering::DAGCombinerInfo &DCI) {
52377 SDValue LHS = N->getOperand(0);
52378 SDValue RHS = N->getOperand(1);
52379 SDValue CarryIn = N->getOperand(2);
52380 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
52381 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
52382
52383 // Canonicalize constant to RHS.
52384 if (LHSC && !RHSC)
52385 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
52386 CarryIn);
52387
52388 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
52389 // the result is either zero or one (depending on the input carry bit).
52390 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
52391 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
52392 // We don't have a good way to replace an EFLAGS use, so only do this when
52393 // dead right now.
52394 SDValue(N, 1).use_empty()) {
52395 SDLoc DL(N);
52396 EVT VT = N->getValueType(0);
52397 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
52398 SDValue Res1 = DAG.getNode(
52399 ISD::AND, DL, VT,
52400 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52401 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
52402 DAG.getConstant(1, DL, VT));
52403 return DCI.CombineTo(N, Res1, CarryOut);
52404 }
52405
52406 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
52407 // iff the flag result is dead.
52408 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
52409 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
52410 SDLoc DL(N);
52411 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
52412 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
52413 DAG.getConstant(0, DL, LHS.getValueType()),
52414 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
52415 }
52416
52417 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
52418 MVT VT = N->getSimpleValueType(0);
52419 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52420 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
52421 }
52422
52423 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
52424 // iff the flag result is dead.
52425 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
52426 !N->hasAnyUseOfValue(1))
52427 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
52428 LHS.getOperand(1), CarryIn);
52429
52430 return SDValue();
52431}
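// Worked examples for the constant folds above (values assumed): with a dead
// flag result, (X86ISD::ADC 7, 9, Carry) becomes (X86ISD::ADC 0, 16, Carry),
// freeing one constant register; and (X86ISD::ADC 0, 0, Carry) reduces to
// (AND (SETCC_CARRY COND_B, Carry), 1), i.e. the result is just the incoming
// carry bit.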
52432
52433/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52434/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52435/// with CMP+{ADC, SBB}.
52436/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52437static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52438 SDValue X, SDValue Y,
52439 SelectionDAG &DAG) {
52440 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52441 return SDValue();
52442
52443 // Look through a one-use zext.
52444 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52445 Y = Y.getOperand(0);
52446
52447 X86::CondCode CC;
52448 SDValue EFLAGS;
52449 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52450 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52451 EFLAGS = Y.getOperand(1);
52452 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52453 Y.hasOneUse()) {
52454 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52455 }
52456
52457 if (!EFLAGS)
52458 return SDValue();
52459
52460 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52461 // the general case below.
52462 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52463 if (ConstantX) {
52464 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52465 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52466 // This is a complicated way to get -1 or 0 from the carry flag:
52467 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52468 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52469 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52470 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52471 EFLAGS);
52472 }
52473
52474 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52475 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52476 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52477 EFLAGS.getValueType().isInteger() &&
52478 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52479 // Swap the operands of a SUB, and we have the same pattern as above.
52480 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52481 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52482 SDValue NewSub = DAG.getNode(
52483 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52484 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52485 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52486 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52487 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52488 NewEFLAGS);
52489 }
52490 }
52491 }
52492
52493 if (CC == X86::COND_B) {
52494 // X + SETB Z --> adc X, 0
52495 // X - SETB Z --> sbb X, 0
52496 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52497 DAG.getVTList(VT, MVT::i32), X,
52498 DAG.getConstant(0, DL, VT), EFLAGS);
52499 }
52500
52501 if (CC == X86::COND_A) {
52502 // Try to convert COND_A into COND_B in an attempt to facilitate
52503 // materializing "setb reg".
52504 //
52505 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
52506 // cannot take an immediate as its first operand.
52507 //
52508 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52509 EFLAGS.getValueType().isInteger() &&
52510 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52511 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
52512 EFLAGS.getNode()->getVTList(),
52513 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52514 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52515 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52516 DAG.getVTList(VT, MVT::i32), X,
52517 DAG.getConstant(0, DL, VT), NewEFLAGS);
52518 }
52519 }
52520
52521 if (CC == X86::COND_AE) {
52522 // X + SETAE --> sbb X, -1
52523 // X - SETAE --> adc X, -1
52524 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52525 DAG.getVTList(VT, MVT::i32), X,
52526 DAG.getConstant(-1, DL, VT), EFLAGS);
52527 }
52528
52529 if (CC == X86::COND_BE) {
52530 // X + SETBE --> sbb X, -1
52531 // X - SETBE --> adc X, -1
52532 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52533 // materializing "setae reg".
52534 //
52535 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
52536 // cannot take an immediate as its first operand.
52537 //
52538 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52539 EFLAGS.getValueType().isInteger() &&
52540 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52541 SDValue NewSub = DAG.getNode(
52542 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52543 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52544 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52545 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52546 DAG.getVTList(VT, MVT::i32), X,
52547 DAG.getConstant(-1, DL, VT), NewEFLAGS);
52548 }
52549 }
52550
52551 if (CC != X86::COND_E && CC != X86::COND_NE)
52552 return SDValue();
52553
52554 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52555 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52556 !EFLAGS.getOperand(0).getValueType().isInteger())
52557 return SDValue();
52558
52559 SDValue Z = EFLAGS.getOperand(0);
52560 EVT ZVT = Z.getValueType();
52561
52562 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52563 // the general case below.
52564 if (ConstantX) {
52565 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52566 // fake operands:
52567 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52568 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52569 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52570 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52571 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52572 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52573 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52574 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52575 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52576 SDValue(Neg.getNode(), 1));
52577 }
52578
52579 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52580 // with fake operands:
52581 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52582 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52583 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52584 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52585 SDValue One = DAG.getConstant(1, DL, ZVT);
52586 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52587 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52588 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52589 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52590 Cmp1.getValue(1));
52591 }
52592 }
52593
52594 // (cmp Z, 1) sets the carry flag if Z is 0.
52595 SDValue One = DAG.getConstant(1, DL, ZVT);
52596 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52597 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52598
52599 // Add the flags type for ADC/SBB nodes.
52600 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52601
52602 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52603 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52604 if (CC == X86::COND_NE)
52605 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52606 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
52607
52608 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52609 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52610 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52611 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52612}
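// Rough codegen sketch (AT&T syntax; register assignment assumed): for
//   X + (A u< B)   with A in %rdi, B in %rsi, X in %rax
// the COND_B path above emits roughly
//   cmpq %rsi, %rdi
//   adcq $0, %rax
// replacing the usual cmp+setb+movzx+add sequence with a single ADC.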
52613
52614/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52615/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52616/// with CMP+{ADC, SBB}.
52617static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
52618 bool IsSub = N->getOpcode() == ISD::SUB;
52619 SDValue X = N->getOperand(0);
52620 SDValue Y = N->getOperand(1);
52621 EVT VT = N->getValueType(0);
52622 SDLoc DL(N);
52623
52624 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52625 return ADCOrSBB;
52626
52627 // Commute and try again (negate the result for subtracts).
52628 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52629 if (IsSub)
52630 ADCOrSBB =
52631 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
52632 return ADCOrSBB;
52633 }
52634
52635 return SDValue();
52636}
52637
52638static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
52639 const SDLoc &DL, EVT VT,
52640 const X86Subtarget &Subtarget) {
52641 // Example of pattern we try to detect:
52642 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
52643 //(add (build_vector (extract_elt t, 0),
52644 // (extract_elt t, 2),
52645 // (extract_elt t, 4),
52646 // (extract_elt t, 6)),
52647 // (build_vector (extract_elt t, 1),
52648 // (extract_elt t, 3),
52649 // (extract_elt t, 5),
52650 // (extract_elt t, 7)))
52651
52652 if (!Subtarget.hasSSE2())
52653 return SDValue();
52654
52655 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
52656 Op1.getOpcode() != ISD::BUILD_VECTOR)
52657 return SDValue();
52658
52659 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52660 VT.getVectorNumElements() < 4 ||
52661 !isPowerOf2_32(VT.getVectorNumElements()))
52662 return SDValue();
52663
52664 // Check if one of Op0,Op1 is of the form:
52665 // (build_vector (extract_elt Mul, 0),
52666 // (extract_elt Mul, 2),
52667 // (extract_elt Mul, 4),
52668 // ...
52669 // the other is of the form:
52670 // (build_vector (extract_elt Mul, 1),
52671 // (extract_elt Mul, 3),
52672 // (extract_elt Mul, 5),
52673 // ...
52674 // and identify Mul.
52675 SDValue Mul;
52676 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
52677 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
52678 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
52679 // TODO: Be more tolerant to undefs.
52680 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52681 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52682 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52683 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52684 return SDValue();
52685 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
52686 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
52687 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
52688 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
52689 if (!Const0L || !Const1L || !Const0H || !Const1H)
52690 return SDValue();
52691 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
52692 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
52693 // Commutativity of mul allows factors of a product to reorder.
52694 if (Idx0L > Idx1L)
52695 std::swap(Idx0L, Idx1L);
52696 if (Idx0H > Idx1H)
52697 std::swap(Idx0H, Idx1H);
52698 // Commutativity of add allows pairs of factors to reorder.
52699 if (Idx0L > Idx0H) {
52700 std::swap(Idx0L, Idx0H);
52701 std::swap(Idx1L, Idx1H);
52702 }
52703 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
52704 Idx1H != 2 * i + 3)
52705 return SDValue();
52706 if (!Mul) {
52707 // First time an extract_elt's source vector is visited. Must be a MUL
52708 // with twice as many vector elements as the BUILD_VECTOR.
52709 // Both extracts must be from the same MUL.
52710 Mul = Op0L->getOperand(0);
52711 if (Mul->getOpcode() != ISD::MUL ||
52712 Mul.getValueType().getVectorNumElements() != 2 * e)
52713 return SDValue();
52714 }
52715 // Check that the extract is from the same MUL previously seen.
52716 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
52717 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
52718 return SDValue();
52719 }
52720
52721 // Check if the Mul source can be safely shrunk.
52722 ShrinkMode Mode;
52723 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
52724 Mode == ShrinkMode::MULU16)
52725 return SDValue();
52726
52727 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52728 VT.getVectorNumElements() * 2);
52729 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
52730 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
52731
52732 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52733 ArrayRef<SDValue> Ops) {
52734 EVT InVT = Ops[0].getValueType();
52735 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52736 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52737 InVT.getVectorNumElements() / 2);
52738 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52739 };
52740 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
52741}
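// Worked example (illustrative; v8i16 inputs assumed): for
//   t = (v8i32 mul (sext v8i16 X), (sext v8i16 Y))
// an add of the even-index extracts of t with the odd-index extracts becomes a
// single (X86ISD::VPMADDWD X, Y), whose lane i is X[2i]*Y[2i] + X[2i+1]*Y[2i+1]
// computed directly on the 16-bit inputs.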
52742
52743// Attempt to turn this pattern into PMADDWD.
52744// (add (mul (sext (build_vector)), (sext (build_vector))),
52745// (mul (sext (build_vector)), (sext (build_vector)))
52746static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
52747 const SDLoc &DL, EVT VT,
52748 const X86Subtarget &Subtarget) {
52749 if (!Subtarget.hasSSE2())
52750 return SDValue();
52751
52752 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52753 return SDValue();
52754
52755 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52756 VT.getVectorNumElements() < 4 ||
52757 !isPowerOf2_32(VT.getVectorNumElements()))
52758 return SDValue();
52759
52760 SDValue N00 = N0.getOperand(0);
52761 SDValue N01 = N0.getOperand(1);
52762 SDValue N10 = N1.getOperand(0);
52763 SDValue N11 = N1.getOperand(1);
52764
52765 // All inputs need to be sign extends.
52766 // TODO: Support ZERO_EXTEND from known positive?
52767 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
52768 N01.getOpcode() != ISD::SIGN_EXTEND ||
52769 N10.getOpcode() != ISD::SIGN_EXTEND ||
52770 N11.getOpcode() != ISD::SIGN_EXTEND)
52771 return SDValue();
52772
52773 // Peek through the extends.
52774 N00 = N00.getOperand(0);
52775 N01 = N01.getOperand(0);
52776 N10 = N10.getOperand(0);
52777 N11 = N11.getOperand(0);
52778
52779 // Must be extending from vXi16.
52780 EVT InVT = N00.getValueType();
52781 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
52782 N10.getValueType() != InVT || N11.getValueType() != InVT)
52783 return SDValue();
52784
52785 // All inputs should be build_vectors.
52786 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52787 N01.getOpcode() != ISD::BUILD_VECTOR ||
52788 N10.getOpcode() != ISD::BUILD_VECTOR ||
52789 N11.getOpcode() != ISD::BUILD_VECTOR)
52790 return SDValue();
52791
52792 // For each element, we need to ensure we have an odd element from one vector
52793 // multiplied by the odd element of another vector and the even element from
52794 // one of the same vectors being multiplied by the even element from the
52795 // other vector. So we need to make sure for each element i, this operator
52796 // is being performed:
52797 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52798 SDValue In0, In1;
52799 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
52800 SDValue N00Elt = N00.getOperand(i);
52801 SDValue N01Elt = N01.getOperand(i);
52802 SDValue N10Elt = N10.getOperand(i);
52803 SDValue N11Elt = N11.getOperand(i);
52804 // TODO: Be more tolerant to undefs.
52805 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52806 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52807 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52808 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52809 return SDValue();
52810 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52811 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52812 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52813 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52814 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52815 return SDValue();
52816 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52817 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52818 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52819 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52820 // Add is commutative so indices can be reordered.
52821 if (IdxN00 > IdxN10) {
52822 std::swap(IdxN00, IdxN10);
52823 std::swap(IdxN01, IdxN11);
52824 }
52825 // N0 indices must be the even element. N1 indices must be the next odd element.
52826 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52827 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52828 return SDValue();
52829 SDValue N00In = N00Elt.getOperand(0);
52830 SDValue N01In = N01Elt.getOperand(0);
52831 SDValue N10In = N10Elt.getOperand(0);
52832 SDValue N11In = N11Elt.getOperand(0);
52833
52834 // First time we find an input capture it.
52835 if (!In0) {
52836 In0 = N00In;
52837 In1 = N01In;
52838
52839 // The input vectors must be at least as wide as the output.
52840 // If they are larger than the output, we extract subvector below.
52841 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
52842 In1.getValueSizeInBits() < VT.getSizeInBits())
52843 return SDValue();
52844 }
52845 // Mul is commutative so the input vectors can be in any order.
52846 // Canonicalize to make the compares easier.
52847 if (In0 != N00In)
52848 std::swap(N00In, N01In);
52849 if (In0 != N10In)
52850 std::swap(N10In, N11In);
52851 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
52852 return SDValue();
52853 }
52854
52855 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52856 ArrayRef<SDValue> Ops) {
52857 EVT OpVT = Ops[0].getValueType();
52858 assert(OpVT.getScalarType() == MVT::i16 &&
52859 "Unexpected scalar element type");
52860 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
52861 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52862 OpVT.getVectorNumElements() / 2);
52863 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52864 };
52865
52866 // If the output is narrower than an input, extract the low part of the input
52867 // vector.
52868 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52869 VT.getVectorNumElements() * 2);
52870 if (OutVT16.bitsLT(In0.getValueType())) {
52871 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
52872 DAG.getIntPtrConstant(0, DL));
52873 }
52874 if (OutVT16.bitsLT(In1.getValueType())) {
52875 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
52876 DAG.getIntPtrConstant(0, DL));
52877 }
52878 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
52879 PMADDBuilder);
52880}
52881
52882// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
52883 // If the upper element in each pair of both VPMADDWD operands is zero, we can merge
52884// the operand elements and use the implicit add of VPMADDWD.
52885// TODO: Add support for VPMADDUBSW (which isn't commutable).
52886static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
52887 const SDLoc &DL, EVT VT) {
52888 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
52889 return SDValue();
52890
52891 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
52892 if (VT.getSizeInBits() > 128)
52893 return SDValue();
52894
52895 unsigned NumElts = VT.getVectorNumElements();
52896 MVT OpVT = N0.getOperand(0).getSimpleValueType();
52897 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
52898 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
52899
52900 bool Op0HiZero =
52901 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
52902 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
52903 bool Op1HiZero =
52904 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
52905 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
52906
52907 // TODO: Check for zero lower elements once we have actual codegen that
52908 // creates them.
52909 if (!Op0HiZero || !Op1HiZero)
52910 return SDValue();
52911
52912 // Create a shuffle mask packing the lower elements from each VPMADDWD.
52913 SmallVector<int> Mask;
52914 for (int i = 0; i != (int)NumElts; ++i) {
52915 Mask.push_back(2 * i);
52916 Mask.push_back(2 * (i + NumElts));
52917 }
52918
52919 SDValue LHS =
52920 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
52921 SDValue RHS =
52922 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
52923 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
52924}
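// Worked example (illustrative; v4i32/v8i16 types assumed): if the odd 16-bit
// lanes of X (or Y) and of Z (or W) are known zero, then
//   add (VPMADDWD X, Y), (VPMADDWD Z, W)
// becomes (VPMADDWD shuffle(X,Z), shuffle(Y,W)) with the interleaving mask
// {0,8, 2,10, 4,12, 6,14}, letting VPMADDWD's implicit pairwise add perform
// the outer addition as well.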
52925
52926/// CMOV of constants requires materializing constant operands in registers.
52927/// Try to fold those constants into an 'add' instruction to reduce instruction
52928 /// count. We do this with CMOV rather than the generic 'select' because there are
52929/// earlier folds that may be used to turn select-of-constants into logic hacks.
52930static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
52931 const X86Subtarget &Subtarget) {
52932 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
52933 // better because we eliminate 1-2 instructions. This transform is still
52934 // an improvement without zero operands because we trade 2 move constants and
52935 // 1 add for 2 adds (LEA) as long as the constants can be represented as
52936 // immediate asm operands (fit in 32-bits).
52937 auto isSuitableCmov = [](SDValue V) {
52938 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
52939 return false;
52940 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
52941 !isa<ConstantSDNode>(V.getOperand(1)))
52942 return false;
52943 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
52944 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
52945 V.getConstantOperandAPInt(1).isSignedIntN(32));
52946 };
52947
52948 // Match an appropriate CMOV as the first operand of the add.
52949 SDValue Cmov = N->getOperand(0);
52950 SDValue OtherOp = N->getOperand(1);
52951 if (!isSuitableCmov(Cmov))
52952 std::swap(Cmov, OtherOp);
52953 if (!isSuitableCmov(Cmov))
52954 return SDValue();
52955
52956 // Don't remove a load folding opportunity for the add. That would neutralize
52957 // any improvements from removing constant materializations.
52958 if (X86::mayFoldLoad(OtherOp, Subtarget))
52959 return SDValue();
52960
52961 EVT VT = N->getValueType(0);
52962 SDLoc DL(N);
52963 SDValue FalseOp = Cmov.getOperand(0);
52964 SDValue TrueOp = Cmov.getOperand(1);
52965
52966 // We will push the add through the select, but we can potentially do better
52967 // if we know there is another add in the sequence and this is pointer math.
52968 // In that case, we can absorb an add into the trailing memory op and avoid
52969 // a 3-operand LEA which is likely slower than a 2-operand LEA.
52970 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
52971 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
52972 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
52973 all_of(N->uses(), [&](SDNode *Use) {
52974 auto *MemNode = dyn_cast<MemSDNode>(Use);
52975 return MemNode && MemNode->getBasePtr().getNode() == N;
52976 })) {
52977 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
52978 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
52979 // it is possible that choosing op1 might be better.
52980 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
52981 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
52982 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
52983 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
52984 Cmov.getOperand(2), Cmov.getOperand(3));
52985 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
52986 }
52987
52988 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
52989 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
52990 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
52991 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
52992 Cmov.getOperand(3));
52993}
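// Worked example (illustrative; constants assumed): for
//   add (cmov 10, 20, cc), %rbx
// the combine above produces cmov (add %rbx, 10), (add %rbx, 20), cc, trading
// two constant moves plus an add for two adds that can be emitted as LEAs,
// with no constant materialization before the CMOV.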
52994
52995static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
52996 TargetLowering::DAGCombinerInfo &DCI,
52997 const X86Subtarget &Subtarget) {
52998 EVT VT = N->getValueType(0);
52999 SDValue Op0 = N->getOperand(0);
53000 SDValue Op1 = N->getOperand(1);
53001 SDLoc DL(N);
53002
53003 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
53004 return Select;
53005
53006 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
53007 return MAdd;
53008 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
53009 return MAdd;
53010 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
53011 return MAdd;
53012
53013 // Try to synthesize horizontal adds from adds of shuffles.
53014 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
53015 return V;
53016
53017 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
53018 // (sub Y, (sext (vXi1 X))).
53019 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
53020 // generic DAG combine without a legal type check, but adding this there
53021 // caused regressions.
53022 if (VT.isVector()) {
53023 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53024 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
53025 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53026 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
53027 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
53028 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
53029 }
53030
53031 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
53032 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53033 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
53034 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
53035 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
53036 }
53037 }
53038
53039 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
53040 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
53041 X86::isZeroNode(Op0.getOperand(1))) {
53042 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
53043 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
53044 Op0.getOperand(0), Op0.getOperand(2));
53045 }
53046
53047 return combineAddOrSubToADCOrSBB(N, DAG);
53048}
53049
53050// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
53051// condition comes from the subtract node that produced -X. This matches the
53052// cmov expansion for absolute value. By swapping the operands we convert abs
53053// to nabs.
53054static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
53055 SDValue N0 = N->getOperand(0);
53056 SDValue N1 = N->getOperand(1);
53057
53058 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
53059 return SDValue();
53060
53061 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
53062 if (CC != X86::COND_S && CC != X86::COND_NS)
53063 return SDValue();
53064
53065 // Condition should come from a negate operation.
53066 SDValue Cond = N1.getOperand(3);
53067 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
53068 return SDValue();
53069 assert(Cond.getResNo() == 1 && "Unexpected result number");
53070
53071 // Get the X and -X from the negate.
53072 SDValue NegX = Cond.getValue(0);
53073 SDValue X = Cond.getOperand(1);
53074
53075 SDValue FalseOp = N1.getOperand(0);
53076 SDValue TrueOp = N1.getOperand(1);
53077
53078 // Cmov operands should be X and NegX. Order doesn't matter.
53079 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
53080 return SDValue();
53081
53082 // Build a new CMOV with the operands swapped.
53083 SDLoc DL(N);
53084 MVT VT = N->getSimpleValueType(0);
53085 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
53086 N1.getOperand(2), Cond);
53087 // Convert sub to add.
53088 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
53089}
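// Worked example (scalar values assumed): with Y = 10 and X = -3 the pattern
//   sub Y, (cmovns X, -X)        ; abs(X) = 3, result 7
// is rewritten to
//   add Y, (cmovns -X, X)        ; nabs(X) = -3, result 10 + (-3) = 7
// reusing the flags of the original (0 - X) subtract unchanged.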
53090
53091static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
53092 TargetLowering::DAGCombinerInfo &DCI,
53093 const X86Subtarget &Subtarget) {
53094 SDValue Op0 = N->getOperand(0);
53095 SDValue Op1 = N->getOperand(1);
53096
53097 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
53098 auto IsNonOpaqueConstant = [&](SDValue Op) {
53099 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
53100 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
53101 return !Cst->isOpaque();
53102 return true;
53103 }
53104 return false;
53105 };
53106
53107 // X86 can't encode an immediate LHS of a sub. See if we can push the
53108 // negation into a preceding instruction. If the RHS of the sub is a XOR with
53109 // one use and a constant, invert the immediate, saving one register.
53110 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
53111 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
53112 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
53113 SDLoc DL(N);
53114 EVT VT = Op0.getValueType();
53115 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
53116 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
53117 SDValue NewAdd =
53118 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
53119 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
53120 }
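// Worked arithmetic check for the fold above (constants assumed): with C1 = 5
// and C2 = 12, sub(5, xor(X, 12)) == add(xor(X, ~12), 6); e.g. for X = 10,
// xor(10, 12) = 6 and 5 - 6 = -1, while xor(10, ~12) = -7 and -7 + 6 = -1.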
53121
53122 if (SDValue V = combineSubABS(N, DAG))
53123 return V;
53124
53125 // Try to synthesize horizontal subs from subs of shuffles.
53126 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
53127 return V;
53128
53129 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
53130 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
53131 X86::isZeroNode(Op1.getOperand(1))) {
53132 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
53133 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
53134 Op1.getOperand(0), Op1.getOperand(2));
53135 }
53136
53137 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
53138 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
53139 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
53140 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
53141 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
53142 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
53143 Op1.getOperand(1), Op1.getOperand(2));
53144 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
53145 Op1.getOperand(0));
53146 }
53147
53148 return combineAddOrSubToADCOrSBB(N, DAG);
53149}
53150
53151static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
53152 const X86Subtarget &Subtarget) {
53153 MVT VT = N->getSimpleValueType(0);
53154 SDLoc DL(N);
53155
53156 if (N->getOperand(0) == N->getOperand(1)) {
53157 if (N->getOpcode() == X86ISD::PCMPEQ)
53158 return DAG.getConstant(-1, DL, VT);
53159 if (N->getOpcode() == X86ISD::PCMPGT)
53160 return DAG.getConstant(0, DL, VT);
53161 }
53162
53163 return SDValue();
53164}
53165
53166/// Helper that combines an array of subvector ops as if they were the operands
53167 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
53168/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
53169static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
53170 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
53171 TargetLowering::DAGCombinerInfo &DCI,
53172 const X86Subtarget &Subtarget) {
53173 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
53174 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53175
53176 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
53177 return DAG.getUNDEF(VT);
53178
53179 if (llvm::all_of(Ops, [](SDValue Op) {
53180 return ISD::isBuildVectorAllZeros(Op.getNode());
53181 }))
53182 return getZeroVector(VT, Subtarget, DAG, DL);
53183
53184 SDValue Op0 = Ops[0];
53185 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
53186
53187 // Repeated subvectors.
53188 if (IsSplat &&
53189 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
53190 // If this broadcast is inserted into both halves, use a larger broadcast.
53191 if (Op0.getOpcode() == X86ISD::VBROADCAST)
53192 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
53193
53194 // If this simple subvector or scalar/subvector broadcast_load is inserted
53195 // into both halves, use a larger broadcast_load. Update other uses to use
53196 // an extracted subvector.
53197 if (ISD::isNormalLoad(Op0.getNode()) ||
53198 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
53199 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
53200 auto *Mem = cast<MemSDNode>(Op0);
53201 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
53202 ? X86ISD::VBROADCAST_LOAD
53203 : X86ISD::SUBV_BROADCAST_LOAD;
53204 if (SDValue BcastLd =
53205 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
53206 SDValue BcastSrc =
53207 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
53208 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
53209 return BcastLd;
53210 }
53211 }
53212
53213 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
53214 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
53215 (Subtarget.hasAVX2() ||
53216 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
53217 VT.getScalarType(), Subtarget)))
53218 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
53219 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
53220 Op0.getOperand(0),
53221 DAG.getIntPtrConstant(0, DL)));
53222
53223 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
53224 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53225 (Subtarget.hasAVX2() ||
53226 (EltSizeInBits >= 32 &&
53227 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
53228 Op0.getOperand(0).getValueType() == VT.getScalarType())
53229 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
53230
53231 // concat_vectors(extract_subvector(broadcast(x)),
53232 // extract_subvector(broadcast(x))) -> broadcast(x)
53233 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53234 Op0.getOperand(0).getValueType() == VT) {
53235 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
53236 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
53237 return Op0.getOperand(0);
53238 }
53239 }
53240
53241 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
53242 // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
53243 // TODO: This should go in combineX86ShufflesRecursively eventually.
53244 if (VT.is256BitVector() && Ops.size() == 2) {
53245 SDValue Src0 = peekThroughBitcasts(Ops[0]);
53246 SDValue Src1 = peekThroughBitcasts(Ops[1]);
53247 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53248 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
53249 EVT SrcVT0 = Src0.getOperand(0).getValueType();
53250 EVT SrcVT1 = Src1.getOperand(0).getValueType();
53251 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
53252 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
53253 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
53254 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
53255 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
53256 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
53257 DAG.getBitcast(VT, Src0.getOperand(0)),
53258 DAG.getBitcast(VT, Src1.getOperand(0)),
53259 DAG.getTargetConstant(0x31, DL, MVT::i8));
53260 }
53261 }
53262 }
53263
53264 // Repeated opcode.
53265 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
53266 // but it currently struggles with different vector widths.
53267 if (llvm::all_of(Ops, [Op0](SDValue Op) {
53268 return Op.getOpcode() == Op0.getOpcode();
53269 })) {
53270 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
53271 SmallVector<SDValue> Subs;
53272 for (SDValue SubOp : SubOps)
53273 Subs.push_back(SubOp.getOperand(I));
53274 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
53275 };
53276
53277 unsigned NumOps = Ops.size();
53278 switch (Op0.getOpcode()) {
53279 case X86ISD::VBROADCAST: {
53280 if (!IsSplat && VT == MVT::v4f64 && llvm::all_of(Ops, [](SDValue Op) {
53281 return Op.getOperand(0).getValueType().is128BitVector();
53282 }))
53283 return DAG.getNode(X86ISD::MOVDDUP, DL, VT,
53284 ConcatSubOperand(VT, Ops, 0));
53285 break;
53286 }
53287 case X86ISD::MOVDDUP:
53288 case X86ISD::MOVSHDUP:
53289 case X86ISD::MOVSLDUP: {
53290 if (!IsSplat)
53291 return DAG.getNode(Op0.getOpcode(), DL, VT,
53292 ConcatSubOperand(VT, Ops, 0));
53293 break;
53294 }
53295 case X86ISD::SHUFP: {
53296 // Add SHUFPD support if/when necessary.
53297 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
53298 llvm::all_of(Ops, [Op0](SDValue Op) {
53299 return Op.getOperand(2) == Op0.getOperand(2);
53300 })) {
53301 return DAG.getNode(Op0.getOpcode(), DL, VT,
53302 ConcatSubOperand(VT, Ops, 0),
53303 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
53304 }
53305 break;
53306 }
53307 case X86ISD::PSHUFHW:
53308 case X86ISD::PSHUFLW:
53309 case X86ISD::PSHUFD:
53310 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
53311 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
53312 return DAG.getNode(Op0.getOpcode(), DL, VT,
53313 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53314 }
53315 LLVM_FALLTHROUGH;
53316 case X86ISD::VPERMILPI:
53317 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
53318 Op0.getOperand(1) == Ops[1].getOperand(1)) {
53319 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
53320 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
53321 Op0.getOperand(1));
53322 return DAG.getBitcast(VT, Res);
53323 }
53324 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
53325 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
53326 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
53327 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
53328 return DAG.getNode(Op0.getOpcode(), DL, VT,
53329 ConcatSubOperand(VT, Ops, 0),
53330 DAG.getTargetConstant(Idx, DL, MVT::i8));
53331 }
53332 break;
53333 case X86ISD::PSHUFB:
53334 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
53335 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
53336 return DAG.getNode(Op0.getOpcode(), DL, VT,
53337 ConcatSubOperand(VT, Ops, 0),
53338 ConcatSubOperand(VT, Ops, 1));
53339 }
53340 break;
53341 case X86ISD::VPERMV3:
53342 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
53343 MVT OpVT = Op0.getSimpleValueType();
53344 int NumSrcElts = OpVT.getVectorNumElements();
53345 SmallVector<int, 64> ConcatMask;
53346 for (unsigned i = 0; i != NumOps; ++i) {
53347 SmallVector<int, 64> SubMask;
53348 SmallVector<SDValue, 2> SubOps;
53349 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
53350 SubMask))
53351 break;
53352 for (int M : SubMask) {
53353 if (0 <= M) {
53354 M += M < NumSrcElts ? 0 : NumSrcElts;
53355 M += i * NumSrcElts;
53356 }
53357 ConcatMask.push_back(M);
53358 }
53359 }
53360 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
53361 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
53362 Ops[1].getOperand(0), DAG, DL);
53363 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
53364 Ops[1].getOperand(2), DAG, DL);
53365 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
53366 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
53367 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
53368 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
53369 }
53370 }
53371 break;
53372 case X86ISD::VSHLI:
53373 case X86ISD::VSRLI:
53374 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
53375 // TODO: Move this to LowerShiftByScalarImmediate?
53376 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
53377 llvm::all_of(Ops, [](SDValue Op) {
53378 return Op.getConstantOperandAPInt(1) == 32;
53379 })) {
53380 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
53381 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
53382 if (Op0.getOpcode() == X86ISD::VSHLI) {
53383 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
53384 {8, 0, 8, 2, 8, 4, 8, 6});
53385 } else {
53386 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
53387 {1, 8, 3, 8, 5, 8, 7, 8});
53388 }
53389 return DAG.getBitcast(VT, Res);
53390 }
53391 LLVM_FALLTHROUGH;
53392 case X86ISD::VSRAI:
53393 case X86ISD::VSHL:
53394 case X86ISD::VSRL:
53395 case X86ISD::VSRA:
53396 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
53397 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
53398 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
53399 llvm::all_of(Ops, [Op0](SDValue Op) {
53400 return Op0.getOperand(1) == Op.getOperand(1);
53401 })) {
53402 return DAG.getNode(Op0.getOpcode(), DL, VT,
53403 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53404 }
53405 break;
53406 case X86ISD::VPERMI:
53407 case X86ISD::VROTLI:
53408 case X86ISD::VROTRI:
53409 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
53410 llvm::all_of(Ops, [Op0](SDValue Op) {
53411 return Op0.getOperand(1) == Op.getOperand(1);
53412 })) {
53413 return DAG.getNode(Op0.getOpcode(), DL, VT,
53414 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53415 }
53416 break;
53417 case ISD::AND:
53418 case ISD::OR:
53419 case ISD::XOR:
53420 case X86ISD::ANDNP:
53421 // TODO: Add 256-bit support.
53422 if (!IsSplat && VT.is512BitVector()) {
53423 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
53424 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
53425 NumOps * SrcVT.getVectorNumElements());
53426 return DAG.getNode(Op0.getOpcode(), DL, VT,
53427 ConcatSubOperand(SrcVT, Ops, 0),
53428 ConcatSubOperand(SrcVT, Ops, 1));
53429 }
53430 break;
53431 case X86ISD::HADD:
53432 case X86ISD::HSUB:
53433 case X86ISD::FHADD:
53434 case X86ISD::FHSUB:
53435 case X86ISD::PACKSS:
53436 case X86ISD::PACKUS:
53437 if (!IsSplat && VT.is256BitVector() &&
53438 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
53439 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
53440 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
53441 NumOps * SrcVT.getVectorNumElements());
53442 return DAG.getNode(Op0.getOpcode(), DL, VT,
53443 ConcatSubOperand(SrcVT, Ops, 0),
53444 ConcatSubOperand(SrcVT, Ops, 1));
53445 }
53446 break;
53447 case X86ISD::PALIGNR:
53448 if (!IsSplat &&
53449 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
53450 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
53451 llvm::all_of(Ops, [Op0](SDValue Op) {
53452 return Op0.getOperand(2) == Op.getOperand(2);
53453 })) {
53454 return DAG.getNode(Op0.getOpcode(), DL, VT,
53455 ConcatSubOperand(VT, Ops, 0),
53456 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
53457 }
53458 break;
53459 }
53460 }
53461
53462 // Fold subvector loads into one.
53463 // If needed, look through bitcasts to get to the load.
53464 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
53465 bool Fast;
53466 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
53467 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53468 *FirstLd->getMemOperand(), &Fast) &&
53469 Fast) {
53470 if (SDValue Ld =
53471 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
53472 return Ld;
53473 }
53474 }
53475
53476 // Attempt to fold target constant loads.
53477 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
53478 SmallVector<APInt> EltBits;
53479 APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
53480 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
53481 APInt OpUndefElts;
53482 SmallVector<APInt> OpEltBits;
53483 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
53484 OpEltBits, true, false))
53485 break;
53486 EltBits.append(OpEltBits);
53487 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
53488 }
53489 if (EltBits.size() == VT.getVectorNumElements())
53490 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
53491 }
53492
53493 return SDValue();
53494}
53495
53496static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
53497 TargetLowering::DAGCombinerInfo &DCI,
53498 const X86Subtarget &Subtarget) {
53499 EVT VT = N->getValueType(0);
53500 EVT SrcVT = N->getOperand(0).getValueType();
53501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53502
53503 // Don't do anything for i1 vectors.
53504 if (VT.getVectorElementType() == MVT::i1)
53505 return SDValue();
53506
53507 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
53508 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
53509 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
53510 DCI, Subtarget))
53511 return R;
53512 }
53513
53514 return SDValue();
53515}
53516
53517static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
53518 TargetLowering::DAGCombinerInfo &DCI,
53519 const X86Subtarget &Subtarget) {
53520 if (DCI.isBeforeLegalizeOps())
53521 return SDValue();
53522
53523 MVT OpVT = N->getSimpleValueType(0);
53524
53525 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
53526
53527 SDLoc dl(N);
53528 SDValue Vec = N->getOperand(0);
53529 SDValue SubVec = N->getOperand(1);
53530
53531 uint64_t IdxVal = N->getConstantOperandVal(2);
53532 MVT SubVecVT = SubVec.getSimpleValueType();
53533
53534 if (Vec.isUndef() && SubVec.isUndef())
53535 return DAG.getUNDEF(OpVT);
53536
53537 // Inserting undefs/zeros into zeros/undefs is a zero vector.
53538 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
53539 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
53540 return getZeroVector(OpVT, Subtarget, DAG, dl);
53541
53542 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
53543 // If we're inserting into a zero vector and then into a larger zero vector,
53544 // just insert into the larger zero vector directly.
53545 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
53546 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
53547 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
53548 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53549 getZeroVector(OpVT, Subtarget, DAG, dl),
53550 SubVec.getOperand(1),
53551 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
53552 }
53553
53554 // If we're inserting into a zero vector and our input was extracted from an
53555 // insert into a zero vector of the same type, and the extraction was at
53556 // least as large as the original insertion, just insert the original
53557 // subvector into a zero vector.
53558 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
53559 isNullConstant(SubVec.getOperand(1)) &&
53560 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
53561 SDValue Ins = SubVec.getOperand(0);
53562 if (isNullConstant(Ins.getOperand(2)) &&
53563 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
53564 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
53565 SubVecVT.getFixedSizeInBits())
53566 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53567 getZeroVector(OpVT, Subtarget, DAG, dl),
53568 Ins.getOperand(1), N->getOperand(2));
53569 }
53570 }
53571
53572 // Stop here if this is an i1 vector.
53573 if (IsI1Vector)
53574 return SDValue();
53575
53576 // If this is an insert of an extract, combine to a shuffle. Don't do this
53577 // if the insert or extract can be represented with a subregister operation.
53578 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53579 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
53580 (IdxVal != 0 ||
53581 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
53582 int ExtIdxVal = SubVec.getConstantOperandVal(1);
53583 if (ExtIdxVal != 0) {
53584 int VecNumElts = OpVT.getVectorNumElements();
53585 int SubVecNumElts = SubVecVT.getVectorNumElements();
53586 SmallVector<int, 64> Mask(VecNumElts);
53587 // First create an identity shuffle mask.
53588 for (int i = 0; i != VecNumElts; ++i)
53589 Mask[i] = i;
53590 // Now insert the extracted portion.
53591 for (int i = 0; i != SubVecNumElts; ++i)
53592 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
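      // e.g. (illustrative) inserting the upper v2i64 half of a v4i64
      // (ExtIdxVal == 2) at index 0 of another v4i64 gives the mask {6, 7, 2, 3}.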
53593
53594 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
53595 }
53596 }
53597
53598 // Match concat_vector style patterns.
53599 SmallVector<SDValue, 2> SubVectorOps;
53600 if (collectConcatOps(N, SubVectorOps)) {
53601 if (SDValue Fold =
53602 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
53603 return Fold;
53604
53605 // If we're inserting all zeros into the upper half, change this to
53606 // a concat with zero. We will match this to a move
53607 // with implicit upper bit zeroing during isel.
53608 // We do this here because we don't want combineConcatVectorOps to
53609 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
53610 if (SubVectorOps.size() == 2 &&
53611 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
53612 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53613 getZeroVector(OpVT, Subtarget, DAG, dl),
53614 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
53615 }
53616
53617 // If this is a broadcast insert into an upper undef, use a larger broadcast.
53618 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
53619 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
53620
53621 // If this is a broadcast load inserted into an upper undef, use a larger
53622 // broadcast load.
53623 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
53624 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
53625 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
53626 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
53627 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
53628 SDValue BcastLd =
53629 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
53630 MemIntr->getMemoryVT(),
53631 MemIntr->getMemOperand());
53632 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
53633 return BcastLd;
53634 }
53635
53636 // If we're splatting the lower half subvector of a full vector load into the
53637 // upper half, attempt to create a subvector broadcast.
53638 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
53639 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
53640 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
53641 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
53642 if (VecLd && SubLd &&
53643 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
53644 SubVec.getValueSizeInBits() / 8, 0))
53645 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
53646 SubLd, 0, DAG);
53647 }
53648
53649 return SDValue();
53650}
53651
53652/// If we are extracting a subvector of a vector select and the select condition
53653/// is composed of concatenated vectors, try to narrow the select width. This
53654/// is a common pattern for AVX1 integer code because 256-bit selects may be
53655/// legal, but there is almost no integer math/logic available for 256-bit.
53656/// This function should only be called with legal types (otherwise, the calls
53657/// to get simple value types will assert).
53658static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
53659 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
53660 SmallVector<SDValue, 4> CatOps;
53661 if (Sel.getOpcode() != ISD::VSELECT ||
53662 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
53663 return SDValue();
53664
53665 // Note: We assume simple value types because this should only be called with
53666 // legal operations/types.
53667 // TODO: This can be extended to handle extraction to 256-bits.
53668 MVT VT = Ext->getSimpleValueType(0);
53669 if (!VT.is128BitVector())
53670 return SDValue();
53671
53672 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
53673 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
53674 return SDValue();
53675
53676 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
53677 MVT SelVT = Sel.getSimpleValueType();
53678 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
53679        "Unexpected vector type with legal operations");
53680
53681 unsigned SelElts = SelVT.getVectorNumElements();
53682 unsigned CastedElts = WideVT.getVectorNumElements();
53683 unsigned ExtIdx = Ext->getConstantOperandVal(1);
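  // e.g. (illustrative) a v2i64 extract at index 2 from a bitcast v8i32 vselect
  // has SelElts == 8 and CastedElts == 4, so ExtIdx is rescaled from 2 to 4.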
53684 if (SelElts % CastedElts == 0) {
53685 // The select has the same or more (narrower) elements than the extract
53686 // operand. The extraction index gets scaled by that factor.
53687 ExtIdx *= (SelElts / CastedElts);
53688 } else if (CastedElts % SelElts == 0) {
53689 // The select has less (wider) elements than the extract operand. Make sure
53690 // that the extraction index can be divided evenly.
53691 unsigned IndexDivisor = CastedElts / SelElts;
53692 if (ExtIdx % IndexDivisor != 0)
53693 return SDValue();
53694 ExtIdx /= IndexDivisor;
53695 } else {
53696 llvm_unreachable("Element count of simple vector types are not divisible?");
53697 }
53698
53699 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
53700 unsigned NarrowElts = SelElts / NarrowingFactor;
53701 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
53702 SDLoc DL(Ext);
53703 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
53704 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
53705 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
53706 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
53707 return DAG.getBitcast(VT, NarrowSel);
53708}
53709
53710static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
53711 TargetLowering::DAGCombinerInfo &DCI,
53712 const X86Subtarget &Subtarget) {
53713 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
53714 // eventually get combined/lowered into ANDNP) with a concatenated operand,
53715 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
53716 // We let generic combining take over from there to simplify the
53717 // insert/extract and 'not'.
53718 // This pattern emerges during AVX1 legalization. We handle it before lowering
53719 // to avoid complications like splitting constant vector loads.
53720
53721 // Capture the original wide type in the likely case that we need to bitcast
53722 // back to this type.
53723 if (!N->getValueType(0).isSimple())
53724 return SDValue();
53725
53726 MVT VT = N->getSimpleValueType(0);
53727 SDValue InVec = N->getOperand(0);
53728 unsigned IdxVal = N->getConstantOperandVal(1);
53729 SDValue InVecBC = peekThroughBitcasts(InVec);
53730 EVT InVecVT = InVec.getValueType();
53731 unsigned SizeInBits = VT.getSizeInBits();
53732 unsigned InSizeInBits = InVecVT.getSizeInBits();
53733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53734
53735 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
53736 TLI.isTypeLegal(InVecVT) &&
53737 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
53738 auto isConcatenatedNot = [](SDValue V) {
53739 V = peekThroughBitcasts(V);
53740 if (!isBitwiseNot(V))
53741 return false;
53742 SDValue NotOp = V->getOperand(0);
53743 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
53744 };
53745 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
53746 isConcatenatedNot(InVecBC.getOperand(1))) {
53747 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
53748 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
53749 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
53750 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
53751 }
53752 }
53753
53754 if (DCI.isBeforeLegalizeOps())
53755 return SDValue();
53756
53757 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
53758 return V;
53759
53760 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
53761 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53762
53763 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
53764 if (VT.getScalarType() == MVT::i1)
53765 return DAG.getConstant(1, SDLoc(N), VT);
53766 return getOnesVector(VT, DAG, SDLoc(N));
53767 }
53768
53769 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
53770 return DAG.getBuildVector(
53771 VT, SDLoc(N),
53772 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
53773
53774 // If we are extracting from an insert into a larger vector, replace with a
53775 // smaller insert if the extraction covers at least the original inserted
53776 // subvector. Don't do this for i1 vectors.
53777 // TODO: Relax the matching indices requirement?
53778 if (VT.getVectorElementType() != MVT::i1 &&
53779 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
53780 IdxVal == InVec.getConstantOperandVal(2) &&
53781 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
53782 SDLoc DL(N);
53783 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
53784 InVec.getOperand(0), N->getOperand(1));
53785 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
53786 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
53787 InVec.getOperand(1),
53788 DAG.getVectorIdxConstant(NewIdxVal, DL));
53789 }
53790
53791 // If we're extracting an upper subvector from a broadcast, we should just
53792 // extract the lowest subvector instead, which should allow
53793 // SimplifyDemandedVectorElts to do more simplifications.
53794 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
53795 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
53796 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
53797 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53798
53799 // If we're extracting a broadcasted subvector, just use the lowest subvector.
53800 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53801 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
53802 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53803
53804 // Attempt to extract from the source of a shuffle vector.
53805 if ((InSizeInBits % SizeInBits) == 0 &&
53806 (IdxVal % VT.getVectorNumElements()) == 0) {
53807 SmallVector<int, 32> ShuffleMask;
53808 SmallVector<int, 32> ScaledMask;
53809 SmallVector<SDValue, 2> ShuffleInputs;
53810 unsigned NumSubVecs = InSizeInBits / SizeInBits;
53811 // Decode the shuffle mask and scale it so it's shuffling subvectors.
53812 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
53813 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
53814 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
53815 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
53816 return DAG.getUNDEF(VT);
53817 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
53818 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53819 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
53820 if (Src.getValueSizeInBits() == InSizeInBits) {
53821 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
53822 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
53823 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
53824 SDLoc(N), SizeInBits);
53825 }
53826 }
53827 }
53828
53829 // If we're extracting the lowest subvector and we're the only user,
53830 // we may be able to perform this with a smaller vector width.
53831 unsigned InOpcode = InVec.getOpcode();
53832 if (InVec.hasOneUse()) {
53833 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
53834 // v2f64 CVTDQ2PD(v4i32).
53835 if (InOpcode == ISD::SINT_TO_FP &&
53836 InVec.getOperand(0).getValueType() == MVT::v4i32) {
53837 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
53838 }
53839 // v2f64 CVTUDQ2PD(v4i32).
53840 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
53841 InVec.getOperand(0).getValueType() == MVT::v4i32) {
53842 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
53843 }
53844 // v2f64 CVTPS2PD(v4f32).
53845 if (InOpcode == ISD::FP_EXTEND &&
53846 InVec.getOperand(0).getValueType() == MVT::v4f32) {
53847 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
53848 }
53849 }
53850 if (IdxVal == 0 &&
53851 (InOpcode == ISD::ANY_EXTEND ||
53852 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
53853 InOpcode == ISD::ZERO_EXTEND ||
53854 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
53855 InOpcode == ISD::SIGN_EXTEND ||
53856 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
53857 (SizeInBits == 128 || SizeInBits == 256) &&
53858 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
53859 SDLoc DL(N);
53860 SDValue Ext = InVec.getOperand(0);
53861 if (Ext.getValueSizeInBits() > SizeInBits)
53862 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
53863 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
53864 return DAG.getNode(ExtOp, DL, VT, Ext);
53865 }
53866 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
53867 InVec.getOperand(0).getValueType().is256BitVector() &&
53868 InVec.getOperand(1).getValueType().is256BitVector() &&
53869 InVec.getOperand(2).getValueType().is256BitVector()) {
53870 SDLoc DL(N);
53871 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
53872 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
53873 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
53874 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
53875 }
53876 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
53877 (VT.is128BitVector() || VT.is256BitVector())) {
53878 SDLoc DL(N);
53879 SDValue InVecSrc = InVec.getOperand(0);
53880 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
53881 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
53882 return DAG.getNode(InOpcode, DL, VT, Ext);
53883 }
53884 if (InOpcode == X86ISD::MOVDDUP &&
53885 (VT.is128BitVector() || VT.is256BitVector())) {
53886 SDLoc DL(N);
53887 SDValue Ext0 =
53888 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
53889 return DAG.getNode(InOpcode, DL, VT, Ext0);
53890 }
53891 }
53892
53893 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
53894 // as this is very likely to fold into a shuffle/truncation.
53895 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
53896 InVecVT.getScalarSizeInBits() == 64 &&
53897 InVec.getConstantOperandAPInt(1) == 32) {
53898 SDLoc DL(N);
53899 SDValue Ext =
53900 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
53901 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
53902 }
53903
53904 return SDValue();
53905}
53906
53907static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
53908 EVT VT = N->getValueType(0);
53909 SDValue Src = N->getOperand(0);
53910 SDLoc DL(N);
53911
53912 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
53913 // This occurs frequently in our masked scalar intrinsic code and our
53914 // floating point select lowering with AVX512.
53915 // TODO: SimplifyDemandedBits instead?
53916 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
53917 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53918 if (C->getAPIntValue().isOne())
53919 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
53920 Src.getOperand(0));
53921
53922 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
53923 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53924 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
53925 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
53926 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53927 if (C->isZero())
53928 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
53929 Src.getOperand(1));
53930
53931 // Reduce v2i64 to v4i32 if we don't need the upper bits.
53932 // TODO: Move to DAGCombine/SimplifyDemandedBits?
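  // e.g. (v2i64 (scalar_to_vector (i64 (any_extend X:i32)))) becomes
  // (bitcast (v4i32 (scalar_to_vector X))), since the upper 32 bits of the
  // 64-bit element were undefined anyway.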
53933 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
53934 auto IsAnyExt64 = [](SDValue Op) {
53935 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
53936 return SDValue();
53937 if (Op.getOpcode() == ISD::ANY_EXTEND &&
53938 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
53939 return Op.getOperand(0);
53940 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
53941 if (Ld->getExtensionType() == ISD::EXTLOAD &&
53942 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
53943 return Op;
53944 return SDValue();
53945 };
53946 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
53947 return DAG.getBitcast(
53948 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
53949 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
53950 }
53951
53952 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
53953 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
53954 Src.getOperand(0).getValueType() == MVT::x86mmx)
53955 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
53956
53957 // See if we're broadcasting the scalar value, in which case just reuse that.
53958 // Ensure the user is broadcasting this exact SDValue, not just the same node.
53959 if (VT.getScalarType() == Src.getValueType())
53960 for (SDNode *User : Src->uses())
53961 if (User->getOpcode() == X86ISD::VBROADCAST &&
53962 Src == User->getOperand(0)) {
53963 unsigned SizeInBits = VT.getFixedSizeInBits();
53964 unsigned BroadcastSizeInBits =
53965 User->getValueSizeInBits(0).getFixedSize();
53966 if (BroadcastSizeInBits == SizeInBits)
53967 return SDValue(User, 0);
53968 if (BroadcastSizeInBits > SizeInBits)
53969 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
53970 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
53971 // coverage.
53972 }
53973
53974 return SDValue();
53975}
53976
53977// Simplify PMULDQ and PMULUDQ operations.
53978static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
53979 TargetLowering::DAGCombinerInfo &DCI,
53980 const X86Subtarget &Subtarget) {
53981 SDValue LHS = N->getOperand(0);
53982 SDValue RHS = N->getOperand(1);
53983
53984 // Canonicalize constant to RHS.
53985 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
53986 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
53987 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
53988
53989 // Multiply by zero.
53990 // Don't return RHS as it may contain UNDEFs.
53991 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
53992 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
53993
53994 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
53995 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53996 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
53997 return SDValue(N, 0);
53998
53999 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
54000 // convert it to any_extend_invec, due to the LegalOperations check, do the
54001 // conversion directly to a vector shuffle manually. This exposes combine
54002 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
54003 // combineX86ShufflesRecursively on SSE4.1 targets.
54004 // FIXME: This is basically a hack around several other issues related to
54005 // ANY_EXTEND_VECTOR_INREG.
54006 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
54007 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
54008 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
54009 LHS.getOperand(0).getValueType() == MVT::v4i32) {
54010 SDLoc dl(N);
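    // The {0, -1, 1, -1} mask places the two source dwords in the even v4i32
    // lanes and leaves the odd lanes undef; PMULDQ/PMULUDQ only read the low
    // 32 bits of each 64-bit lane, so the undef odd lanes are harmless.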
54011 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
54012 LHS.getOperand(0), { 0, -1, 1, -1 });
54013 LHS = DAG.getBitcast(MVT::v2i64, LHS);
54014 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
54015 }
54016 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
54017 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
54018 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
54019 RHS.getOperand(0).getValueType() == MVT::v4i32) {
54020 SDLoc dl(N);
54021 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
54022 RHS.getOperand(0), { 0, -1, 1, -1 });
54023 RHS = DAG.getBitcast(MVT::v2i64, RHS);
54024 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
54025 }
54026
54027 return SDValue();
54028}
54029
54030// Simplify VPMADDUBSW/VPMADDWD operations.
54031static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
54032 TargetLowering::DAGCombinerInfo &DCI) {
54033 EVT VT = N->getValueType(0);
54034 SDValue LHS = N->getOperand(0);
54035 SDValue RHS = N->getOperand(1);
54036
54037 // Multiply by zero.
54038 // Don't return LHS/RHS as it may contain UNDEFs.
54039 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
54040 ISD::isBuildVectorAllZeros(RHS.getNode()))
54041 return DAG.getConstant(0, SDLoc(N), VT);
54042
54043 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54044 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54045 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54046 return SDValue(N, 0);
54047
54048 return SDValue();
54049}
54050
54051static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
54052 TargetLowering::DAGCombinerInfo &DCI,
54053 const X86Subtarget &Subtarget) {
54054 EVT VT = N->getValueType(0);
54055 SDValue In = N->getOperand(0);
54056 unsigned Opcode = N->getOpcode();
54057 unsigned InOpcode = In.getOpcode();
54058 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54059 SDLoc DL(N);
54060
54061 // Try to merge vector loads and extend_inreg to an extload.
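  // e.g. (illustrative) a sign_extend_vector_inreg of a one-use simple v16i8
  // load to v8i16 becomes a v8i8 -> v8i16 sextload from the same pointer when
  // that extending load is legal; the old load's chain uses are redirected.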
54062 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
54063 In.hasOneUse()) {
54064 auto *Ld = cast<LoadSDNode>(In);
54065 if (Ld->isSimple()) {
54066 MVT SVT = In.getSimpleValueType().getVectorElementType();
54067 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
54068 ? ISD::SEXTLOAD
54069 : ISD::ZEXTLOAD;
54070 EVT MemVT = VT.changeVectorElementType(SVT);
54071 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
54072 SDValue Load = DAG.getExtLoad(
54073 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
54074 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
54075 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
54076 return Load;
54077 }
54078 }
54079 }
54080
54081 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
54082 if (Opcode == InOpcode)
54083 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
54084
54085 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
54086 // -> EXTEND_VECTOR_INREG(X).
54087 // TODO: Handle non-zero subvector indices.
54088 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
54089 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
54090 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
54091 In.getValueSizeInBits())
54092 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
54093
54094 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
54095 // TODO: Move to DAGCombine?
54096 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
54097 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
54098 In.getValueSizeInBits() == VT.getSizeInBits()) {
54099 unsigned NumElts = VT.getVectorNumElements();
54100 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
54101 EVT EltVT = In.getOperand(0).getValueType();
54102 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
54103 for (unsigned I = 0; I != NumElts; ++I)
54104 Elts[I * Scale] = In.getOperand(I);
54105 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
54106 }
54107
54108 // Attempt to combine as a shuffle.
54109 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
54110 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
54111 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
54112 SDValue Op(N, 0);
54113 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
54114 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54115 return Res;
54116 }
54117
54118 return SDValue();
54119}
54120
54121static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
54122 TargetLowering::DAGCombinerInfo &DCI) {
54123 EVT VT = N->getValueType(0);
54124
54125 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
54126 return DAG.getConstant(0, SDLoc(N), VT);
54127
54128 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54129 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54130 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54131 return SDValue(N, 0);
54132
54133 return SDValue();
54134}
54135
54136// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
54137 // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produces
54138 // extra instructions between the conversions due to going to scalar and back.
54139static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
54140 const X86Subtarget &Subtarget) {
54141 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
54142 return SDValue();
54143
54144 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
54145 return SDValue();
54146
54147 if (N->getValueType(0) != MVT::f32 ||
54148 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
54149 return SDValue();
54150
54151 SDLoc dl(N);
54152 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
54153 N->getOperand(0).getOperand(0));
54154 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
54155 DAG.getTargetConstant(4, dl, MVT::i32));
54156 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
54157 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
54158 DAG.getIntPtrConstant(0, dl));
54159}
54160
54161static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
54162 const X86Subtarget &Subtarget) {
54163 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
54164 return SDValue();
54165
54166 if (Subtarget.hasFP16())
54167 return SDValue();
54168
54169 bool IsStrict = N->isStrictFPOpcode();
54170 EVT VT = N->getValueType(0);
54171 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54172 EVT SrcVT = Src.getValueType();
54173
54174 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
54175 return SDValue();
54176
54177 if (VT.getVectorElementType() != MVT::f32 &&
54178 VT.getVectorElementType() != MVT::f64)
54179 return SDValue();
54180
54181 unsigned NumElts = VT.getVectorNumElements();
54182 if (NumElts == 1 || !isPowerOf2_32(NumElts))
54183 return SDValue();
54184
54185 SDLoc dl(N);
54186
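  // Illustrative walk-through for v2f16 -> v2f64: bitcast the source to v2i16,
  // concatenate with zeros up to v8i16, CVTPH2PS to v4f32, extract the low
  // v2f32, and finally fp_extend to v2f64.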
54187 // Convert the input to vXi16.
54188 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
54189 Src = DAG.getBitcast(IntVT, Src);
54190
54191 // Widen to at least 8 input elements.
54192 if (NumElts < 8) {
54193 unsigned NumConcats = 8 / NumElts;
54194 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
54195 : DAG.getConstant(0, dl, IntVT);
54196 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
54197 Ops[0] = Src;
54198 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
54199 }
54200
54201 // Destination is vXf32 with at least 4 elements.
54202 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
54203 std::max(4U, NumElts));
54204 SDValue Cvt, Chain;
54205 if (IsStrict) {
54206 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
54207 {N->getOperand(0), Src});
54208 Chain = Cvt.getValue(1);
54209 } else {
54210 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
54211 }
54212
54213 if (NumElts < 4) {
54214 assert(NumElts == 2 && "Unexpected size");
54215 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
54216 DAG.getIntPtrConstant(0, dl));
54217 }
54218
54219 if (IsStrict) {
54220 // Extend to the original VT if necessary.
54221 if (Cvt.getValueType() != VT) {
54222 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
54223 {Chain, Cvt});
54224 Chain = Cvt.getValue(1);
54225 }
54226 return DAG.getMergeValues({Cvt, Chain}, dl);
54227 }
54228
54229 // Extend to the original VT if necessary.
54230 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
54231}
54232
54233// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
54234// from. Limit this to cases where the loads have the same input chain and the
54235// output chains are unused. This avoids any memory ordering issues.
54236static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
54237 TargetLowering::DAGCombinerInfo &DCI) {
54238 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
54239         N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
54240        "Unknown broadcast load type");
54241
54242 // Only do this if the chain result is unused.
54243 if (N->hasAnyUseOfValue(1))
54244 return SDValue();
54245
54246 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
54247
54248 SDValue Ptr = MemIntrin->getBasePtr();
54249 SDValue Chain = MemIntrin->getChain();
54250 EVT VT = N->getSimpleValueType(0);
54251 EVT MemVT = MemIntrin->getMemoryVT();
54252
54253 // Look at other users of our base pointer and try to find a wider broadcast.
54254 // The input chain and the size of the memory VT must match.
54255 for (SDNode *User : Ptr->uses())
54256 if (User != N && User->getOpcode() == N->getOpcode() &&
54257 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
54258 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
54259 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
54260 MemVT.getSizeInBits() &&
54261 !User->hasAnyUseOfValue(1) &&
54262 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
54263 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
54264 VT.getSizeInBits());
54265 Extract = DAG.getBitcast(VT, Extract);
54266 return DCI.CombineTo(N, Extract, SDValue(User, 1));
54267 }
54268
54269 return SDValue();
54270}
54271
54272static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
54273 const X86Subtarget &Subtarget) {
54274 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
54275 return SDValue();
54276
54277 if (Subtarget.hasFP16())
54278 return SDValue();
54279
54280 EVT VT = N->getValueType(0);
54281 SDValue Src = N->getOperand(0);
54282 EVT SrcVT = Src.getValueType();
54283
54284 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
54285 SrcVT.getVectorElementType() != MVT::f32)
54286 return SDValue();
54287
54288 unsigned NumElts = VT.getVectorNumElements();
54289 if (NumElts == 1 || !isPowerOf2_32(NumElts))
54290 return SDValue();
54291
54292 SDLoc dl(N);
54293
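  // Illustrative walk-through for v2f32 -> v2f16: widen the source to v4f32
  // with zeros, CVTPS2PH to v8i16, extract the low v2i16, and bitcast the
  // result to v2f16.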
54294 // Widen to at least 4 input elements.
54295 if (NumElts < 4)
54296 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
54297 DAG.getConstantFP(0.0, dl, SrcVT));
54298
54299 // Destination is v8i16 with at least 8 elements.
54300 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54301 std::max(8U, NumElts));
54302 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
54303 DAG.getTargetConstant(4, dl, MVT::i32));
54304
54305 // Extract down to real number of elements.
54306 if (NumElts < 8) {
54307 EVT IntVT = VT.changeVectorElementTypeToInteger();
54308 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
54309 DAG.getIntPtrConstant(0, dl));
54310 }
54311
54312 return DAG.getBitcast(VT, Cvt);
54313}
54314
54315static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
54316 SDValue Src = N->getOperand(0);
54317
54318 // Turn MOVDQ2Q+simple_load into an mmx load.
54319 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54320 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
54321
54322 if (LN->isSimple()) {
54323 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
54324 LN->getBasePtr(),
54325 LN->getPointerInfo(),
54326 LN->getOriginalAlign(),
54327 LN->getMemOperand()->getFlags());
54328 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
54329 return NewLd;
54330 }
54331 }
54332
54333 return SDValue();
54334}
54335
54336static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
54337 TargetLowering::DAGCombinerInfo &DCI) {
54338 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
54339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54340 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
54341 return SDValue(N, 0);
54342
54343 return SDValue();
54344}
54345
54346SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
54347 DAGCombinerInfo &DCI) const {
54348 SelectionDAG &DAG = DCI.DAG;
54349 switch (N->getOpcode()) {
54350 default: break;
54351 case ISD::SCALAR_TO_VECTOR:
54352 return combineScalarToVector(N, DAG);
54353 case ISD::EXTRACT_VECTOR_ELT:
54354 case X86ISD::PEXTRW:
54355 case X86ISD::PEXTRB:
54356 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
54357 case ISD::CONCAT_VECTORS:
54358 return combineConcatVectors(N, DAG, DCI, Subtarget);
54359 case ISD::INSERT_SUBVECTOR:
54360 return combineInsertSubvector(N, DAG, DCI, Subtarget);
54361 case ISD::EXTRACT_SUBVECTOR:
54362 return combineExtractSubvector(N, DAG, DCI, Subtarget);
54363 case ISD::VSELECT:
54364 case ISD::SELECT:
54365 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
54366 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
54367 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
54368 case X86ISD::CMP: return combineCMP(N, DAG);
54369 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
54370 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
54371 case X86ISD::ADD:
54372 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
54373 case X86ISD::SBB: return combineSBB(N, DAG);
54374 case X86ISD::ADC: return combineADC(N, DAG, DCI);
54375 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
54376 case ISD::SHL: return combineShiftLeft(N, DAG);
54377 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
54378 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
54379 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
54380 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
54381 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
54382 case X86ISD::BEXTR:
54383 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
54384 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
54385 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
54386 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
54387 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
54388 case X86ISD::VEXTRACT_STORE:
54389 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
54390 case ISD::SINT_TO_FP:
54391 case ISD::STRICT_SINT_TO_FP:
54392 return combineSIntToFP(N, DAG, DCI, Subtarget);
54393 case ISD::UINT_TO_FP:
54394 case ISD::STRICT_UINT_TO_FP:
54395 return combineUIntToFP(N, DAG, Subtarget);
54396 case ISD::FADD:
54397 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
54398 case X86ISD::VFCMULC:
54399 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
54400 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
54401 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
54402 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
54403 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
54404 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
54405 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
54406 case X86ISD::FXOR:
54407 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
54408 case X86ISD::FMIN:
54409 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
54410 case ISD::FMINNUM:
54411 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
54412 case X86ISD::CVTSI2P:
54413 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
54414 case X86ISD::CVTP2SI:
54415 case X86ISD::CVTP2UI:
54416 case X86ISD::STRICT_CVTTP2SI:
54417 case X86ISD::CVTTP2SI:
54418 case X86ISD::STRICT_CVTTP2UI:
54419 case X86ISD::CVTTP2UI:
54420 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
54421 case X86ISD::STRICT_CVTPH2PS:
54422 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
54423 case X86ISD::BT: return combineBT(N, DAG, DCI);
54424 case ISD::ANY_EXTEND:
54425 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
54426 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
54427 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
54428 case ISD::ANY_EXTEND_VECTOR_INREG:
54429 case ISD::SIGN_EXTEND_VECTOR_INREG:
54430 case ISD::ZERO_EXTEND_VECTOR_INREG:
54431 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
54432 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
54433 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
54434 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
54435 case X86ISD::PACKSS:
54436 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
54437 case X86ISD::HADD:
54438 case X86ISD::HSUB:
54439 case X86ISD::FHADD:
54440 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
54441 case X86ISD::VSHL:
54442 case X86ISD::VSRA:
54443 case X86ISD::VSRL:
54444 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
54445 case X86ISD::VSHLI:
54446 case X86ISD::VSRAI:
54447 case X86ISD::VSRLI:
54448 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
54449 case ISD::INSERT_VECTOR_ELT:
54450 case X86ISD::PINSRB:
54451 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
54452 case X86ISD::SHUFP: // Handle all target specific shuffles
54453 case X86ISD::INSERTPS:
54454 case X86ISD::EXTRQI:
54455 case X86ISD::INSERTQI:
54456 case X86ISD::VALIGN:
54457 case X86ISD::PALIGNR:
54458 case X86ISD::VSHLDQ:
54459 case X86ISD::VSRLDQ:
54460 case X86ISD::BLENDI:
54461 case X86ISD::UNPCKH:
54462 case X86ISD::UNPCKL:
54463 case X86ISD::MOVHLPS:
54464 case X86ISD::MOVLHPS:
54465 case X86ISD::PSHUFB:
54466 case X86ISD::PSHUFD:
54467 case X86ISD::PSHUFHW:
54468 case X86ISD::PSHUFLW:
54469 case X86ISD::MOVSHDUP:
54470 case X86ISD::MOVSLDUP:
54471 case X86ISD::MOVDDUP:
54472 case X86ISD::MOVSS:
54473 case X86ISD::MOVSD:
54474 case X86ISD::MOVSH:
54475 case X86ISD::VBROADCAST:
54476 case X86ISD::VPPERM:
54477 case X86ISD::VPERMI:
54478 case X86ISD::VPERMV:
54479 case X86ISD::VPERMV3:
54480 case X86ISD::VPERMIL2:
54481 case X86ISD::VPERMILPI:
54482 case X86ISD::VPERMILPV:
54483 case X86ISD::VPERM2X128:
54484 case X86ISD::SHUF128:
54485 case X86ISD::VZEXT_MOVL:
54486 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
54487 case X86ISD::FMADD_RND:
54488 case X86ISD::FMSUB:
54489 case X86ISD::STRICT_FMSUB:
54490 case X86ISD::FMSUB_RND:
54491 case X86ISD::FNMADD:
54492 case X86ISD::STRICT_FNMADD:
54493 case X86ISD::FNMADD_RND:
54494 case X86ISD::FNMSUB:
54495 case X86ISD::STRICT_FNMSUB:
54496 case X86ISD::FNMSUB_RND:
54497 case ISD::FMA:
54498 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
54499 case X86ISD::FMADDSUB_RND:
54500 case X86ISD::FMSUBADD_RND:
54501 case X86ISD::FMADDSUB:
54502 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
54503 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
54504 case X86ISD::MGATHER:
54505 case X86ISD::MSCATTER:
54506 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
54507 case ISD::MGATHER:
54508 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
54509 case X86ISD::PCMPEQ:
54510 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
54511 case X86ISD::PMULDQ:
54512 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
54513 case X86ISD::VPMADDUBSW:
54514 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
54515 case X86ISD::KSHIFTL:
54516 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
54517 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
54518 case ISD::STRICT_FP_EXTEND:
54519 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
54520 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
54521 case X86ISD::VBROADCAST_LOAD:
54522 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
54523 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
54524 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
54525 }
54526
54527 return SDValue();
54528}
54529
54530bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
54531 if (!isTypeLegal(VT))
54532 return false;
54533
54534 // There are no vXi8 shifts.
54535 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
54536 return false;
54537
54538 // TODO: Almost no 8-bit ops are desirable because they have no actual
54539 // size/speed advantages vs. 32-bit ops, but they do have a major
54540 // potential disadvantage by causing partial register stalls.
54541 //
54542 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
54543 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
54544 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
54545 // check for a constant operand to the multiply.
54546 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
54547 return false;
54548
54549 // i16 instruction encodings are longer and some i16 instructions are slow,
54550 // so those are not desirable.
54551 if (VT == MVT::i16) {
54552 switch (Opc) {
54553 default:
54554 break;
54555 case ISD::LOAD:
54556 case ISD::SIGN_EXTEND:
54557 case ISD::ZERO_EXTEND:
54558 case ISD::ANY_EXTEND:
54559 case ISD::SHL:
54560 case ISD::SRA:
54561 case ISD::SRL:
54562 case ISD::SUB:
54563 case ISD::ADD:
54564 case ISD::MUL:
54565 case ISD::AND:
54566 case ISD::OR:
54567 case ISD::XOR:
54568 return false;
54569 }
54570 }
54571
54572 // Any legal type not explicitly accounted for above here is desirable.
54573 return true;
54574}
54575
54576SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
54577 SDValue Value, SDValue Addr,
54578 SelectionDAG &DAG) const {
54579 const Module *M = DAG.getMachineFunction().getMMI().getModule();
54580 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
54581 if (IsCFProtectionSupported) {
54582 // In case control-flow branch protection is enabled, we need to add a
54583 // notrack prefix to the indirect branch.
54584 // In order to do that we create an NT_BRIND SDNode.
54585 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
54586 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
54587 }
54588
54589 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
54590}
54591
54592bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
54593 EVT VT = Op.getValueType();
54594 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
54595 isa<ConstantSDNode>(Op.getOperand(1));
54596
54597 // i16 is legal, but undesirable since i16 instruction encodings are longer
54598 // and some i16 instructions are slow.
54599 // 8-bit multiply-by-constant can usually be expanded to something cheaper
54600 // using LEA and/or other ALU ops.
54601 if (VT != MVT::i16 && !Is8BitMulByConstant)
54602 return false;
54603
54604 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
54605 if (!Op.hasOneUse())
54606 return false;
54607 SDNode *User = *Op->use_begin();
54608 if (!ISD::isNormalStore(User))
54609 return false;
54610 auto *Ld = cast<LoadSDNode>(Load);
54611 auto *St = cast<StoreSDNode>(User);
54612 return Ld->getBasePtr() == St->getBasePtr();
54613 };
54614
54615 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
54616 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
54617 return false;
54618 if (!Op.hasOneUse())
54619 return false;
54620 SDNode *User = *Op->use_begin();
54621 if (User->getOpcode() != ISD::ATOMIC_STORE)
54622 return false;
54623 auto *Ld = cast<AtomicSDNode>(Load);
54624 auto *St = cast<AtomicSDNode>(User);
54625 return Ld->getBasePtr() == St->getBasePtr();
54626 };
54627
54628 bool Commute = false;
54629 switch (Op.getOpcode()) {
54630 default: return false;
54631 case ISD::SIGN_EXTEND:
54632 case ISD::ZERO_EXTEND:
54633 case ISD::ANY_EXTEND:
54634 break;
54635 case ISD::SHL:
54636 case ISD::SRA:
54637 case ISD::SRL: {
54638 SDValue N0 = Op.getOperand(0);
54639 // Look out for (store (shl (load), x)).
54640 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
54641 return false;
54642 break;
54643 }
54644 case ISD::ADD:
54645 case ISD::MUL:
54646 case ISD::AND:
54647 case ISD::OR:
54648 case ISD::XOR:
54649 Commute = true;
54650 LLVM_FALLTHROUGH;
54651 case ISD::SUB: {
54652 SDValue N0 = Op.getOperand(0);
54653 SDValue N1 = Op.getOperand(1);
54654 // Avoid disabling potential load folding opportunities.
54655 if (X86::mayFoldLoad(N1, Subtarget) &&
54656 (!Commute || !isa<ConstantSDNode>(N0) ||
54657 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
54658 return false;
54659 if (X86::mayFoldLoad(N0, Subtarget) &&
54660 ((Commute && !isa<ConstantSDNode>(N1)) ||
54661 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
54662 return false;
54663 if (IsFoldableAtomicRMW(N0, Op) ||
54664 (Commute && IsFoldableAtomicRMW(N1, Op)))
54665 return false;
54666 }
54667 }
54668
54669 PVT = MVT::i32;
54670 return true;
54671}
54672
54673//===----------------------------------------------------------------------===//
54674// X86 Inline Assembly Support
54675//===----------------------------------------------------------------------===//
54676
54678 // Helper to match a string against a sequence of pieces separated by whitespace.
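// e.g. matchAsm("  bswap $0", {"bswap", "$0"}) succeeds, while
// matchAsm("bswapl $0", {"bswap", "$0"}) fails because "bswap" only matches a
// prefix of the "bswapl" token.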
54678static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
54679 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
54680
54681 for (StringRef Piece : Pieces) {
54682 if (!S.startswith(Piece)) // Check if the piece matches.
54683 return false;
54684
54685 S = S.substr(Piece.size());
54686 StringRef::size_type Pos = S.find_first_not_of(" \t");
54687 if (Pos == 0) // We matched a prefix.
54688 return false;
54689
54690 S = S.substr(Pos);
54691 }
54692
54693 return S.empty();
54694}
54695
54696static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
54697
54698 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
54699 if (llvm::is_contained(AsmPieces, "~{cc}") &&
54700 llvm::is_contained(AsmPieces, "~{flags}") &&
54701 llvm::is_contained(AsmPieces, "~{fpsr}")) {
54702
54703 if (AsmPieces.size() == 3)
54704 return true;
54705 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
54706 return true;
54707 }
54708 }
54709 return false;
54710}
54711
54712bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
54713 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
54714
54715 const std::string &AsmStr = IA->getAsmString();
54716
54717 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
54718 if (!Ty || Ty->getBitWidth() % 16 != 0)
54719 return false;
54720
54721 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
54722 SmallVector<StringRef, 4> AsmPieces;
54723 SplitString(AsmStr, AsmPieces, ";\n");
54724
54725 switch (AsmPieces.size()) {
54726 default: return false;
54727 case 1:
54728 // FIXME: this should verify that we are targeting a 486 or better. If not,
54729 // we will turn this bswap into something that will be lowered to logical
54730 // ops instead of emitting the bswap asm. For now, we don't support 486 or
54731 // lower so don't worry about this.
54732 // bswap $0
54733 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
54734 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
54735 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
54736 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
54737 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
54738 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
54739 // No need to check constraints, nothing other than the equivalent of
54740 // "=r,0" would be valid here.
54741 return IntrinsicLowering::LowerToByteSwap(CI);
54742 }
54743
54744 // rorw $$8, ${0:w} --> llvm.bswap.i16
54745 if (CI->getType()->isIntegerTy(16) &&
54746 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54747 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
54748 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
54749 AsmPieces.clear();
54750 StringRef ConstraintsStr = IA->getConstraintString();
54751 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54752 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54753 if (clobbersFlagRegisters(AsmPieces))
54754 return IntrinsicLowering::LowerToByteSwap(CI);
54755 }
54756 break;
54757 case 3:
54758 if (CI->getType()->isIntegerTy(32) &&
54759 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54760 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
54761 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
54762 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
54763 AsmPieces.clear();
54764 StringRef ConstraintsStr = IA->getConstraintString();
54765 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54766 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54767 if (clobbersFlagRegisters(AsmPieces))
54768 return IntrinsicLowering::LowerToByteSwap(CI);
54769 }
54770
54771 if (CI->getType()->isIntegerTy(64)) {
54772 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
54773 if (Constraints.size() >= 2 &&
54774 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
54775 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
54776 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
54777 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
54778 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
54779 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
54780 return IntrinsicLowering::LowerToByteSwap(CI);
54781 }
54782 }
54783 break;
54784 }
54785 return false;
54786}
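// Illustrative sketch (editor-added; the function and variable names below are
// hypothetical): source-level inline asm of the shapes recognized above. Both
// forms are expected to be rewritten into @llvm.bswap.* intrinsic calls rather
// than kept as opaque inline assembly.
static inline unsigned bswap32_via_asm(unsigned x) {
  asm("bswap %0" : "=r"(x) : "0"(x));            // the "bswap $0" pattern
  return x;
}
static inline unsigned short bswap16_via_asm(unsigned short v) {
  asm("rorw $8, %w0" : "=r"(v) : "0"(v) : "cc"); // the "rorw $$8, ${0:w}" pattern
  return v;
}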
54787
54788static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
54789 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
54790 .Case("{@cca}", X86::COND_A)
54791 .Case("{@ccae}", X86::COND_AE)
54792 .Case("{@ccb}", X86::COND_B)
54793 .Case("{@ccbe}", X86::COND_BE)
54794 .Case("{@ccc}", X86::COND_B)
54795 .Case("{@cce}", X86::COND_E)
54796 .Case("{@ccz}", X86::COND_E)
54797 .Case("{@ccg}", X86::COND_G)
54798 .Case("{@ccge}", X86::COND_GE)
54799 .Case("{@ccl}", X86::COND_L)
54800 .Case("{@ccle}", X86::COND_LE)
54801 .Case("{@ccna}", X86::COND_BE)
54802 .Case("{@ccnae}", X86::COND_B)
54803 .Case("{@ccnb}", X86::COND_AE)
54804 .Case("{@ccnbe}", X86::COND_A)
54805 .Case("{@ccnc}", X86::COND_AE)
54806 .Case("{@ccne}", X86::COND_NE)
54807 .Case("{@ccnz}", X86::COND_NE)
54808 .Case("{@ccng}", X86::COND_LE)
54809 .Case("{@ccnge}", X86::COND_L)
54810 .Case("{@ccnl}", X86::COND_GE)
54811 .Case("{@ccnle}", X86::COND_G)
54812 .Case("{@ccno}", X86::COND_NO)
54813 .Case("{@ccnp}", X86::COND_NP)
54814 .Case("{@ccns}", X86::COND_NS)
54815 .Case("{@cco}", X86::COND_O)
54816 .Case("{@ccp}", X86::COND_P)
54817 .Case("{@ccs}", X86::COND_S)
54818 .Default(X86::COND_INVALID);
54819 return Cond;
54820}
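// Illustrative sketch (editor-added; names are hypothetical): a flag-output
// operand as written in C++ and the constraint code this routine receives.
// The "=@ccc" output below arrives here as the string "{@ccc}" and maps to
// X86::COND_B (carry set); "=@ccz" would map to X86::COND_E, and so on.
static inline bool add_with_carry_flag(unsigned long &dst, unsigned long src) {
  bool carry;
  asm("addq %2, %0" : "+r"(dst), "=@ccc"(carry) : "r"(src));
  return carry;
}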
54821
54822/// Given a constraint letter, return the type of constraint for this target.
54823X86TargetLowering::ConstraintType
54824X86TargetLowering::getConstraintType(StringRef Constraint) const {
54825 if (Constraint.size() == 1) {
54826 switch (Constraint[0]) {
54827 case 'R':
54828 case 'q':
54829 case 'Q':
54830 case 'f':
54831 case 't':
54832 case 'u':
54833 case 'y':
54834 case 'x':
54835 case 'v':
54836 case 'l':
54837 case 'k': // AVX512 masking registers.
54838 return C_RegisterClass;
54839 case 'a':
54840 case 'b':
54841 case 'c':
54842 case 'd':
54843 case 'S':
54844 case 'D':
54845 case 'A':
54846 return C_Register;
54847 case 'I':
54848 case 'J':
54849 case 'K':
54850 case 'N':
54851 case 'G':
54852 case 'L':
54853 case 'M':
54854 return C_Immediate;
54855 case 'C':
54856 case 'e':
54857 case 'Z':
54858 return C_Other;
54859 default:
54860 break;
54861 }
54862 }
54863 else if (Constraint.size() == 2) {
54864 switch (Constraint[0]) {
54865 default:
54866 break;
54867 case 'Y':
54868 switch (Constraint[1]) {
54869 default:
54870 break;
54871 case 'z':
54872 return C_Register;
54873 case 'i':
54874 case 'm':
54875 case 'k':
54876 case 't':
54877 case '2':
54878 return C_RegisterClass;
54879 }
54880 }
54881 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
54882 return C_Other;
54883 return TargetLowering::getConstraintType(Constraint);
54884}
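// Illustrative sketch (editor-added): a few of the classifications made above,
// e.g. "x" and "v" (any SSE/AVX register) are C_RegisterClass, "a" (exactly
// the [re]ax register) is C_Register, "I" (an immediate in 0..31) is
// C_Immediate, and a flag-output code such as "{@ccz}" is C_Other.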
54885
54886/// Examine constraint type and operand type and determine a weight value.
54887/// This object must already have been set up with the operand type
54888/// and the current alternative constraint selected.
54889TargetLowering::ConstraintWeight
54890 X86TargetLowering::getSingleConstraintMatchWeight(
54891 AsmOperandInfo &info, const char *constraint) const {
54892 ConstraintWeight weight = CW_Invalid;
54893 Value *CallOperandVal = info.CallOperandVal;
54894 // If we don't have a value, we can't do a match,
54895 // but allow it at the lowest weight.
54896 if (!CallOperandVal)
54897 return CW_Default;
54898 Type *type = CallOperandVal->getType();
54899 // Look at the constraint type.
54900 switch (*constraint) {
54901 default:
54902 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
54903 LLVM_FALLTHROUGH;
54904 case 'R':
54905 case 'q':
54906 case 'Q':
54907 case 'a':
54908 case 'b':
54909 case 'c':
54910 case 'd':
54911 case 'S':
54912 case 'D':
54913 case 'A':
54914 if (CallOperandVal->getType()->isIntegerTy())
54915 weight = CW_SpecificReg;
54916 break;
54917 case 'f':
54918 case 't':
54919 case 'u':
54920 if (type->isFloatingPointTy())
54921 weight = CW_SpecificReg;
54922 break;
54923 case 'y':
54924 if (type->isX86_MMXTy() && Subtarget.hasMMX())
54925 weight = CW_SpecificReg;
54926 break;
54927 case 'Y':
54928 if (StringRef(constraint).size() != 2)
54929 break;
54930 switch (constraint[1]) {
54931 default:
54932 return CW_Invalid;
54933 // XMM0
54934 case 'z':
54935 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54936 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
54937 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
54938 return CW_SpecificReg;
54939 return CW_Invalid;
54940 // Conditional OpMask regs (AVX512)
54941 case 'k':
54942 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54943 return CW_Register;
54944 return CW_Invalid;
54945 // Any MMX reg
54946 case 'm':
54947 if (type->isX86_MMXTy() && Subtarget.hasMMX())
54948 return weight;
54949 return CW_Invalid;
54950 // Any SSE reg when ISA >= SSE2, same as 'x'
54951 case 'i':
54952 case 't':
54953 case '2':
54954 if (!Subtarget.hasSSE2())
54955 return CW_Invalid;
54956 break;
54957 }
54958 break;
54959 case 'v':
54960 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
54961 weight = CW_Register;
54962 LLVM_FALLTHROUGH;
54963 case 'x':
54964 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54965 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
54966 weight = CW_Register;
54967 break;
54968 case 'k':
54969 // Enable conditional vector operations using %k<#> registers.
54970 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54971 weight = CW_Register;
54972 break;
54973 case 'I':
54974 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
54975 if (C->getZExtValue() <= 31)
54976 weight = CW_Constant;
54977 }
54978 break;
54979 case 'J':
54980 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54981 if (C->getZExtValue() <= 63)
54982 weight = CW_Constant;
54983 }
54984 break;
54985 case 'K':
54986 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54987 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
54988 weight = CW_Constant;
54989 }
54990 break;
54991 case 'L':
54992 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54993 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
54994 weight = CW_Constant;
54995 }
54996 break;
54997 case 'M':
54998 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54999 if (C->getZExtValue() <= 3)
55000 weight = CW_Constant;
55001 }
55002 break;
55003 case 'N':
55004 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55005 if (C->getZExtValue() <= 0xff)
55006 weight = CW_Constant;
55007 }
55008 break;
55009 case 'G':
55010 case 'C':
55011 if (isa<ConstantFP>(CallOperandVal)) {
55012 weight = CW_Constant;
55013 }
55014 break;
55015 case 'e':
55016 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55017 if ((C->getSExtValue() >= -0x80000000LL) &&
55018 (C->getSExtValue() <= 0x7fffffffLL))
55019 weight = CW_Constant;
55020 }
55021 break;
55022 case 'Z':
55023 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55024 if (C->getZExtValue() <= 0xffffffff)
55025 weight = CW_Constant;
55026 }
55027 break;
55028 }
55029 return weight;
55030}
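// Illustrative sketch (editor-added): for an operand under the 'K' constraint
// (signed 8-bit immediate), a constant call operand of 100 weighs in as
// CW_Constant, while 1000 lies outside [-128, 127] and the weight stays
// CW_Invalid, steering selection toward an alternative constraint if any.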
55031
55032/// Try to replace an X constraint, which matches anything, with another that
55033/// has more specific requirements based on the type of the corresponding
55034/// operand.
55035const char *X86TargetLowering::
55036LowerXConstraint(EVT ConstraintVT) const {
55037 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
55038 // 'f' like normal targets.
55039 if (ConstraintVT.isFloatingPoint()) {
55040 if (Subtarget.hasSSE1())
55041 return "x";
55042 }
55043
55044 return TargetLowering::LowerXConstraint(ConstraintVT);
55045}
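// Illustrative sketch (editor-added): a float operand under the catch-all "X"
// constraint is retyped to "x" on an SSE-capable subtarget, so it lands in an
// XMM register; without SSE the generic handling applies (typically the x87
// 'f' class, as the comment above notes).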
55046
55047// Lower @cc targets via setcc.
55048SDValue X86TargetLowering::LowerAsmOutputForConstraint(
55049 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
55050 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
55051 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
55052 if (Cond == X86::COND_INVALID)
55053 return SDValue();
55054 // Check that return type is valid.
55055 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
55056 OpInfo.ConstraintVT.getSizeInBits() < 8)
55057 report_fatal_error("Flag output operand is of invalid type");
55058
55059 // Get EFLAGS register. Only update chain when copyfrom is glued.
55060 if (Flag.getNode()) {
55061 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
55062 Chain = Flag.getValue(1);
55063 } else
55064 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
55065 // Extract CC code.
55066 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
55067 // Extend to 32-bits
55068 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
55069
55070 return Result;
55071}
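// Illustrative sketch (editor-added): for an i32 "=@ccz" output the routine
// above builds, roughly,
//   %flags  = CopyFromReg EFLAGS              ; i32
//   %setcc  = X86ISD::SETCC COND_E, %flags    ; i8
//   %result = zero_extend %setcc to i32
// i.e. the requested flag is materialized with a SETcc and widened to the
// declared operand type.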
55072
55073/// Lower the specified operand into the Ops vector.
55074/// If it is invalid, don't add anything to Ops.
55075void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
55076 std::string &Constraint,
55077 std::vector<SDValue>&Ops,
55078 SelectionDAG &DAG) const {
55079 SDValue Result;
55080
55081 // Only support length 1 constraints for now.
55082 if (Constraint.length() > 1) return;
55083
55084 char ConstraintLetter = Constraint[0];
55085 switch (ConstraintLetter) {
55086 default: break;
55087 case 'I':
55088 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55089 if (C->getZExtValue() <= 31) {
55090 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55091 Op.getValueType());
55092 break;
55093 }
55094 }
55095 return;
55096 case 'J':
55097 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55098 if (C->getZExtValue() <= 63) {
55099 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55100 Op.getValueType());
55101 break;
55102 }
55103 }
55104 return;
55105 case 'K':
55106 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55107 if (isInt<8>(C->getSExtValue())) {
55108 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55109 Op.getValueType());
55110 break;
55111 }
55112 }
55113 return;
55114 case 'L':
55115 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55116 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
55117 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
55118 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
55119 Op.getValueType());
55120 break;
55121 }
55122 }
55123 return;
55124 case 'M':
55125 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55126 if (C->getZExtValue() <= 3) {
55127 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55128 Op.getValueType());
55129 break;
55130 }
55131 }
55132 return;
55133 case 'N':
55134 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55135 if (C->getZExtValue() <= 255) {
55136 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55137 Op.getValueType());
55138 break;
55139 }
55140 }
55141 return;
55142 case 'O':
55143 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55144 if (C->getZExtValue() <= 127) {
55145 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55146 Op.getValueType());
55147 break;
55148 }
55149 }
55150 return;
55151 case 'e': {
55152 // 32-bit signed value
55153 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55154 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
55155 C->getSExtValue())) {
55156 // Widen to 64 bits here to get it sign extended.
55157 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
55158 break;
55159 }
55160 // FIXME gcc accepts some relocatable values here too, but only in certain
55161 // memory models; it's complicated.
55162 }
55163 return;
55164 }
55165 case 'Z': {
55166 // 32-bit unsigned value
55167 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55168 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
55169 C->getZExtValue())) {
55170 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55171 Op.getValueType());
55172 break;
55173 }
55174 }
55175 // FIXME gcc accepts some relocatable values here too, but only in certain
55176 // memory models; it's complicated.
55177 return;
55178 }
55179 case 'i': {
55180 // Literal immediates are always ok.
55181 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
55182 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
55183 BooleanContent BCont = getBooleanContents(MVT::i64);
55184 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
55185 : ISD::SIGN_EXTEND;
55186 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
55187 : CST->getSExtValue();
55188 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
55189 break;
55190 }
55191
55192 // In any sort of PIC mode addresses need to be computed at runtime by
55193 // adding in a register or some sort of table lookup. These can't
55194 // be used as immediates. BlockAddresses are fine though.
55195 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
55196 !isa<BlockAddressSDNode>(Op))
55197 return;
55198
55199 // If we are in non-pic codegen mode, we allow the address of a global (with
55200 // an optional displacement) to be used with 'i'.
55201 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
55202 // If we require an extra load to get this address, as in PIC mode, we
55203 // can't accept it.
55204 if (isGlobalStubReference(
55205 Subtarget.classifyGlobalReference(GA->getGlobal())))
55206 return;
55207 break;
55208 }
55209 }
55210
55211 if (Result.getNode()) {
55212 Ops.push_back(Result);
55213 return;
55214 }
55215 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
55216}
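// Illustrative sketch (editor-added; the port number and shift amount below
// are hypothetical): typical users of the immediate constraints validated
// above.
static inline void outb_example(unsigned char value) {
  asm volatile("outb %0, %1" : : "a"(value), "N"(0x60)); // 'N': 0..255 I/O port
}
static inline unsigned shl3_example(unsigned x) {
  asm("shll %1, %0" : "+r"(x) : "I"(3));                 // 'I': 0..31 shift count
  return x;
}
// An out-of-range constant (say "N"(0x1234)) is not pushed into Ops above, so
// the asm is expected to be rejected later as an invalid constraint operand.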
55217
55218/// Check if \p RC is a general purpose register class.
55219 /// I.e., GR* or one of their variants.
55220static bool isGRClass(const TargetRegisterClass &RC) {
55221 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
55222 RC.hasSuperClassEq(&X86::GR16RegClass) ||
55223 RC.hasSuperClassEq(&X86::GR32RegClass) ||
55224 RC.hasSuperClassEq(&X86::GR64RegClass) ||
55225 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
55226}
55227
55228/// Check if \p RC is a vector register class.
55229 /// I.e., FR* / VR* or one of their variants.
55230static bool isFRClass(const TargetRegisterClass &RC) {
55231 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
55232 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
55233 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
55234 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
55235 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
55236 RC.hasSuperClassEq(&X86::VR512RegClass);
55237}
55238
55239/// Check if \p RC is a mask register class.
55240 /// I.e., VK* or one of their variants.
55241static bool isVKClass(const TargetRegisterClass &RC) {
55242 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
55243 RC.hasSuperClassEq(&X86::VK2RegClass) ||
55244 RC.hasSuperClassEq(&X86::VK4RegClass) ||
55245 RC.hasSuperClassEq(&X86::VK8RegClass) ||
55246 RC.hasSuperClassEq(&X86::VK16RegClass) ||
55247 RC.hasSuperClassEq(&X86::VK32RegClass) ||
55248 RC.hasSuperClassEq(&X86::VK64RegClass);
55249}
55250
55251std::pair<unsigned, const TargetRegisterClass *>
55252X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
55253 StringRef Constraint,
55254 MVT VT) const {
55255 // First, see if this is a constraint that directly corresponds to an LLVM
55256 // register class.
55257 if (Constraint.size() == 1) {
55258 // GCC Constraint Letters
55259 switch (Constraint[0]) {
55260 default: break;
55261 // 'A' means [ER]AX + [ER]DX.
55262 case 'A':
55263 if (Subtarget.is64Bit())
55264 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
55265 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
55266        "Expecting 64, 32 or 16 bit subtarget");
55267 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
55268
55269 // TODO: Slight differences here in allocation order and leaving
55270 // RIP in the class. Do they matter any more here than they do
55271 // in the normal allocation?
55272 case 'k':
55273 if (Subtarget.hasAVX512()) {
55274 if (VT == MVT::i1)
55275 return std::make_pair(0U, &X86::VK1RegClass);
55276 if (VT == MVT::i8)
55277 return std::make_pair(0U, &X86::VK8RegClass);
55278 if (VT == MVT::i16)
55279 return std::make_pair(0U, &X86::VK16RegClass);
55280 }
55281 if (Subtarget.hasBWI()) {
55282 if (VT == MVT::i32)
55283 return std::make_pair(0U, &X86::VK32RegClass);
55284 if (VT == MVT::i64)
55285 return std::make_pair(0U, &X86::VK64RegClass);
55286 }
55287 break;
55288 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
55289 if (Subtarget.is64Bit()) {
55290 if (VT == MVT::i8 || VT == MVT::i1)
55291 return std::make_pair(0U, &X86::GR8RegClass);
55292 if (VT == MVT::i16)
55293 return std::make_pair(0U, &X86::GR16RegClass);
55294 if (VT == MVT::i32 || VT == MVT::f32)
55295 return std::make_pair(0U, &X86::GR32RegClass);
55296 if (VT != MVT::f80 && !VT.isVector())
55297 return std::make_pair(0U, &X86::GR64RegClass);
55298 break;
55299 }
55300 LLVM_FALLTHROUGH;
55301 // 32-bit fallthrough
55302 case 'Q': // Q_REGS
55303 if (VT == MVT::i8 || VT == MVT::i1)
55304 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
55305 if (VT == MVT::i16)
55306 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
55307 if (VT == MVT::i32 || VT == MVT::f32 ||
55308 (!VT.isVector() && !Subtarget.is64Bit()))
55309 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
55310 if (VT != MVT::f80 && !VT.isVector())
55311 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
55312 break;
55313 case 'r': // GENERAL_REGS
55314 case 'l': // INDEX_REGS
55315 if (VT == MVT::i8 || VT == MVT::i1)
55316 return std::make_pair(0U, &X86::GR8RegClass);
55317 if (VT == MVT::i16)
55318 return std::make_pair(0U, &X86::GR16RegClass);
55319 if (VT == MVT::i32 || VT == MVT::f32 ||
55320 (!VT.isVector() && !Subtarget.is64Bit()))
55321 return std::make_pair(0U, &X86::GR32RegClass);
55322 if (VT != MVT::f80 && !VT.isVector())
55323 return std::make_pair(0U, &X86::GR64RegClass);
55324 break;
55325 case 'R': // LEGACY_REGS
55326 if (VT == MVT::i8 || VT == MVT::i1)
55327 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
55328 if (VT == MVT::i16)
55329 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
55330 if (VT == MVT::i32 || VT == MVT::f32 ||
55331 (!VT.isVector() && !Subtarget.is64Bit()))
55332 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
55333 if (VT != MVT::f80 && !VT.isVector())
55334 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
55335 break;
55336 case 'f': // FP Stack registers.
55337 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
55338 // value to the correct fpstack register class.
55339 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
55340 return std::make_pair(0U, &X86::RFP32RegClass);
55341 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
55342 return std::make_pair(0U, &X86::RFP64RegClass);
55343 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
55344 return std::make_pair(0U, &X86::RFP80RegClass);
55345 break;
55346 case 'y': // MMX_REGS if MMX allowed.
55347 if (!Subtarget.hasMMX()) break;
55348 return std::make_pair(0U, &X86::VR64RegClass);
55349 case 'v':
55350 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
55351 if (!Subtarget.hasSSE1()) break;
55352 bool VConstraint = (Constraint[0] == 'v');
55353
55354 switch (VT.SimpleTy) {
55355 default: break;
55356 // Scalar SSE types.
55357 case MVT::f16:
55358 if (VConstraint && Subtarget.hasFP16())
55359 return std::make_pair(0U, &X86::FR16XRegClass);
55360 break;
55361 case MVT::f32:
55362 case MVT::i32:
55363 if (VConstraint && Subtarget.hasVLX())
55364 return std::make_pair(0U, &X86::FR32XRegClass);
55365 return std::make_pair(0U, &X86::FR32RegClass);
55366 case MVT::f64:
55367 case MVT::i64:
55368 if (VConstraint && Subtarget.hasVLX())
55369 return std::make_pair(0U, &X86::FR64XRegClass);
55370 return std::make_pair(0U, &X86::FR64RegClass);
55371 case MVT::i128:
55372 if (Subtarget.is64Bit()) {
55373 if (VConstraint && Subtarget.hasVLX())
55374 return std::make_pair(0U, &X86::VR128XRegClass);
55375 return std::make_pair(0U, &X86::VR128RegClass);
55376 }
55377 break;
55378 // Vector types and fp128.
55379 case MVT::v8f16:
55380 if (!Subtarget.hasFP16())
55381 break;
55382 LLVM_FALLTHROUGH;
55383 case MVT::f128:
55384 case MVT::v16i8:
55385 case MVT::v8i16:
55386 case MVT::v4i32:
55387 case MVT::v2i64:
55388 case MVT::v4f32:
55389 case MVT::v2f64:
55390 if (VConstraint && Subtarget.hasVLX())
55391 return std::make_pair(0U, &X86::VR128XRegClass);
55392 return std::make_pair(0U, &X86::VR128RegClass);
55393 // AVX types.
55394 case MVT::v16f16:
55395 if (!Subtarget.hasFP16())
55396 break;
55397 LLVM_FALLTHROUGH;
55398 case MVT::v32i8:
55399 case MVT::v16i16:
55400 case MVT::v8i32:
55401 case MVT::v4i64:
55402 case MVT::v8f32:
55403 case MVT::v4f64:
55404 if (VConstraint && Subtarget.hasVLX())
55405 return std::make_pair(0U, &X86::VR256XRegClass);
55406 if (Subtarget.hasAVX())
55407 return std::make_pair(0U, &X86::VR256RegClass);
55408 break;
55409 case MVT::v32f16:
55410 if (!Subtarget.hasFP16())
55411 break;
55412 LLVM_FALLTHROUGH;
55413 case MVT::v64i8:
55414 case MVT::v32i16:
55415 case MVT::v8f64:
55416 case MVT::v16f32:
55417 case MVT::v16i32:
55418 case MVT::v8i64:
55419 if (!Subtarget.hasAVX512()) break;
55420 if (VConstraint)
55421 return std::make_pair(0U, &X86::VR512RegClass);
55422 return std::make_pair(0U, &X86::VR512_0_15RegClass);
55423 }
55424 break;
55425 }
55426 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
55427 switch (Constraint[1]) {
55428 default:
55429 break;
55430 case 'i':
55431 case 't':
55432 case '2':
55433 return getRegForInlineAsmConstraint(TRI, "x", VT);
55434 case 'm':
55435 if (!Subtarget.hasMMX()) break;
55436 return std::make_pair(0U, &X86::VR64RegClass);
55437 case 'z':
55438 if (!Subtarget.hasSSE1()) break;
55439 switch (VT.SimpleTy) {
55440 default: break;
55441 // Scalar SSE types.
55442 case MVT::f16:
55443 if (!Subtarget.hasFP16())
55444 break;
55445 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
55446 case MVT::f32:
55447 case MVT::i32:
55448 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
55449 case MVT::f64:
55450 case MVT::i64:
55451 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
55452 case MVT::v8f16:
55453 if (!Subtarget.hasFP16())
55454 break;
55455 LLVM_FALLTHROUGH;
55456 case MVT::f128:
55457 case MVT::v16i8:
55458 case MVT::v8i16:
55459 case MVT::v4i32:
55460 case MVT::v2i64:
55461 case MVT::v4f32:
55462 case MVT::v2f64:
55463 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
55464 // AVX types.
55465 case MVT::v16f16:
55466 if (!Subtarget.hasFP16())
55467 break;
55468 LLVM_FALLTHROUGH;
55469 case MVT::v32i8:
55470 case MVT::v16i16:
55471 case MVT::v8i32:
55472 case MVT::v4i64:
55473 case MVT::v8f32:
55474 case MVT::v4f64:
55475 if (Subtarget.hasAVX())
55476 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
55477 break;
55478 case MVT::v32f16:
55479 if (!Subtarget.hasFP16())
55480 break;
55481 LLVM_FALLTHROUGH;
55482 case MVT::v64i8:
55483 case MVT::v32i16:
55484 case MVT::v8f64:
55485 case MVT::v16f32:
55486 case MVT::v16i32:
55487 case MVT::v8i64:
55488 if (Subtarget.hasAVX512())
55489 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
55490 break;
55491 }
55492 break;
55493 case 'k':
55494 // This register class doesn't allocate k0 for masked vector operations.
55495 if (Subtarget.hasAVX512()) {
55496 if (VT == MVT::i1)
55497 return std::make_pair(0U, &X86::VK1WMRegClass);
55498 if (VT == MVT::i8)
55499 return std::make_pair(0U, &X86::VK8WMRegClass);
55500 if (VT == MVT::i16)
55501 return std::make_pair(0U, &X86::VK16WMRegClass);
55502 }
55503 if (Subtarget.hasBWI()) {
55504 if (VT == MVT::i32)
55505 return std::make_pair(0U, &X86::VK32WMRegClass);
55506 if (VT == MVT::i64)
55507 return std::make_pair(0U, &X86::VK64WMRegClass);
55508 }
55509 break;
55510 }
55511 }
55512
55513 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
55514 return std::make_pair(0U, &X86::GR32RegClass);
55515
55516 // Use the default implementation in TargetLowering to convert the register
55517 // constraint into a member of a register class.
55518 std::pair<Register, const TargetRegisterClass*> Res;
55519 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
55520
55521 // Not found as a standard register?
55522 if (!Res.second) {
55523 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
55524 // to/from f80.
55525 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
55526 // Map "st(0)" .. "st(7)" to FP0 .. FP7.
55527 if (Constraint.size() == 7 && Constraint[0] == '{' &&
55528 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
55529 Constraint[3] == '(' &&
55530 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
55531 Constraint[5] == ')' && Constraint[6] == '}') {
55532 // st(7) is not allocatable and thus not a member of RFP80. Return
55533 // singleton class in cases where we have a reference to it.
55534 if (Constraint[4] == '7')
55535 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
55536 return std::make_pair(X86::FP0 + Constraint[4] - '0',
55537 &X86::RFP80RegClass);
55538 }
55539
55540 // GCC allows "st(0)" to be called just plain "st".
55541 if (StringRef("{st}").equals_insensitive(Constraint))
55542 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
55543 }
55544
55545 // flags -> EFLAGS
55546 if (StringRef("{flags}").equals_insensitive(Constraint))
55547 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
55548
55549 // dirflag -> DF
55550 // Only allow for clobber.
55551 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
55552 VT == MVT::Other)
55553 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
55554
55555 // fpsr -> FPSW
55556 if (StringRef("{fpsr}").equals_insensitive(Constraint))
55557 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
55558
55559 return Res;
55560 }
55561
55562 // Make sure it isn't a register that requires 64-bit mode.
55563 if (!Subtarget.is64Bit() &&
55564 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
55565 TRI->getEncodingValue(Res.first) >= 8) {
55566 // Register requires REX prefix, but we're in 32-bit mode.
55567 return std::make_pair(0, nullptr);
55568 }
55569
55570 // Make sure it isn't a register that requires AVX512.
55571 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
55572 TRI->getEncodingValue(Res.first) & 0x10) {
55573 // Register requires EVEX prefix.
55574 return std::make_pair(0, nullptr);
55575 }
55576
55577 // Otherwise, check to see if this is a register class of the wrong value
55578 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
55579 // turn into {ax},{dx}.
55580 // MVT::Other is used to specify clobber names.
55581 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
55582 return Res; // Correct type already, nothing to do.
55583
55584 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
55585 // return "eax". This should even work for things like getting 64-bit integer
55586 // registers when given an f64 type.
55587 const TargetRegisterClass *Class = Res.second;
55588 // The generic code will match the first register class that contains the
55589 // given register. Thus, based on the ordering of the tablegened file,
55590 // the "plain" GR classes might not come first.
55591 // Therefore, use a helper method.
55592 if (isGRClass(*Class)) {
55593 unsigned Size = VT.getSizeInBits();
55594 if (Size == 1) Size = 8;
55595 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
55596 if (DestReg > 0) {
55597 bool is64Bit = Subtarget.is64Bit();
55598 const TargetRegisterClass *RC =
55599 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
55600 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
55601 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
55602 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
55603 : nullptr;
55604 if (Size == 64 && !is64Bit) {
55605 // Model GCC's behavior here and select a fixed pair of 32-bit
55606 // registers.
55607 switch (DestReg) {
55608 case X86::RAX:
55609 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
55610 case X86::RDX:
55611 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
55612 case X86::RCX:
55613 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
55614 case X86::RBX:
55615 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
55616 case X86::RSI:
55617 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
55618 case X86::RDI:
55619 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
55620 case X86::RBP:
55621 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
55622 default:
55623 return std::make_pair(0, nullptr);
55624 }
55625 }
55626 if (RC && RC->contains(DestReg))
55627 return std::make_pair(DestReg, RC);
55628 return Res;
55629 }
55630 // No register found/type mismatch.
55631 return std::make_pair(0, nullptr);
55632 } else if (isFRClass(*Class)) {
55633 // Handle references to XMM physical registers that got mapped into the
55634 // wrong class. This can happen with constraints like {xmm0} where the
55635 // target independent register mapper will just pick the first match it can
55636 // find, ignoring the required type.
55637
55638 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
55639 if (VT == MVT::f16)
55640 Res.second = &X86::FR16XRegClass;
55641 else if (VT == MVT::f32 || VT == MVT::i32)
55642 Res.second = &X86::FR32XRegClass;
55643 else if (VT == MVT::f64 || VT == MVT::i64)
55644 Res.second = &X86::FR64XRegClass;
55645 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
55646 Res.second = &X86::VR128XRegClass;
55647 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
55648 Res.second = &X86::VR256XRegClass;
55649 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
55650 Res.second = &X86::VR512RegClass;
55651 else {
55652 // Type mismatch and not a clobber: return an error.
55653 Res.first = 0;
55654 Res.second = nullptr;
55655 }
55656 } else if (isVKClass(*Class)) {
55657 if (VT == MVT::i1)
55658 Res.second = &X86::VK1RegClass;
55659 else if (VT == MVT::i8)
55660 Res.second = &X86::VK8RegClass;
55661 else if (VT == MVT::i16)
55662 Res.second = &X86::VK16RegClass;
55663 else if (VT == MVT::i32)
55664 Res.second = &X86::VK32RegClass;
55665 else if (VT == MVT::i64)
55666 Res.second = &X86::VK64RegClass;
55667 else {
55668 // Type mismatch and not a clobber: Return an error;
55669 Res.first = 0;
55670 Res.second = nullptr;
55671 }
55672 }
55673
55674 return Res;
55675}
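// Illustrative sketch (editor-added): a few concrete mappings implied by the
// logic above, assuming a 64-bit subtarget with AVX-512 available:
//   ("r",       MVT::i32)   -> (0,         GR32RegClass)   any 32-bit GPR
//   ("{ax}",    MVT::i32)   -> (X86::EAX,  GR32RegClass)   explicit reg resized
//   ("{st(3)}", MVT::f80)   -> (X86::FP3,  RFP80RegClass)  x87 stack slot
//   ("Yz",      MVT::v4f32) -> (X86::XMM0, VR128RegClass)  first SSE register
//   ("k",       MVT::i16)   -> (0,         VK16RegClass)   AVX-512 mask register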
55676
55677InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
55678 const AddrMode &AM,
55679 Type *Ty,
55680 unsigned AS) const {
55681 // Scaling factors are not free at all.
55682 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
55683 // will take 2 allocations in the out of order engine instead of 1
55684 // for plain addressing mode, i.e. inst (reg1).
55685 // E.g.,
55686 // vaddps (%rsi,%rdx), %ymm0, %ymm1
55687 // Requires two allocations (one for the load, one for the computation)
55688 // whereas:
55689 // vaddps (%rsi), %ymm0, %ymm1
55690 // Requires just 1 allocation, i.e., freeing allocations for other operations
55691 // and having less micro operations to execute.
55692 //
55693 // For some X86 architectures, this is even worse because for instance for
55694 // stores, the complex addressing mode forces the instruction to use the
55695 // "load" ports instead of the dedicated "store" port.
55696 // E.g., on Haswell:
55697 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
55698 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
55699 if (isLegalAddressingMode(DL, AM, Ty, AS))
55700 // Scale represents reg2 * scale, thus account for 1
55701 // as soon as we use a second register.
55702 return AM.Scale != 0;
55703 return -1;
55704}
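// Illustrative sketch (editor-added): with a legal addressing mode the cost is
// simply whether a scaled index register is used, e.g.
//   [reg]           (AM.Scale == 0)  -> cost 0
//   [reg + 4*reg2]  (AM.Scale == 4)  -> cost 1
// and an addressing mode that is not legal at all reports -1.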
55705
55706bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
55707 // Integer division on x86 is expensive. However, when aggressively optimizing
55708 // for code size, we prefer to use a div instruction, as it is usually smaller
55709 // than the alternative sequence.
55710 // The exception to this is vector division. Since x86 doesn't have vector
55711 // integer division, leaving the division as-is is a loss even in terms of
55712 // size, because it will have to be scalarized, while the alternative code
55713 // sequence can be performed in vector form.
55714 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
55715 return OptSize && !VT.isVector();
55716}
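// Illustrative sketch (editor-added): under minsize a scalar division such as
// "x / 10" keeps the div instruction instead of being expanded into the longer
// multiply-by-magic-constant sequence, whereas a vector division still returns
// false here because it would otherwise be scalarized into many divs.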
55717
55718void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
55719 if (!Subtarget.is64Bit())
55720 return;
55721
55722 // Update IsSplitCSR in X86MachineFunctionInfo.
55723 X86MachineFunctionInfo *AFI =
55724 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
55725 AFI->setIsSplitCSR(true);
55726}
55727
55728void X86TargetLowering::insertCopiesSplitCSR(
55729 MachineBasicBlock *Entry,
55730 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
55731 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
55732 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
55733 if (!IStart)
55734 return;
55735
55736 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
55737 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
55738 MachineBasicBlock::iterator MBBI = Entry->begin();
55739 for (const MCPhysReg *I = IStart; *I; ++I) {
55740 const TargetRegisterClass *RC = nullptr;
55741 if (X86::GR64RegClass.contains(*I))
55742 RC = &X86::GR64RegClass;
55743 else
55744 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
55745
55746 Register NewVR = MRI->createVirtualRegister(RC);
55747 // Create copy from CSR to a virtual register.
55748 // FIXME: this currently does not emit CFI pseudo-instructions, it works
55749 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
55750 // nounwind. If we want to generalize this later, we may need to emit
55751 // CFI pseudo-instructions.
55752 assert(
55753     Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
55754     "Function should be nounwind in insertCopiesSplitCSR!");
55755 Entry->addLiveIn(*I);
55756 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
55757 .addReg(*I);
55758
55759 // Insert the copy-back instructions right before the terminator.
55760 for (auto *Exit : Exits)
55761 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
55762 TII->get(TargetOpcode::COPY), *I)
55763 .addReg(NewVR);
55764 }
55765}
55766
55767bool X86TargetLowering::supportSwiftError() const {
55768 return Subtarget.is64Bit();
55769}
55770
55771/// Returns true if stack probing through a function call is requested.
55772bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
55773 return !getStackProbeSymbolName(MF).empty();
55774}
55775
55776/// Returns true if stack probing through inline assembly is requested.
55777bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
55778
55779 // No inline stack probes for Windows; they have their own mechanism.
55780 if (Subtarget.isOSWindows() ||
55781 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55782 return false;
55783
55784 // If the function specifically requests inline stack probes, emit them.
55785 if (MF.getFunction().hasFnAttribute("probe-stack"))
55786 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
55787 "inline-asm";
55788
55789 return false;
55790}
55791
55792/// Returns the name of the symbol used to emit stack probes or the empty
55793/// string if not applicable.
55794StringRef
55795X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
55796 // Inline stack probes disable the stack probe call.
55797 if (hasInlineStackProbe(MF))
55798 return "";
55799
55800 // If the function specifically requests stack probes, emit them.
55801 if (MF.getFunction().hasFnAttribute("probe-stack"))
55802 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
55803
55804 // Generally, if we aren't on Windows, the platform ABI does not include
55805 // support for stack probes, so don't emit them.
55806 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
55807 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55808 return "";
55809
55810 // We need a stack probe to conform to the Windows ABI. Choose the right
55811 // symbol.
55812 if (Subtarget.is64Bit())
55813 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
55814 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
55815}
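// Illustrative sketch (editor-added; "my_probe" is a hypothetical name): how
// the selection above plays out.
//   "probe-stack"="inline-asm" (non-Windows)  -> ""            (inline probes win)
//   "probe-stack"="my_probe"                  -> "my_probe"
//   64-bit MinGW/Cygwin target                -> "___chkstk_ms"
//   64-bit MSVC-style Windows target          -> "__chkstk"
//   32-bit MinGW/Cygwin target                -> "_alloca"
//   32-bit MSVC-style Windows target          -> "_chkstk"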
55816
55817unsigned
55818X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
55819 // The default stack probe size is 4096 if the function has no
55820 // "stack-probe-size" attribute.
55821 unsigned StackProbeSize = 4096;
55822 const Function &Fn = MF.getFunction();
55823 if (Fn.hasFnAttribute("stack-probe-size"))
55824 Fn.getFnAttribute("stack-probe-size")
55825 .getValueAsString()
55826 .getAsInteger(0, StackProbeSize);
55827 return StackProbeSize;
55828}
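// Illustrative sketch (editor-added): a function carrying the IR attribute
//   "stack-probe-size"="8192"
// is probed every 8192 bytes by the code above; without the attribute the
// default of 4096 bytes applies.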
55829
55830Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
55831 if (ML->isInnermost() &&
55832 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
55833 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
55834 return TargetLowering::getPrefLoopAlignment();
55835}