Bug Summary

File: build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 10227, column 35
Division by zero
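
For context, the sketch below is illustrative only: it is not the code at X86ISelLowering.cpp:10227 (which is outside this excerpt), but it shows the shape of pattern the analyzer's core division-by-zero checker flags, plus the usual guarded fix. The function and parameter names (scalePerLane, NumLanes) are hypothetical.

// Illustrative sketch only -- not the flagged LLVM code.
#include <cstdio>

static int scalePerLane(int TotalBits, int NumLanes) {
  // If any feasible path reaches this point with NumLanes == 0, the analyzer
  // reports "Division by zero" at the '/' below.
  return TotalBits / NumLanes;
}

static int scalePerLaneChecked(int TotalBits, int NumLanes) {
  // A typical fix: guard the divisor (or assert the invariant) before dividing.
  if (NumLanes == 0)
    return 0;
  return TotalBits / NumLanes;
}

int main() {
  std::printf("%d\n", scalePerLane(128, 4));        // prints 32
  std::printf("%d\n", scalePerLaneChecked(128, 0)); // guarded path, prints 0
  return 0;
}

In the full report, the annotated source marks each step of the path along which the analyzer concludes the divisor can be zero.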

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/X86 -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86 -I include -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-04-20-140412-16051-1 -x c++ /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
108 const X86Subtarget &STI)
109 : TargetLowering(TM), Subtarget(STI) {
110 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
111 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
112
113 // Set up the TargetLowering object.
114
115 // X86 is weird. It always uses i8 for shift amounts and setcc results.
116 setBooleanContents(ZeroOrOneBooleanContent);
117 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
118 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
119
120 // For 64-bit, since we have so many registers, use the ILP scheduler.
121 // For 32-bit, use the register pressure specific scheduling.
122 // For Atom, always use ILP scheduling.
123 if (Subtarget.isAtom())
124 setSchedulingPreference(Sched::ILP);
125 else if (Subtarget.is64Bit())
126 setSchedulingPreference(Sched::ILP);
127 else
128 setSchedulingPreference(Sched::RegPressure);
129 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
130 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
131
132 // Bypass expensive divides and use cheaper ones.
133 if (TM.getOptLevel() >= CodeGenOpt::Default) {
134 if (Subtarget.hasSlowDivide32())
135 addBypassSlowDiv(32, 8);
136 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
137 addBypassSlowDiv(64, 32);
138 }
139
140 // Setup Windows compiler runtime calls.
141 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
142 static const struct {
143 const RTLIB::Libcall Op;
144 const char * const Name;
145 const CallingConv::ID CC;
146 } LibraryCalls[] = {
147 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
148 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
149 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
150 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
151 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
152 };
153
154 for (const auto &LC : LibraryCalls) {
155 setLibcallName(LC.Op, LC.Name);
156 setLibcallCallingConv(LC.Op, LC.CC);
157 }
158 }
159
160 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
161 // MSVCRT doesn't have powi; fall back to pow
162 setLibcallName(RTLIB::POWI_F32, nullptr);
163 setLibcallName(RTLIB::POWI_F64, nullptr);
164 }
165
166 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
167 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
168 // FIXME: Should we be limiting the atomic size on other configs? Default is
169 // 1024.
170 if (!Subtarget.canUseCMPXCHG8B())
171 setMaxAtomicSizeInBitsSupported(32);
172
173 // Set up the register classes.
174 addRegisterClass(MVT::i8, &X86::GR8RegClass);
175 addRegisterClass(MVT::i16, &X86::GR16RegClass);
176 addRegisterClass(MVT::i32, &X86::GR32RegClass);
177 if (Subtarget.is64Bit())
178 addRegisterClass(MVT::i64, &X86::GR64RegClass);
179
180 for (MVT VT : MVT::integer_valuetypes())
181 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
182
183 // We don't accept any truncstore of integer registers.
184 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
185 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
186 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
187 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
188 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
189 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
190
191 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
192
193 // SETOEQ and SETUNE require checking two conditions.
194 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
195 setCondCodeAction(ISD::SETOEQ, VT, Expand);
196 setCondCodeAction(ISD::SETUNE, VT, Expand);
197 }
198
199 // Integer absolute.
200 if (Subtarget.canUseCMOV()) {
201 setOperationAction(ISD::ABS , MVT::i16 , Custom);
202 setOperationAction(ISD::ABS , MVT::i32 , Custom);
203 if (Subtarget.is64Bit())
204 setOperationAction(ISD::ABS , MVT::i64 , Custom);
205 }
206
207 // Signed saturation subtraction.
208 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
209 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
210 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
211 if (Subtarget.is64Bit())
212 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
213
214 // Funnel shifts.
215 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
216 // For slow shld targets we only lower for code size.
217 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
218
219 setOperationAction(ShiftOp , MVT::i8 , Custom);
220 setOperationAction(ShiftOp , MVT::i16 , Custom);
221 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
222 if (Subtarget.is64Bit())
223 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
224 }
225
226 if (!Subtarget.useSoftFloat()) {
227 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
228 // operation.
229 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
230 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
231 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
232 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
233 // We have an algorithm for SSE2, and we turn this into a 64-bit
234 // FILD or VCVTUSI2SS/SD for other targets.
235 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
236 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
237 // We have an algorithm for SSE2->double, and we turn this into a
238 // 64-bit FILD followed by conditional FADD for other targets.
239 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
240 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
241
242 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
243 // this operation.
244 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
245 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
246 // SSE has no i16 to fp conversion, only i32. We promote in the handler
247 // to allow f80 to use i16 and f64 to use i16 with sse1 only
248 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
249 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
250 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
251 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
252 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
253 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
254 // are Legal, f80 is custom lowered.
255 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
256 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
257
258 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
259 // this operation.
260 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
261 // FIXME: This doesn't generate invalid exception when it should. PR44019.
262 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
263 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
264 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
265 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
266 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
267 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
268 // are Legal, f80 is custom lowered.
269 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
270 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
271
272 // Handle FP_TO_UINT by promoting the destination to a larger signed
273 // conversion.
274 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
275 // FIXME: This doesn't generate invalid exception when it should. PR44019.
276 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
277 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
278 // FIXME: This doesn't generate invalid exception when it should. PR44019.
279 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
280 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
281 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
282 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
283 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
284
285 setOperationAction(ISD::LRINT, MVT::f32, Custom);
286 setOperationAction(ISD::LRINT, MVT::f64, Custom);
287 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
288 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
289
290 if (!Subtarget.is64Bit()) {
291 setOperationAction(ISD::LRINT, MVT::i64, Custom);
292 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
293 }
294 }
295
296 if (Subtarget.hasSSE2()) {
297 // Custom lowering for saturating float to int conversions.
298 // We handle promotion to larger result types manually.
299 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
300 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
301 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
302 }
303 if (Subtarget.is64Bit()) {
304 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
305 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
306 }
307 }
308
309 // Handle address space casts between mixed sized pointers.
310 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
311 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
312
313 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
314 if (!Subtarget.hasSSE2()) {
315 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
316 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
317 if (Subtarget.is64Bit()) {
318 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
319 // Without SSE, i64->f64 goes through memory.
320 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
321 }
322 } else if (!Subtarget.is64Bit())
323 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
324
325 // Scalar integer divide and remainder are lowered to use operations that
326 // produce two results, to match the available instructions. This exposes
327 // the two-result form to trivial CSE, which is able to combine x/y and x%y
328 // into a single instruction.
329 //
330 // Scalar integer multiply-high is also lowered to use two-result
331 // operations, to match the available instructions. However, plain multiply
332 // (low) operations are left as Legal, as there are single-result
333 // instructions for this in x86. Using the two-result multiply instructions
334 // when both high and low results are needed must be arranged by dagcombine.
335 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
336 setOperationAction(ISD::MULHS, VT, Expand);
337 setOperationAction(ISD::MULHU, VT, Expand);
338 setOperationAction(ISD::SDIV, VT, Expand);
339 setOperationAction(ISD::UDIV, VT, Expand);
340 setOperationAction(ISD::SREM, VT, Expand);
341 setOperationAction(ISD::UREM, VT, Expand);
342 }
343
344 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
345 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
346 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
347 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
348 setOperationAction(ISD::BR_CC, VT, Expand);
349 setOperationAction(ISD::SELECT_CC, VT, Expand);
350 }
351 if (Subtarget.is64Bit())
352 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
353 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
354 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
356
357 setOperationAction(ISD::FREM , MVT::f32 , Expand);
358 setOperationAction(ISD::FREM , MVT::f64 , Expand);
359 setOperationAction(ISD::FREM , MVT::f80 , Expand);
360 setOperationAction(ISD::FREM , MVT::f128 , Expand);
361
362 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
363 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
364 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
365 }
366
367 // Promote the i8 variants and force them on up to i32 which has a shorter
368 // encoding.
369 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
370 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
371
372 if (Subtarget.hasBMI()) {
373 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
374 // is enabled.
375 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
376 } else {
377 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
378 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
379 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
380 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
381 if (Subtarget.is64Bit()) {
382 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
383 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
384 }
385 }
386
387 if (Subtarget.hasLZCNT()) {
388 // When promoting the i8 variants, force them to i32 for a shorter
389 // encoding.
390 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
391 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
392 } else {
393 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
394 if (VT == MVT::i64 && !Subtarget.is64Bit())
395 continue;
396 setOperationAction(ISD::CTLZ , VT, Custom);
397 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
398 }
399 }
400
401 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
402 ISD::STRICT_FP_TO_FP16}) {
403 // Special handling for half-precision floating point conversions.
404 // If we don't have F16C support, then lower half float conversions
405 // into library calls.
406 setOperationAction(
407 Op, MVT::f32,
408 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
409 // There's never any support for operations beyond MVT::f32.
410 setOperationAction(Op, MVT::f64, Expand);
411 setOperationAction(Op, MVT::f80, Expand);
412 setOperationAction(Op, MVT::f128, Expand);
413 }
414
415 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
416 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
417 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
418 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
419 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
420 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
421 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
423
424 setOperationAction(ISD::PARITY, MVT::i8, Custom);
425 setOperationAction(ISD::PARITY, MVT::i16, Custom);
426 setOperationAction(ISD::PARITY, MVT::i32, Custom);
427 if (Subtarget.is64Bit())
428 setOperationAction(ISD::PARITY, MVT::i64, Custom);
429 if (Subtarget.hasPOPCNT()) {
430 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
431 // popcntw is longer to encode than popcntl and also has a false dependency
432 // on the dest that popcntl hasn't had since Cannon Lake.
433 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
434 } else {
435 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
436 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
437 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
438 if (Subtarget.is64Bit())
439 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
440 else
441 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
442 }
443
444 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
445
446 if (!Subtarget.hasMOVBE())
447 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
448
449 // X86 wants to expand cmov itself.
450 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451 setOperationAction(ISD::SELECT, VT, Custom);
452 setOperationAction(ISD::SETCC, VT, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455 }
456 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SELECT, VT, Custom);
460 setOperationAction(ISD::SETCC, VT, Custom);
461 }
462
463 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
464 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466
467 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
468 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475
476 // Darwin ABI issue.
477 for (auto VT : { MVT::i32, MVT::i64 }) {
478 if (VT == MVT::i64 && !Subtarget.is64Bit())
479 continue;
480 setOperationAction(ISD::ConstantPool , VT, Custom);
481 setOperationAction(ISD::JumpTable , VT, Custom);
482 setOperationAction(ISD::GlobalAddress , VT, Custom);
483 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484 setOperationAction(ISD::ExternalSymbol , VT, Custom);
485 setOperationAction(ISD::BlockAddress , VT, Custom);
486 }
487
488 // 64-bit shl, sra, srl (iff 32-bit x86)
489 for (auto VT : { MVT::i32, MVT::i64 }) {
490 if (VT == MVT::i64 && !Subtarget.is64Bit())
491 continue;
492 setOperationAction(ISD::SHL_PARTS, VT, Custom);
493 setOperationAction(ISD::SRA_PARTS, VT, Custom);
494 setOperationAction(ISD::SRL_PARTS, VT, Custom);
495 }
496
497 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
498 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
499
500 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
501
502 // Expand certain atomics
503 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511 }
512
513 if (!Subtarget.is64Bit())
514 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515
516 if (Subtarget.canUseCMPXCHG16B())
517 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518
519 // FIXME - use subtarget debug flags
520 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
521 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
522 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
523 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
524 }
525
526 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
527 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
528
529 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
530 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
531
532 setOperationAction(ISD::TRAP, MVT::Other, Legal);
533 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
534 if (Subtarget.getTargetTriple().isPS4())
535 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
536 else
537 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
538
539 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
540 setOperationAction(ISD::VASTART , MVT::Other, Custom);
541 setOperationAction(ISD::VAEND , MVT::Other, Expand);
542 bool Is64Bit = Subtarget.is64Bit();
543 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
544 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
545
546 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
547 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
548
549 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
550
551 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
552 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
553 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
554
555 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
556 // f32 and f64 use SSE.
557 // Set up the FP register classes.
558 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
559 : &X86::FR32RegClass);
560 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
561 : &X86::FR64RegClass);
562
563 // Disable f32->f64 extload as we can only generate this in one instruction
564 // under optsize. So it's easier to pattern match (fpext (load)) for that
565 // case instead of needing to emit 2 instructions for extload in the
566 // non-optsize case.
567 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
568
569 for (auto VT : { MVT::f32, MVT::f64 }) {
570 // Use ANDPD to simulate FABS.
571 setOperationAction(ISD::FABS, VT, Custom);
572
573 // Use XORP to simulate FNEG.
574 setOperationAction(ISD::FNEG, VT, Custom);
575
576 // Use ANDPD and ORPD to simulate FCOPYSIGN.
577 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
578
579 // These might be better off as horizontal vector ops.
580 setOperationAction(ISD::FADD, VT, Custom);
581 setOperationAction(ISD::FSUB, VT, Custom);
582
583 // We don't support sin/cos/fmod
584 setOperationAction(ISD::FSIN , VT, Expand);
585 setOperationAction(ISD::FCOS , VT, Expand);
586 setOperationAction(ISD::FSINCOS, VT, Expand);
587 }
588
589 // Lower this to MOVMSK plus an AND.
590 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
591 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
592
593 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
594 (UseX87 || Is64Bit)) {
595 // Use SSE for f32, x87 for f64.
596 // Set up the FP register classes.
597 addRegisterClass(MVT::f32, &X86::FR32RegClass);
598 if (UseX87)
599 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
600
601 // Use ANDPS to simulate FABS.
602 setOperationAction(ISD::FABS , MVT::f32, Custom);
603
604 // Use XORP to simulate FNEG.
605 setOperationAction(ISD::FNEG , MVT::f32, Custom);
606
607 if (UseX87)
608 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
609
610 // Use ANDPS and ORPS to simulate FCOPYSIGN.
611 if (UseX87)
612 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
613 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
614
615 // We don't support sin/cos/fmod
616 setOperationAction(ISD::FSIN , MVT::f32, Expand);
617 setOperationAction(ISD::FCOS , MVT::f32, Expand);
618 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
619
620 if (UseX87) {
621 // Always expand sin/cos functions even though x87 has an instruction.
622 setOperationAction(ISD::FSIN, MVT::f64, Expand);
623 setOperationAction(ISD::FCOS, MVT::f64, Expand);
624 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
625 }
626 } else if (UseX87) {
627 // f32 and f64 in x87.
628 // Set up the FP register classes.
629 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
630 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
631
632 for (auto VT : { MVT::f32, MVT::f64 }) {
633 setOperationAction(ISD::UNDEF, VT, Expand);
634 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
635
636 // Always expand sin/cos functions even though x87 has an instruction.
637 setOperationAction(ISD::FSIN , VT, Expand);
638 setOperationAction(ISD::FCOS , VT, Expand);
639 setOperationAction(ISD::FSINCOS, VT, Expand);
640 }
641 }
642
643 // Expand FP32 immediates into loads from the stack, save special cases.
644 if (isTypeLegal(MVT::f32)) {
645 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
646 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
647 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
648 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
649 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
650 } else // SSE immediates.
651 addLegalFPImmediate(APFloat(+0.0f)); // xorps
652 }
653 // Expand FP64 immediates into loads from the stack, save special cases.
654 if (isTypeLegal(MVT::f64)) {
655 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
656 addLegalFPImmediate(APFloat(+0.0)); // FLD0
657 addLegalFPImmediate(APFloat(+1.0)); // FLD1
658 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
659 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
660 } else // SSE immediates.
661 addLegalFPImmediate(APFloat(+0.0)); // xorpd
662 }
663 // Handle constrained floating-point operations of scalar.
664 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
665 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
666 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
667 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
668 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
669 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
670 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
671 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
672 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
673 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
674 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
675 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
676 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
677
678 // We don't support FMA.
679 setOperationAction(ISD::FMA, MVT::f64, Expand);
680 setOperationAction(ISD::FMA, MVT::f32, Expand);
681
682 // f80 always uses X87.
683 if (UseX87) {
684 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
685 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
686 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
687 {
688 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
689 addLegalFPImmediate(TmpFlt); // FLD0
690 TmpFlt.changeSign();
691 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
692
693 bool ignored;
694 APFloat TmpFlt2(+1.0);
695 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
696 &ignored);
697 addLegalFPImmediate(TmpFlt2); // FLD1
698 TmpFlt2.changeSign();
699 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
700 }
701
702 // Always expand sin/cos functions even though x87 has an instruction.
703 setOperationAction(ISD::FSIN , MVT::f80, Expand);
704 setOperationAction(ISD::FCOS , MVT::f80, Expand);
705 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
706
707 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
708 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
709 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
710 setOperationAction(ISD::FRINT, MVT::f80, Expand);
711 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
712 setOperationAction(ISD::FMA, MVT::f80, Expand);
713 setOperationAction(ISD::LROUND, MVT::f80, Expand);
714 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
715 setOperationAction(ISD::LRINT, MVT::f80, Custom);
716 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
717
718 // Handle constrained floating-point operations of scalar.
719 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
723 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
724 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
725 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
726 // as Custom.
727 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
728 }
729
730 // f128 uses xmm registers, but most operations require libcalls.
731 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
732 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
733 : &X86::VR128RegClass);
734
735 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
736
737 setOperationAction(ISD::FADD, MVT::f128, LibCall);
738 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
739 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
740 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
741 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
742 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
743 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
744 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
745 setOperationAction(ISD::FMA, MVT::f128, LibCall);
746 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
747
748 setOperationAction(ISD::FABS, MVT::f128, Custom);
749 setOperationAction(ISD::FNEG, MVT::f128, Custom);
750 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
751
752 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
753 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
754 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
755 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
756 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
757 // No STRICT_FSINCOS
758 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
759 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
760
761 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
762 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
763 // We need to custom handle any FP_ROUND with an f128 input, but
764 // LegalizeDAG uses the result type to know when to run a custom handler.
765 // So we have to list all legal floating point result types here.
766 if (isTypeLegal(MVT::f32)) {
767 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
768 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
769 }
770 if (isTypeLegal(MVT::f64)) {
771 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
772 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
773 }
774 if (isTypeLegal(MVT::f80)) {
775 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
776 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
777 }
778
779 setOperationAction(ISD::SETCC, MVT::f128, Custom);
780
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
782 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
783 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
785 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
786 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
787 }
788
789 // Always use a library call for pow.
790 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
792 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
793 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
794
795 setOperationAction(ISD::FLOG, MVT::f80, Expand);
796 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
797 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
798 setOperationAction(ISD::FEXP, MVT::f80, Expand);
799 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
800 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
801 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
802
803 // Some FP actions are always expanded for vector types.
804 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
805 MVT::v4f32, MVT::v8f32, MVT::v16f32,
806 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
807 setOperationAction(ISD::FSIN, VT, Expand);
808 setOperationAction(ISD::FSINCOS, VT, Expand);
809 setOperationAction(ISD::FCOS, VT, Expand);
810 setOperationAction(ISD::FREM, VT, Expand);
811 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
812 setOperationAction(ISD::FPOW, VT, Expand);
813 setOperationAction(ISD::FLOG, VT, Expand);
814 setOperationAction(ISD::FLOG2, VT, Expand);
815 setOperationAction(ISD::FLOG10, VT, Expand);
816 setOperationAction(ISD::FEXP, VT, Expand);
817 setOperationAction(ISD::FEXP2, VT, Expand);
818 }
819
820 // First set operation action for all vector types to either promote
821 // (for widening) or expand (for scalarization). Then we will selectively
822 // turn on ones that can be effectively codegen'd.
823 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
824 setOperationAction(ISD::SDIV, VT, Expand);
825 setOperationAction(ISD::UDIV, VT, Expand);
826 setOperationAction(ISD::SREM, VT, Expand);
827 setOperationAction(ISD::UREM, VT, Expand);
828 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
829 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
830 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
831 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
832 setOperationAction(ISD::FMA, VT, Expand);
833 setOperationAction(ISD::FFLOOR, VT, Expand);
834 setOperationAction(ISD::FCEIL, VT, Expand);
835 setOperationAction(ISD::FTRUNC, VT, Expand);
836 setOperationAction(ISD::FRINT, VT, Expand);
837 setOperationAction(ISD::FNEARBYINT, VT, Expand);
838 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
839 setOperationAction(ISD::MULHS, VT, Expand);
840 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
841 setOperationAction(ISD::MULHU, VT, Expand);
842 setOperationAction(ISD::SDIVREM, VT, Expand);
843 setOperationAction(ISD::UDIVREM, VT, Expand);
844 setOperationAction(ISD::CTPOP, VT, Expand);
845 setOperationAction(ISD::CTTZ, VT, Expand);
846 setOperationAction(ISD::CTLZ, VT, Expand);
847 setOperationAction(ISD::ROTL, VT, Expand);
848 setOperationAction(ISD::ROTR, VT, Expand);
849 setOperationAction(ISD::BSWAP, VT, Expand);
850 setOperationAction(ISD::SETCC, VT, Expand);
851 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
852 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
853 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
854 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
855 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
856 setOperationAction(ISD::TRUNCATE, VT, Expand);
857 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
858 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
859 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
860 setOperationAction(ISD::SELECT_CC, VT, Expand);
861 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
862 setTruncStoreAction(InnerVT, VT, Expand);
863
864 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
865 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
866
867 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
868 // types, we have to deal with them whether we ask for Expansion or not.
869 // Setting Expand causes its own optimisation problems though, so leave
870 // them legal.
871 if (VT.getVectorElementType() == MVT::i1)
872 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
873
874 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
875 // split/scalarized right now.
876 if (VT.getVectorElementType() == MVT::f16)
877 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
878 }
879 }
880
881 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
882 // with -msoft-float, disable use of MMX as well.
883 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
884 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
885 // No operations on x86mmx supported, everything uses intrinsics.
886 }
887
888 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
889 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
890 : &X86::VR128RegClass);
891
892 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
893 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
894 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
895 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
896 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
897 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
898 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
899 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
900
901 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
902 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
903
904 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
906 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
907 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
908 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
909 }
910
911 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
912 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
913 : &X86::VR128RegClass);
914
915 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
916 // registers cannot be used even for integer operations.
917 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
918 : &X86::VR128RegClass);
919 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
920 : &X86::VR128RegClass);
921 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
922 : &X86::VR128RegClass);
923 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
924 : &X86::VR128RegClass);
925
926 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
927 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
928 setOperationAction(ISD::SDIV, VT, Custom);
929 setOperationAction(ISD::SREM, VT, Custom);
930 setOperationAction(ISD::UDIV, VT, Custom);
931 setOperationAction(ISD::UREM, VT, Custom);
932 }
933
934 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
935 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
936 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
937
938 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
939 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
940 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
941 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
942 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
943 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
944 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
945 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
946 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
947 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
948 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
949 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
950
951 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
952 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
953
954 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
955 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
956 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
957
958 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
959 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
960 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
961 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
962 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
963 }
964
965 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
966 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
967 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
968 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
969 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
970 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
971 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
972 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
973 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
974 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
975
976 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
977 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
978 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
979 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
980
981 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
982 setOperationAction(ISD::SETCC, VT, Custom);
983 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
984 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
985 setOperationAction(ISD::CTPOP, VT, Custom);
986 setOperationAction(ISD::ABS, VT, Custom);
987
988 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
989 // setcc all the way to isel and prefer SETGT in some isel patterns.
990 setCondCodeAction(ISD::SETLT, VT, Custom);
991 setCondCodeAction(ISD::SETLE, VT, Custom);
992 }
993
994 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
995 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
996 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
997 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
998 setOperationAction(ISD::VSELECT, VT, Custom);
999 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1000 }
1001
1002 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
1003 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1004 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1005 setOperationAction(ISD::VSELECT, VT, Custom);
1006
1007 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1008 continue;
1009
1010 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1012 }
1013
1014 // Custom lower v2i64 and v2f64 selects.
1015 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1016 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1017 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1018 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1019 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1020
1021 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1022 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1023 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1024 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1025 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1026 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1027
1028 // Custom legalize these to avoid over promotion or custom promotion.
1029 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1030 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1031 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1032 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1033 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1034 }
1035
1036 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1037 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1038 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1039 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1040
1041 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1042 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1043
1044 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1045 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1046
1047 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1048 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1049 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1050 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1051 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1052
1053 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1054 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1055 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1056 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1057
1058 // We want to legalize this to an f64 load rather than an i64 load on
1059 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1060 // store.
1061 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1062 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1063 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1064 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1065 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1066 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1067
1068 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1069 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1070 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1071 if (!Subtarget.hasAVX512())
1072 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1073
1074 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1075 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1076 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1077
1078 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1079
1080 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1081 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1082 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1083 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1084 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1085 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1086
1087 // In the customized shift lowering, the legal v4i32/v2i64 cases
1088 // in AVX2 will be recognized.
1089 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1090 setOperationAction(ISD::SRL, VT, Custom);
1091 setOperationAction(ISD::SHL, VT, Custom);
1092 setOperationAction(ISD::SRA, VT, Custom);
1093 if (VT == MVT::v2i64) continue;
1094 setOperationAction(ISD::ROTL, VT, Custom);
1095 setOperationAction(ISD::ROTR, VT, Custom);
1096 setOperationAction(ISD::FSHL, VT, Custom);
1097 setOperationAction(ISD::FSHR, VT, Custom);
1098 }
1099
1100 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1103 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1104 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1105 }
1106
1107 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1108 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1109 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1110 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1111 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1114 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1115 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1116
1117 // These might be better off as horizontal vector ops.
1118 setOperationAction(ISD::ADD, MVT::i16, Custom);
1119 setOperationAction(ISD::ADD, MVT::i32, Custom);
1120 setOperationAction(ISD::SUB, MVT::i16, Custom);
1121 setOperationAction(ISD::SUB, MVT::i32, Custom);
1122 }
1123
1124 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1125 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1126 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1127 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1128 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1129 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1130 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1131 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1132 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1133 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1134 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1135 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1136 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1137 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1138
1139 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1140 }
1141
1142 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1143 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1144 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1145 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1146 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1147 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1148 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1149 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1150
1151 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1152 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1153 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1154
1155 // FIXME: Do we need to handle scalar-to-vector here?
1156 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1157
1158 // We directly match byte blends in the backend as they match the VSELECT
1159 // condition form.
1160 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1161
1162 // SSE41 brings specific instructions for doing vector sign extend even in
1163 // cases where we don't have SRA.
1164 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1165 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1166 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1167 }
1168
1169 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1170 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1171 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1173 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1174 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1175 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1176 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1177 }
1178
1179 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1180 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181 // do the pre and post work in the vector domain.
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185 // so that DAG combine doesn't try to turn it into uint_to_fp.
1186 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1187 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188 }
1189 }
1190
1191 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1192 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1193 }
1194
1195 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1197 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1198 setOperationAction(ISD::ROTL, VT, Custom);
1199 setOperationAction(ISD::ROTR, VT, Custom);
1200 }
1201
1202 // XOP can efficiently perform BITREVERSE with VPPERM.
1203 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1204 setOperationAction(ISD::BITREVERSE, VT, Custom);
1205
1206 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1207 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1208 setOperationAction(ISD::BITREVERSE, VT, Custom);
1209 }
1210
1211 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1212 bool HasInt256 = Subtarget.hasInt256();
1213
1214 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215 : &X86::VR256RegClass);
1216 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1217 : &X86::VR256RegClass);
1218 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219 : &X86::VR256RegClass);
1220 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1221 : &X86::VR256RegClass);
1222 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1223 : &X86::VR256RegClass);
1224 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1225 : &X86::VR256RegClass);
1226
1227 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1228 setOperationAction(ISD::FFLOOR, VT, Legal);
1229 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1230 setOperationAction(ISD::FCEIL, VT, Legal);
1231 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1232 setOperationAction(ISD::FTRUNC, VT, Legal);
1233 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1234 setOperationAction(ISD::FRINT, VT, Legal);
1235 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1236 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1237 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1238 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1239 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1240
1241 setOperationAction(ISD::FROUND, VT, Custom);
1242
1243 setOperationAction(ISD::FNEG, VT, Custom);
1244 setOperationAction(ISD::FABS, VT, Custom);
1245 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1246 }
1247
1248 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1249 // even though v8i16 is a legal type.
1250 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1253 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1254 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1255 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1256 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1257
1258 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1259 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1260
1261 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1262 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1263 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1264 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1265 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1266 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1267 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1268 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1269 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1270 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1271 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1272 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1273
1274 if (!Subtarget.hasAVX512())
1275 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1276
1277 // In the custom shift lowering, the v8i32/v4i64 cases that are legal
1278 // under AVX2 will be recognized.
1279 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1280 setOperationAction(ISD::SRL, VT, Custom);
1281 setOperationAction(ISD::SHL, VT, Custom);
1282 setOperationAction(ISD::SRA, VT, Custom);
1283 if (VT == MVT::v4i64) continue;
1284 setOperationAction(ISD::ROTL, VT, Custom);
1285 setOperationAction(ISD::ROTR, VT, Custom);
1286 setOperationAction(ISD::FSHL, VT, Custom);
1287 setOperationAction(ISD::FSHR, VT, Custom);
1288 }
1289
1290 // These types need custom splitting if their input is a 128-bit vector.
1291 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1292 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1293 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1294 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1295
1296 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1297 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1298 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1299 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1300 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1302
1303 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1305 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1307 }
1308
1309 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1310 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1311 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1312 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1313
1314 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315 setOperationAction(ISD::SETCC, VT, Custom);
1316 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1317 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1318 setOperationAction(ISD::CTPOP, VT, Custom);
1319 setOperationAction(ISD::CTLZ, VT, Custom);
1320
1321 // These condition codes aren't legal in SSE/AVX, and under AVX512 we keep
1322 // setcc all the way to isel and prefer SETGT in some isel patterns.
1323 setCondCodeAction(ISD::SETLT, VT, Custom);
1324 setCondCodeAction(ISD::SETLE, VT, Custom);
1325 }
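As a side note on the comment above, the scalar identities behind that choice can be stated directly; this is an editorial sketch (SSE/AVX natively provide signed greater-than via PCMPGT), not the exact DAG rewrite performed here, and the helper name is invented.

#include <cassert>
// Less-than style predicates can always be rephrased in terms of
// greater-than, which is the compare form the hardware provides.
void checkCompareIdentities(int A, int B) {
  assert((A < B) == (B > A));   // SETLT(a,b) == SETGT(b,a), operands swapped
  assert((A <= B) == !(A > B)); // SETLE(a,b) == NOT SETGT(a,b)
}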
1326
1327 if (Subtarget.hasAnyFMA()) {
1328 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329 MVT::v2f64, MVT::v4f64 }) {
1330 setOperationAction(ISD::FMA, VT, Legal);
1331 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332 }
1333 }
1334
1335 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338 }
1339
1340 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1341 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1342 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1343 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1344
1345 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1346 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1347 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1348 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1349 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1350 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1351 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1352 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1353
1354 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1355 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1356
1357 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1358 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1359 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1360 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1361 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1362
1363 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1364 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1365 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1370 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1371 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1372 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1373 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1374 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1375
1376 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1377 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1378 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1379 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1380 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1381 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1382 }
1383
1384 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1385 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1386 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1387 }
1388
1389 if (HasInt256) {
1390 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1391 // when we have a 256-bit-wide blend with immediate.
1392 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1394
1395 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1396 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1397 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1398 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1399 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1403 }
1404 }
1405
1406 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1407 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1408 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1409 setOperationAction(ISD::MSTORE, VT, Legal);
1410 }
1411
1412 // Extract subvector is special because the value type
1413 // (result) is 128-bit but the source is 256-bit wide.
1414 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1415 MVT::v4f32, MVT::v2f64 }) {
1416 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1417 }
1418
1419 // Custom lower several nodes for 256-bit types.
1420 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1421 MVT::v8f32, MVT::v4f64 }) {
1422 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1423 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1424 setOperationAction(ISD::VSELECT, VT, Custom);
1425 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1426 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1427 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1428 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1429 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1430 setOperationAction(ISD::STORE, VT, Custom);
1431 }
1432
1433 if (HasInt256) {
1434 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1435
1436 // Custom legalize 2-element 32-bit gathers to get a little better code.
1437 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1438 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1439
1440 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1441 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1442 setOperationAction(ISD::MGATHER, VT, Custom);
1443 }
1444 }
1445
1446 // This block controls legalization of the mask vector sizes that are
1447 // available with AVX512. 512-bit vectors are in a separate block controlled
1448 // by useAVX512Regs.
1449 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1450 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1451 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1452 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1453 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1454 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1455
1456 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1457 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1458 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1459
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1462 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1463 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1464 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1465 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1466 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1467 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1468 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1469 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1470 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1471 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1472
1473 // There is no byte-sized k-register load or store without AVX512DQ.
1474 if (!Subtarget.hasDQI()) {
1475 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1476 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1477 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1478 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1479
1480 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1481 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1482 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1483 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1484 }
1485
1486 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1487 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1488 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1489 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1490 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1491 }
1492
1493 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1494 setOperationAction(ISD::VSELECT, VT, Expand);
1495
1496 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1497 setOperationAction(ISD::SETCC, VT, Custom);
1498 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1499 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1500 setOperationAction(ISD::SELECT, VT, Custom);
1501 setOperationAction(ISD::TRUNCATE, VT, Custom);
1502
1503 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1505 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1507 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1508 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1509 }
1510
1511 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1512 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1513 }
1514
1515 // This block controls legalization for 512-bit operations with 32/64-bit
1516 // elements. 512-bit operations can be disabled based on the prefer-vector-width
1517 // and required-vector-width function attributes.
1518 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1519 bool HasBWI = Subtarget.hasBWI();
1520
1521 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1522 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1523 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1525 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1526 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1527
1528 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1529 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1530 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1531 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1532 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1533 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1534 if (HasBWI)
1535 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1536 }
1537
1538 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1539 setOperationAction(ISD::FNEG, VT, Custom);
1540 setOperationAction(ISD::FABS, VT, Custom);
1541 setOperationAction(ISD::FMA, VT, Legal);
1542 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1543 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1544 }
1545
1546 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1547 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1548 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1549 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1550 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1551 }
1552 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1553 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1554 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1555 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1556 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1557 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1558 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1559 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1560
1561 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1562 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1563 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1564 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1565 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1566 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1567 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1568 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1569 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1570 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1571 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1572 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1573
1574 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1575 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1576 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1577 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1578 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1579 if (HasBWI)
1580 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1581
1582 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1583 // to 512-bit rather than use the AVX2 instructions so that we can use
1584 // k-masks.
1585 if (!Subtarget.hasVLX()) {
1586 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1587 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1588 setOperationAction(ISD::MLOAD, VT, Custom);
1589 setOperationAction(ISD::MSTORE, VT, Custom);
1590 }
1591 }
1592
1593 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1594 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1595 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1596 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1597 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1598 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1599 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1600 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1601 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1602 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1603 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1604 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1605 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1606
1607 if (HasBWI) {
1608 // Extends from v64i1 masks to 512-bit vectors.
1609 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1610 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1611 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1612 }
1613
1614 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1615 setOperationAction(ISD::FFLOOR, VT, Legal);
1616 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1617 setOperationAction(ISD::FCEIL, VT, Legal);
1618 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1619 setOperationAction(ISD::FTRUNC, VT, Legal);
1620 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1621 setOperationAction(ISD::FRINT, VT, Legal);
1622 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1623 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1624 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1625 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1626 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1627
1628 setOperationAction(ISD::FROUND, VT, Custom);
1629 }
1630
1631 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1632 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1633 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1634 }
1635
1636 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1637 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1638 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1639 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1640
1641 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1642 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1643 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1644 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1645
1646 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1647 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1648 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1649 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1650 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1651 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1652 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1653 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1654
1655 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1656 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1657
1658 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1659
1660 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1661 setOperationAction(ISD::SRL, VT, Custom);
1662 setOperationAction(ISD::SHL, VT, Custom);
1663 setOperationAction(ISD::SRA, VT, Custom);
1664 setOperationAction(ISD::ROTL, VT, Custom);
1665 setOperationAction(ISD::ROTR, VT, Custom);
1666 setOperationAction(ISD::SETCC, VT, Custom);
1667
1668 // These condition codes aren't legal in SSE/AVX, and under AVX512 we keep
1669 // setcc all the way to isel and prefer SETGT in some isel patterns.
1670 setCondCodeAction(ISD::SETLT, VT, Custom);
1671 setCondCodeAction(ISD::SETLE, VT, Custom);
1672 }
1673 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1674 setOperationAction(ISD::SMAX, VT, Legal);
1675 setOperationAction(ISD::UMAX, VT, Legal);
1676 setOperationAction(ISD::SMIN, VT, Legal);
1677 setOperationAction(ISD::UMIN, VT, Legal);
1678 setOperationAction(ISD::ABS, VT, Legal);
1679 setOperationAction(ISD::CTPOP, VT, Custom);
1680 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1681 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1682 }
1683
1684 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1685 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1686 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1687 setOperationAction(ISD::CTLZ, VT, Custom);
1688 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1692 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1693 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1694 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1695 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1696 }
1697
1698 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1699 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1700 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1701 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1702 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1703 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1704
1705 if (Subtarget.hasDQI()) {
1706 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1707 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1708 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1709 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1710 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1711 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1712 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1713 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1714
1715 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1716 }
1717
1718 if (Subtarget.hasCDI()) {
1719 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1720 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1721 setOperationAction(ISD::CTLZ, VT, Legal);
1722 }
1723 } // Subtarget.hasCDI()
1724
1725 if (Subtarget.hasVPOPCNTDQ()) {
1726 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1727 setOperationAction(ISD::CTPOP, VT, Legal);
1728 }
1729
1730 // Extract subvector is special because the value type
1731 // (result) is 256-bit but the source is 512-bit wide.
1732 // 128-bit was made Legal under AVX1.
1733 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1734 MVT::v8f32, MVT::v4f64 })
1735 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1736
1737 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1738 MVT::v16f32, MVT::v8f64 }) {
1739 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1740 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1741 setOperationAction(ISD::SELECT, VT, Custom);
1742 setOperationAction(ISD::VSELECT, VT, Custom);
1743 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1744 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1745 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1746 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1747 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1748 }
1749
1750 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1751 setOperationAction(ISD::MLOAD, VT, Legal);
1752 setOperationAction(ISD::MSTORE, VT, Legal);
1753 setOperationAction(ISD::MGATHER, VT, Custom);
1754 setOperationAction(ISD::MSCATTER, VT, Custom);
1755 }
1756 if (HasBWI) {
1757 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1758 setOperationAction(ISD::MLOAD, VT, Legal);
1759 setOperationAction(ISD::MSTORE, VT, Legal);
1760 }
1761 } else {
1762 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1763 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1764 }
1765
1766 if (Subtarget.hasVBMI2()) {
1767 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1768 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1769 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1770 setOperationAction(ISD::FSHL, VT, Custom);
1771 setOperationAction(ISD::FSHR, VT, Custom);
1772 }
1773
1774 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1775 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1776 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1777 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1778 }
1779 }// useAVX512Regs
1780
1781 // This block controls legalization for operations that don't have
1782 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1783 // narrower widths.
1784 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1785 // These operations are handled on non-VLX by artificially widening in
1786 // isel patterns.
1787
1788 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1789 Subtarget.hasVLX() ? Legal : Custom);
1790 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1791 Subtarget.hasVLX() ? Legal : Custom);
1792 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1793 Subtarget.hasVLX() ? Legal : Custom);
1794 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1795 Subtarget.hasVLX() ? Legal : Custom);
1796 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1797 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1798 Subtarget.hasVLX() ? Legal : Custom);
1799 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1800 Subtarget.hasVLX() ? Legal : Custom);
1801 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1802 Subtarget.hasVLX() ? Legal : Custom);
1803 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1804 Subtarget.hasVLX() ? Legal : Custom);
1805
1806 if (Subtarget.hasDQI()) {
1807 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1808 // v2f32 UINT_TO_FP is already custom under SSE2.
1809 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1810 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1811 "Unexpected operation action!");
1812 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1813 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1814 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1815 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1816 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1817 }
1818
1819 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1820 setOperationAction(ISD::SMAX, VT, Legal);
1821 setOperationAction(ISD::UMAX, VT, Legal);
1822 setOperationAction(ISD::SMIN, VT, Legal);
1823 setOperationAction(ISD::UMIN, VT, Legal);
1824 setOperationAction(ISD::ABS, VT, Legal);
1825 }
1826
1827 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1828 setOperationAction(ISD::ROTL, VT, Custom);
1829 setOperationAction(ISD::ROTR, VT, Custom);
1830 }
1831
1832 // Custom legalize 2-element 32-bit scatters to get a little better code.
1833 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1834 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1835
1836 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1837 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1838 setOperationAction(ISD::MSCATTER, VT, Custom);
1839
1840 if (Subtarget.hasDQI()) {
1841 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1842 setOperationAction(ISD::SINT_TO_FP, VT,
1843 Subtarget.hasVLX() ? Legal : Custom);
1844 setOperationAction(ISD::UINT_TO_FP, VT,
1845 Subtarget.hasVLX() ? Legal : Custom);
1846 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1847 Subtarget.hasVLX() ? Legal : Custom);
1848 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1849 Subtarget.hasVLX() ? Legal : Custom);
1850 setOperationAction(ISD::FP_TO_SINT, VT,
1851 Subtarget.hasVLX() ? Legal : Custom);
1852 setOperationAction(ISD::FP_TO_UINT, VT,
1853 Subtarget.hasVLX() ? Legal : Custom);
1854 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1855 Subtarget.hasVLX() ? Legal : Custom);
1856 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1857 Subtarget.hasVLX() ? Legal : Custom);
1858 setOperationAction(ISD::MUL, VT, Legal);
1859 }
1860 }
1861
1862 if (Subtarget.hasCDI()) {
1863 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1864 setOperationAction(ISD::CTLZ, VT, Legal);
1865 }
1866 } // Subtarget.hasCDI()
1867
1868 if (Subtarget.hasVPOPCNTDQ()) {
1869 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1870 setOperationAction(ISD::CTPOP, VT, Legal);
1871 }
1872 }
1873
1874 // This block controls legalization of v32i1/v64i1, which are available with
1875 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1876 // useBWIRegs.
1877 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1878 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1879 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1880
1881 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1882 setOperationAction(ISD::VSELECT, VT, Expand);
1883 setOperationAction(ISD::TRUNCATE, VT, Custom);
1884 setOperationAction(ISD::SETCC, VT, Custom);
1885 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1886 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1887 setOperationAction(ISD::SELECT, VT, Custom);
1888 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1889 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1890 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1891 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1892 }
1893
1894 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1895 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1896
1897 // Extends from v32i1 masks to 256-bit vectors.
1898 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1899 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1900 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1901
1902 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1903 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1904 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1905 }
1906
1907 // These operations are handled on non-VLX by artificially widening in
1908 // isel patterns.
1909 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1910
1911 if (Subtarget.hasBITALG()) {
1912 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1913 setOperationAction(ISD::CTPOP, VT, Legal);
1914 }
1915 }
1916
1917 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
1918 auto setGroup = [&] (MVT VT) {
1919 setOperationAction(ISD::FADD, VT, Legal);
1920 setOperationAction(ISD::STRICT_FADD, VT, Legal);
1921 setOperationAction(ISD::FSUB, VT, Legal);
1922 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
1923 setOperationAction(ISD::FMUL, VT, Legal);
1924 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
1925 setOperationAction(ISD::FDIV, VT, Legal);
1926 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
1927 setOperationAction(ISD::FSQRT, VT, Legal);
1928 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
1929
1930 setOperationAction(ISD::FFLOOR, VT, Legal);
1931 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1932 setOperationAction(ISD::FCEIL, VT, Legal);
1933 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1934 setOperationAction(ISD::FTRUNC, VT, Legal);
1935 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1936 setOperationAction(ISD::FRINT, VT, Legal);
1937 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1938 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1939 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1940
1941 setOperationAction(ISD::LOAD, VT, Legal);
1942 setOperationAction(ISD::STORE, VT, Legal);
1943
1944 setOperationAction(ISD::FMA, VT, Legal);
1945 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1946 setOperationAction(ISD::VSELECT, VT, Legal);
1947 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1948 setOperationAction(ISD::SELECT, VT, Custom);
1949
1950 setOperationAction(ISD::FNEG, VT, Custom);
1951 setOperationAction(ISD::FABS, VT, Custom);
1952 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1953 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1954 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1955 };
1956
1957 // AVX512_FP16 scalar operations
1958 setGroup(MVT::f16);
1959 addRegisterClass(MVT::f16, &X86::FR16XRegClass);
1960 setOperationAction(ISD::FREM, MVT::f16, Promote);
1961 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
1962 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
1963 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
1964 setOperationAction(ISD::SETCC, MVT::f16, Custom);
1965 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
1966 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1967 setOperationAction(ISD::FROUND, MVT::f16, Custom);
1968 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
1969 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
1970 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
1971 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
1972 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
1973 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
1974 if (isTypeLegal(MVT::f80)) {
1975 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
1976 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
1977 }
1978
1979 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
1980 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
1981
1982 if (Subtarget.useAVX512Regs()) {
1983 setGroup(MVT::v32f16);
1984 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1985 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
1986 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
1987 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
1988 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
1989 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
1990 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
1991 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1992 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
1993
1994 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
1995 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
1996 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
1997 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
1998 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
1999 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2000 MVT::v32i16);
2001 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2002 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2003 MVT::v32i16);
2004 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2005 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2006 MVT::v32i16);
2007 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2008 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2009 MVT::v32i16);
2010
2011 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2012 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2013 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2014
2015 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2016 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2017
2018 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2019 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2020 }
2021
2022 if (Subtarget.hasVLX()) {
2023 addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
2024 addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
2025 setGroup(MVT::v8f16);
2026 setGroup(MVT::v16f16);
2027
2028 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2029 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2030 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2031 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2032 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2033 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2034 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2035 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2036 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2037 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2038
2039 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2040 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2041 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2042 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2043 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2044 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2045
2046 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2047 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2048 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2049
2050 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2051 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2052 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2053
2054 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2055 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2056 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2057 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2058
2059 // Need to custom widen these to prevent scalarization.
2060 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2061 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2062 }
2063
2064 // Support fp16 0 immediate
2065 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
2066 }
2067
2068 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2069 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2070 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2071 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2072 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2073 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2074
2075 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2076 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2077 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2078 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2079 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2080
2081 if (Subtarget.hasBWI()) {
2082 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2083 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2084 }
2085
2086 if (Subtarget.hasFP16()) {
2087 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2088 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2089 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2090 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2091 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2092 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2093 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2094 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2095 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2096 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2097 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2098 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2099 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2100 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2101 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2102 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2103 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2104 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2105 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2106 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2107 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2108 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2109 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2110 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2111 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2112 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2113 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2114 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2115 }
2116
2117 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2118 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2119 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2120 }
2121
2122 if (Subtarget.hasAMXTILE()) {
2123 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2124 }
2125
2126 // We want to custom lower some of our intrinsics.
2127 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2128 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2129 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2130 if (!Subtarget.is64Bit()) {
2131 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2132 }
2133
2134 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2135 // handle type legalization for these operations here.
2136 //
2137 // FIXME: We really should do custom legalization for addition and
2138 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2139 // than generic legalization for 64-bit multiplication-with-overflow, though.
2140 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2141 if (VT == MVT::i64 && !Subtarget.is64Bit())
2142 continue;
2143 // Add/Sub/Mul with overflow operations are custom lowered.
2144 setOperationAction(ISD::SADDO, VT, Custom);
2145 setOperationAction(ISD::UADDO, VT, Custom);
2146 setOperationAction(ISD::SSUBO, VT, Custom);
2147 setOperationAction(ISD::USUBO, VT, Custom);
2148 setOperationAction(ISD::SMULO, VT, Custom);
2149 setOperationAction(ISD::UMULO, VT, Custom);
2150
2151 // Support carry in as value rather than glue.
2152 setOperationAction(ISD::ADDCARRY, VT, Custom);
2153 setOperationAction(ISD::SUBCARRY, VT, Custom);
2154 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2155 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2156 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2157 }
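To make the intent of the overflow/carry hooks concrete, a hedged scalar example of the pattern they serve; the builtin is the usual GCC/Clang entry point, and the helper name is invented for illustration.

// Sum and overflow flag are produced together; on x86 this typically becomes
// a single ADD plus a flag read (SETO/SETC) via the SADDO custom lowering.
bool addWithOverflow(int A, int B, int &Sum) {
  return __builtin_sadd_overflow(A, B, &Sum);
}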
2158
2159 if (!Subtarget.is64Bit()) {
2160 // These libcalls are not available in 32-bit.
2161 setLibcallName(RTLIB::SHL_I128, nullptr);
2162 setLibcallName(RTLIB::SRL_I128, nullptr);
2163 setLibcallName(RTLIB::SRA_I128, nullptr);
2164 setLibcallName(RTLIB::MUL_I128, nullptr);
2165 // The MULO libcall is not part of libgcc, only compiler-rt.
2166 setLibcallName(RTLIB::MULO_I64, nullptr);
2167 }
2168 // The MULO libcall is not part of libgcc, only compiler-rt.
2169 setLibcallName(RTLIB::MULO_I128, nullptr);
2170
2171 // Combine sin / cos into _sincos_stret if it is available.
2172 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2173 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2174 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2175 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2176 }
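A source-level shape that can benefit from this combine, as a hedged sketch; whether the merged call is actually emitted still depends on the libcall availability check above, and the function name is invented.

#include <cmath>
// sin and cos of the same argument can be merged into one
// __sincos_stret-style call when the runtime provides it.
void polarToCartesian(float R, float Theta, float &X, float &Y) {
  X = R * std::cos(Theta); // same angle for both calls...
  Y = R * std::sin(Theta); // ...so they are candidates for a single sincos
}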
2177
2178 if (Subtarget.isTargetWin64()) {
2179 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2180 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2181 setOperationAction(ISD::SREM, MVT::i128, Custom);
2182 setOperationAction(ISD::UREM, MVT::i128, Custom);
2183 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2184 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2185 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2187 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2188 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2189 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2190 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2191 }
2192
2193 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2194 // is. We should promote the value to 64-bits to solve this.
2195 // This is what the CRT headers do - `fmodf` is an inline header
2196 // function casting to f64 and calling `fmod`.
2197 if (Subtarget.is32Bit() &&
2198 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2199 for (ISD::NodeType Op :
2200 {ISD::FCEIL, ISD::STRICT_FCEIL,
2201 ISD::FCOS, ISD::STRICT_FCOS,
2202 ISD::FEXP, ISD::STRICT_FEXP,
2203 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2204 ISD::FREM, ISD::STRICT_FREM,
2205 ISD::FLOG, ISD::STRICT_FLOG,
2206 ISD::FLOG10, ISD::STRICT_FLOG10,
2207 ISD::FPOW, ISD::STRICT_FPOW,
2208 ISD::FSIN, ISD::STRICT_FSIN})
2209 if (isOperationExpand(Op, MVT::f32))
2210 setOperationAction(Op, MVT::f32, Promote);
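For clarity, roughly what that promotion amounts to at the source level; a hedged sketch mirroring the CRT-header behaviour described in the comment above, with an invented function name.

#include <cmath>
// f32 -> f64 promotion: compute in double precision, then narrow back.
static inline float fmodfViaPromote(float X, float Y) {
  return static_cast<float>(std::fmod(static_cast<double>(X),
                                      static_cast<double>(Y)));
}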
2211
2212 // We have target-specific dag combine patterns for the following nodes:
2213 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2214 ISD::SCALAR_TO_VECTOR,
2215 ISD::INSERT_VECTOR_ELT,
2216 ISD::EXTRACT_VECTOR_ELT,
2217 ISD::CONCAT_VECTORS,
2218 ISD::INSERT_SUBVECTOR,
2219 ISD::EXTRACT_SUBVECTOR,
2220 ISD::BITCAST,
2221 ISD::VSELECT,
2222 ISD::SELECT,
2223 ISD::SHL,
2224 ISD::SRA,
2225 ISD::SRL,
2226 ISD::OR,
2227 ISD::AND,
2228 ISD::ADD,
2229 ISD::FADD,
2230 ISD::FSUB,
2231 ISD::FNEG,
2232 ISD::FMA,
2233 ISD::STRICT_FMA,
2234 ISD::FMINNUM,
2235 ISD::FMAXNUM,
2236 ISD::SUB,
2237 ISD::LOAD,
2238 ISD::MLOAD,
2239 ISD::STORE,
2240 ISD::MSTORE,
2241 ISD::TRUNCATE,
2242 ISD::ZERO_EXTEND,
2243 ISD::ANY_EXTEND,
2244 ISD::SIGN_EXTEND,
2245 ISD::SIGN_EXTEND_INREG,
2246 ISD::ANY_EXTEND_VECTOR_INREG,
2247 ISD::SIGN_EXTEND_VECTOR_INREG,
2248 ISD::ZERO_EXTEND_VECTOR_INREG,
2249 ISD::SINT_TO_FP,
2250 ISD::UINT_TO_FP,
2251 ISD::STRICT_SINT_TO_FP,
2252 ISD::STRICT_UINT_TO_FP,
2253 ISD::SETCC,
2254 ISD::MUL,
2255 ISD::XOR,
2256 ISD::MSCATTER,
2257 ISD::MGATHER,
2258 ISD::FP16_TO_FP,
2259 ISD::FP_EXTEND,
2260 ISD::STRICT_FP_EXTEND,
2261 ISD::FP_ROUND});
2262
2263 computeRegisterProperties(Subtarget.getRegisterInfo());
2264
2265 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2266 MaxStoresPerMemsetOptSize = 8;
2267 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2268 MaxStoresPerMemcpyOptSize = 4;
2269 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2270 MaxStoresPerMemmoveOptSize = 4;
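As a hedged illustration of what these thresholds govern (the precise cutover also depends on the widest legal store type for the subtarget, so this is indicative only; the struct and function are invented for the example):

#include <cstring>
// With MaxStoresPerMemset = 16, a small fixed-size memset like this is
// normally inlined as a short run of stores rather than a memset libcall.
struct Packet { char Buf[64]; };
void clearPacket(Packet &P) { std::memset(P.Buf, 0, sizeof P.Buf); }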
2271
2272 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2273 // that needs to be benchmarked and balanced with the potential use of vector
2274 // load/store types (PR33329, PR33914).
2275 MaxLoadsPerMemcmp = 2;
2276 MaxLoadsPerMemcmpOptSize = 2;
2277
2278 // Default loop alignment, which can be overridden by -align-loops.
2279 setPrefLoopAlignment(Align(16));
2280
2281 // An out-of-order CPU can speculatively execute past a predictable branch,
2282 // but a conditional move could be stalled by an expensive earlier operation.
2283 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2284 EnableExtLdPromotion = true;
2285 setPrefFunctionAlignment(Align(16));
2286
2287 verifyIntrinsicTables();
2288
2289 // Default to having -disable-strictnode-mutation on
2290 IsStrictFPEnabled = true;
2291}
2292
2293// This has so far only been implemented for 64-bit MachO.
2294bool X86TargetLowering::useLoadStackGuardNode() const {
2295 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2296}
2297
2298bool X86TargetLowering::useStackGuardXorFP() const {
2299 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2300 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2301}
2302
2303SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2304 const SDLoc &DL) const {
2305 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2306 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2307 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2308 return SDValue(Node, 0);
2309}
2310
2311TargetLoweringBase::LegalizeTypeAction
2312X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2313 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2314 !Subtarget.hasBWI())
2315 return TypeSplitVector;
2316
2317 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2318 VT.getVectorElementType() != MVT::i1)
2319 return TypeWidenVector;
2320
2321 return TargetLoweringBase::getPreferredVectorAction(VT);
2322}
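A hedged reading of the rules above as a hypothetical check (not code from the tree; it assumes the usual LLVM headers, <cassert>, and a TLI built for an AVX512 subtarget without BWI):

// v32i1 hits the mask-splitting branch; v3f32 is a non-i1 multi-element
// vector and therefore widens.
void checkPreferredVectorAction(const X86TargetLowering &TLI) {
  assert(TLI.getPreferredVectorAction(MVT::v32i1) ==
         TargetLoweringBase::TypeSplitVector);
  assert(TLI.getPreferredVectorAction(MVT::v3f32) ==
         TargetLoweringBase::TypeWidenVector);
}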
2323
2324static std::pair<MVT, unsigned>
2325handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2326 const X86Subtarget &Subtarget) {
2327 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2328 // convention is one that uses k registers.
2329 if (NumElts == 2)
2330 return {MVT::v2i64, 1};
2331 if (NumElts == 4)
2332 return {MVT::v4i32, 1};
2333 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2334 CC != CallingConv::Intel_OCL_BI)
2335 return {MVT::v8i16, 1};
2336 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2337 CC != CallingConv::Intel_OCL_BI)
2338 return {MVT::v16i8, 1};
2339 // v32i1 passes in ymm unless we have BWI and the calling convention is
2340 // regcall.
2341 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2342 return {MVT::v32i8, 1};
2343 // Split v64i1 vectors if we don't have v64i8 available.
2344 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2345 if (Subtarget.useAVX512Regs())
2346 return {MVT::v64i8, 1};
2347 return {MVT::v32i8, 2};
2348 }
2349
2350 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2351 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2352 NumElts > 64)
2353 return {MVT::i8, NumElts};
2354
2355 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2356}
2357
2358MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2359 CallingConv::ID CC,
2360 EVT VT) const {
2361 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2362 Subtarget.hasAVX512()) {
2363 unsigned NumElts = VT.getVectorNumElements();
2364
2365 MVT RegisterVT;
2366 unsigned NumRegisters;
2367 std::tie(RegisterVT, NumRegisters) =
2368 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2369 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2370 return RegisterVT;
2371 }
2372
2373 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16.
2374 // So its default register type is f16. We override the type to v8f16 here.
2375 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2376 return MVT::v8f16;
2377
2378 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2379 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2380 !Subtarget.hasX87())
2381 return MVT::i32;
2382
2383 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2384}
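A hedged illustration of the mask mapping as a hypothetical check (not tree code; assumes the usual LLVM headers, <cassert>, an AVX512 subtarget, and the default C calling convention):

// Small i1 mask vectors travel in xmm/ymm registers under the C calling
// convention, per handleMaskRegisterForCallingConv above.
void checkMaskRegisterTypes(const X86TargetLowering &TLI, LLVMContext &Ctx) {
  assert(TLI.getRegisterTypeForCallingConv(Ctx, CallingConv::C, MVT::v8i1) ==
         MVT::v8i16);  // one xmm
  assert(TLI.getRegisterTypeForCallingConv(Ctx, CallingConv::C, MVT::v32i1) ==
         MVT::v32i8);  // one ymm
}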
2385
2386unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2387 CallingConv::ID CC,
2388 EVT VT) const {
2389 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2390 Subtarget.hasAVX512()) {
2391 unsigned NumElts = VT.getVectorNumElements();
2392
2393 MVT RegisterVT;
2394 unsigned NumRegisters;
2395 std::tie(RegisterVT, NumRegisters) =
2396 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2397 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2398 return NumRegisters;
2399 }
2400
2401 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16.
2402 // So its default register number is 3. We override the number to 1 here.
2403 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2404 return 1;
2405
2406 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2407 // x87 is disabled.
2408 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2409 if (VT == MVT::f64)
2410 return 2;
2411 if (VT == MVT::f80)
2412 return 3;
2413 }
2414
2415 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2416}
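And a matching hedged sketch of the no-x87, 32-bit path (hypothetical check, not tree code; Ctx and CC stand for any context and calling convention, and the usual LLVM headers plus <cassert> are assumed):

// With x87 disabled on 32-bit, f64 is carried in two i32 registers and
// f80 in three, per the code above.
void checkSoftX87RegCounts(const X86TargetLowering &TLI, LLVMContext &Ctx,
                           CallingConv::ID CC) {
  assert(TLI.getNumRegistersForCallingConv(Ctx, CC, MVT::f64) == 2);
  assert(TLI.getNumRegistersForCallingConv(Ctx, CC, MVT::f80) == 3);
}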
2417
2418unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2419 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2420 unsigned &NumIntermediates, MVT &RegisterVT) const {
2421 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2422 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2423 Subtarget.hasAVX512() &&
2424 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2425 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2426 VT.getVectorNumElements() > 64)) {
2427 RegisterVT = MVT::i8;
2428 IntermediateVT = MVT::i1;
2429 NumIntermediates = VT.getVectorNumElements();
2430 return NumIntermediates;
2431 }
2432
2433 // Split v64i1 vectors if we don't have v64i8 available.
2434 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2435 CC != CallingConv::X86_RegCall) {
2436 RegisterVT = MVT::v32i8;
2437 IntermediateVT = MVT::v32i1;
2438 NumIntermediates = 2;
2439 return 2;
2440 }
2441
2442 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2443 NumIntermediates, RegisterVT);
2444}
2445
2446EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2447 LLVMContext& Context,
2448 EVT VT) const {
2449 if (!VT.isVector())
2450 return MVT::i8;
2451
2452 if (Subtarget.hasAVX512()) {
2453 // Figure out what this type will be legalized to.
2454 EVT LegalVT = VT;
2455 while (getTypeAction(Context, LegalVT) != TypeLegal)
2456 LegalVT = getTypeToTransformTo(Context, LegalVT);
2457
2458 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2459 if (LegalVT.getSimpleVT().is512BitVector())
2460 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2461
2462 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2463 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2464 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2465 // vXi16/vXi8.
2466 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2467 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2468 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2469 }
2470 }
2471
2472 return VT.changeVectorElementTypeToInteger();
2473}
2474
2475/// Helper for getByValTypeAlignment to determine
2476/// the desired ByVal argument alignment.
2477static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2478 if (MaxAlign == 16)
2479 return;
2480 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2481 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2482 MaxAlign = Align(16);
2483 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2484 Align EltAlign;
2485 getMaxByValAlign(ATy->getElementType(), EltAlign);
2486 if (EltAlign > MaxAlign)
2487 MaxAlign = EltAlign;
2488 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2489 for (auto *EltTy : STy->elements()) {
2490 Align EltAlign;
2491 getMaxByValAlign(EltTy, EltAlign);
2492 if (EltAlign > MaxAlign)
2493 MaxAlign = EltAlign;
2494 if (MaxAlign == 16)
2495 break;
2496 }
2497 }
2498}
2499
2500/// Return the desired alignment for ByVal aggregate
2501/// function arguments in the caller parameter area. For X86, aggregates
2502/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2503/// are at 4-byte boundaries.
2504uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2505 const DataLayout &DL) const {
2506 if (Subtarget.is64Bit()) {
2507 // Max of 8 and alignment of type.
2508 Align TyAlign = DL.getABITypeAlign(Ty);
2509 if (TyAlign > 8)
2510 return TyAlign.value();
2511 return 8;
2512 }
2513
2514 Align Alignment(4);
2515 if (Subtarget.hasSSE1())
2516 getMaxByValAlign(Ty, Alignment);
2517 return Alignment.value();
2518}
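The rule above reduces to a small function; a minimal sketch, assuming the flags and TypeAbiAlign are stand-ins for DL.getABITypeAlign(Ty) and for what getMaxByValAlign discovers recursively:

#include <algorithm>
#include <cstdint>

static uint64_t byValAlignSketch(bool Is64Bit, uint64_t TypeAbiAlign,
                                 bool HasSSE1, bool Contains128BitVector) {
  if (Is64Bit)
    return std::max<uint64_t>(TypeAbiAlign, 8); // at least 8 bytes on x86-64
  if (HasSSE1 && Contains128BitVector)
    return 16;  // aggregates containing an SSE vector: 16-byte boundary
  return 4;     // everything else on 32-bit: 4-byte boundary
}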
2519
2520/// It returns EVT::Other if the type should be determined using generic
2521/// target-independent logic.
2522/// For vector ops we check that the overall size isn't larger than our
2523/// preferred vector width.
2524EVT X86TargetLowering::getOptimalMemOpType(
2525 const MemOp &Op, const AttributeList &FuncAttributes) const {
2526 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2527 if (Op.size() >= 16 &&
2528 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2529 // FIXME: Check if unaligned 64-byte accesses are slow.
2530 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2531 (Subtarget.getPreferVectorWidth() >= 512)) {
2532 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2533 }
2534 // FIXME: Check if unaligned 32-byte accesses are slow.
2535 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2536 (Subtarget.getPreferVectorWidth() >= 256)) {
2537 // Although this isn't a well-supported type for AVX1, we'll let
2538 // legalization and shuffle lowering produce the optimal codegen. If we
2539 // choose an optimal type with a vector element larger than a byte,
2540 // getMemsetStores() may create an intermediate splat (using an integer
2541 // multiply) before we splat as a vector.
2542 return MVT::v32i8;
2543 }
2544 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2545 return MVT::v16i8;
2546 // TODO: Can SSE1 handle a byte vector?
2547 // If we have SSE1 registers we should be able to use them.
2548 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2549 (Subtarget.getPreferVectorWidth() >= 128))
2550 return MVT::v4f32;
2551 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2552 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2553 // Do not use f64 to lower memcpy if source is string constant. It's
2554 // better to use i32 to avoid the loads.
2555 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2556 // The gymnastics of splatting a byte value into an XMM register and then
2557 // only using 8-byte stores (because this is a CPU with slow unaligned
2558 // 16-byte accesses) makes that a loser.
2559 return MVT::f64;
2560 }
2561 }
2562 // This is a compromise. If we reach here, unaligned accesses may be slow on
2563 // this target. However, creating smaller, aligned accesses could be even
2564 // slower and would certainly be a lot more code.
2565 if (Subtarget.is64Bit() && Op.size() >= 8)
2566 return MVT::i64;
2567 return MVT::i32;
2568}
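A worked example of the selection above, assuming unaligned 16-byte access is fast (or the destination is 16-byte aligned) and the function is not marked NoImplicitFloat: for a 64-byte zero memset the routine picks
  v64i8  with AVX-512BW and a preferred vector width of at least 512 (v16i32 without BWI),
  v32i8  with AVX and a preferred width of at least 256,
  v16i8  with SSE2,
  and otherwise i64 stores on 64-bit targets or i32 stores elsewhere.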
2569
2570bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2571 if (VT == MVT::f32)
2572 return Subtarget.hasSSE1();
2573 if (VT == MVT::f64)
2574 return Subtarget.hasSSE2();
2575 return true;
2576}
2577
2578bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2579 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2580 bool *Fast) const {
2581 if (Fast) {
2582 switch (VT.getSizeInBits()) {
2583 default:
2584 // 8-byte and under are always assumed to be fast.
2585 *Fast = true;
2586 break;
2587 case 128:
2588 *Fast = !Subtarget.isUnalignedMem16Slow();
2589 break;
2590 case 256:
2591 *Fast = !Subtarget.isUnalignedMem32Slow();
2592 break;
2593 // TODO: What about AVX-512 (512-bit) accesses?
2594 }
2595 }
2596 // NonTemporal vector memory ops must be aligned.
2597 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2598 // NT loads can only be vector aligned, so if it's less aligned than the
2599 // minimum vector size (which we can split the vector down to), we might as
2600 // well use a regular unaligned vector load.
2601 // We don't have any NT loads pre-SSE41.
2602 if (!!(Flags & MachineMemOperand::MOLoad))
2603 return (Alignment < 16 || !Subtarget.hasSSE41());
2604 return false;
2605 }
2606 // Misaligned accesses of any size are always allowed.
2607 return true;
2608}
2609
2610/// Return the entry encoding for a jump table in the
2611/// current function. The returned value is a member of the
2612/// MachineJumpTableInfo::JTEntryKind enum.
2613unsigned X86TargetLowering::getJumpTableEncoding() const {
2614 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2615 // symbol.
2616 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2617 return MachineJumpTableInfo::EK_Custom32;
2618
2619 // Otherwise, use the normal jump table encoding heuristics.
2620 return TargetLowering::getJumpTableEncoding();
2621}
2622
2623bool X86TargetLowering::useSoftFloat() const {
2624 return Subtarget.useSoftFloat();
2625}
2626
2627void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2628 ArgListTy &Args) const {
2629
2630 // Only relabel X86-32 for C / Stdcall CCs.
2631 if (Subtarget.is64Bit())
2632 return;
2633 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2634 return;
2635 unsigned ParamRegs = 0;
2636 if (auto *M = MF->getFunction().getParent())
2637 ParamRegs = M->getNumberRegisterParameters();
2638
2639 // Mark the first N int arguments as being passed in registers.
2640 for (auto &Arg : Args) {
2641 Type *T = Arg.Ty;
2642 if (T->isIntOrPtrTy())
2643 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2644 unsigned numRegs = 1;
2645 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2646 numRegs = 2;
2647 if (ParamRegs < numRegs)
2648 return;
2649 ParamRegs -= numRegs;
2650 Arg.IsInReg = true;
2651 }
2652 }
2653}
2654
2655const MCExpr *
2656X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2657 const MachineBasicBlock *MBB,
2658 unsigned uid,MCContext &Ctx) const{
2659 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2660 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2661 // entries.
2662 return MCSymbolRefExpr::create(MBB->getSymbol(),
2663 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2664}
2665
2666/// Returns relocation base for the given PIC jumptable.
2667SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2668 SelectionDAG &DAG) const {
2669 if (!Subtarget.is64Bit())
2670 // This doesn't have SDLoc associated with it, but is not really the
2671 // same as a Register.
2672 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2673 getPointerTy(DAG.getDataLayout()));
2674 return Table;
2675}
2676
2677/// This returns the relocation base for the given PIC jumptable,
2678/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2679const MCExpr *X86TargetLowering::
2680getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2681 MCContext &Ctx) const {
2682 // X86-64 uses RIP relative addressing based on the jump table label.
2683 if (Subtarget.isPICStyleRIPRel())
2684 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2685
2686 // Otherwise, the reference is relative to the PIC base.
2687 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2688}
2689
2690std::pair<const TargetRegisterClass *, uint8_t>
2691X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2692 MVT VT) const {
2693 const TargetRegisterClass *RRC = nullptr;
2694 uint8_t Cost = 1;
2695 switch (VT.SimpleTy) {
2696 default:
2697 return TargetLowering::findRepresentativeClass(TRI, VT);
2698 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2699 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2700 break;
2701 case MVT::x86mmx:
2702 RRC = &X86::VR64RegClass;
2703 break;
2704 case MVT::f32: case MVT::f64:
2705 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2706 case MVT::v4f32: case MVT::v2f64:
2707 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2708 case MVT::v8f32: case MVT::v4f64:
2709 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2710 case MVT::v16f32: case MVT::v8f64:
2711 RRC = &X86::VR128XRegClass;
2712 break;
2713 }
2714 return std::make_pair(RRC, Cost);
2715}
2716
2717unsigned X86TargetLowering::getAddressSpace() const {
2718 if (Subtarget.is64Bit())
2719 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2720 return 256;
2721}
2722
2723static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2724 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2725 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2726}
2727
2728static Constant* SegmentOffset(IRBuilderBase &IRB,
2729 int Offset, unsigned AddressSpace) {
2730 return ConstantExpr::getIntToPtr(
2731 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2732 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2733}
2734
2735Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2736 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2737 // tcbhead_t; use it instead of the usual global variable (see
2738 // sysdeps/{i386,x86_64}/nptl/tls.h)
2739 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2740 if (Subtarget.isTargetFuchsia()) {
2741 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2742 return SegmentOffset(IRB, 0x10, getAddressSpace());
2743 } else {
2744 unsigned AddressSpace = getAddressSpace();
2745 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2746 // In particular, some users may customize the base reg and offset.
2747 int Offset = M->getStackProtectorGuardOffset();
2748 // If we don't set -stack-protector-guard-offset value:
2749 // %fs:0x28, unless we're using a Kernel code model, in which case
2750 // it's %gs:0x28. gs:0x14 on i386.
2751 if (Offset == INT_MAX)
2752 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2753
2754 StringRef GuardReg = M->getStackProtectorGuardReg();
2755 if (GuardReg == "fs")
2756 AddressSpace = X86AS::FS;
2757 else if (GuardReg == "gs")
2758 AddressSpace = X86AS::GS;
2759 return SegmentOffset(IRB, Offset, AddressSpace);
2760 }
2761 }
2762 return TargetLowering::getIRStackGuard(IRB);
2763}
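A minimal sketch of the default slot chosen above when no -stack-protector-guard-* override is present (the helper and flags are hypothetical; address space 256 selects %gs and 257 selects %fs in this backend, which is what getAddressSpace() encodes):

struct GuardLocSketch { const char *Segment; int Offset; };

static GuardLocSketch defaultGuardSlotSketch(bool Is64Bit, bool IsFuchsia,
                                             bool KernelCodeModel) {
  if (IsFuchsia)
    return {"fs", 0x10};                         // ZX_TLS_STACK_GUARD_OFFSET
  if (!Is64Bit)
    return {"gs", 0x14};                         // i386 glibc/bionic slot
  return {KernelCodeModel ? "gs" : "fs", 0x28};  // x86-64 tcbhead_t slot
}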
2764
2765void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2766 // MSVC CRT provides functionalities for stack protection.
2767 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2768 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2769 // MSVC CRT has a global variable holding security cookie.
2770 M.getOrInsertGlobal("__security_cookie",
2771 Type::getInt8PtrTy(M.getContext()));
2772
2773 // MSVC CRT has a function to validate security cookie.
2774 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2775 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2776 Type::getInt8PtrTy(M.getContext()));
2777 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2778 F->setCallingConv(CallingConv::X86_FastCall);
2779 F->addParamAttr(0, Attribute::AttrKind::InReg);
2780 }
2781 return;
2782 }
2783
2784 StringRef GuardMode = M.getStackProtectorGuard();
2785
2786 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2787 if ((GuardMode == "tls" || GuardMode.empty()) &&
2788 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2789 return;
2790 TargetLowering::insertSSPDeclarations(M);
2791}
2792
2793Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2794 // MSVC CRT has a global variable holding security cookie.
2795 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2796 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2797 return M.getGlobalVariable("__security_cookie");
2798 }
2799 return TargetLowering::getSDagStackGuard(M);
2800}
2801
2802Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2803 // MSVC CRT has a function to validate security cookie.
2804 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2805 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2806 return M.getFunction("__security_check_cookie");
2807 }
2808 return TargetLowering::getSSPStackGuardCheck(M);
2809}
2810
2811Value *
2812X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2813 if (Subtarget.getTargetTriple().isOSContiki())
2814 return getDefaultSafeStackPointerLocation(IRB, false);
2815
2816 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2817 // definition of TLS_SLOT_SAFESTACK in
2818 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2819 if (Subtarget.isTargetAndroid()) {
2820 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2821 // %gs:0x24 on i386
2822 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2823 return SegmentOffset(IRB, Offset, getAddressSpace());
2824 }
2825
2826 // Fuchsia is similar.
2827 if (Subtarget.isTargetFuchsia()) {
2828 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2829 return SegmentOffset(IRB, 0x18, getAddressSpace());
2830 }
2831
2832 return TargetLowering::getSafeStackPointerLocation(IRB);
2833}
2834
2835//===----------------------------------------------------------------------===//
2836// Return Value Calling Convention Implementation
2837//===----------------------------------------------------------------------===//
2838
2839bool X86TargetLowering::CanLowerReturn(
2840 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2841 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2842 SmallVector<CCValAssign, 16> RVLocs;
2843 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2844 return CCInfo.CheckReturn(Outs, RetCC_X86);
2845}
2846
2847const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2848 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2849 return ScratchRegs;
2850}
2851
2852 /// Lowers mask values (v*i1) to the local register values
2853/// \returns DAG node after lowering to register type
2854static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2855 const SDLoc &Dl, SelectionDAG &DAG) {
2856 EVT ValVT = ValArg.getValueType();
2857
2858 if (ValVT == MVT::v1i1)
2859 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2860 DAG.getIntPtrConstant(0, Dl));
2861
2862 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2863 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2864 // Two stage lowering might be required
2865 // bitcast: v8i1 -> i8 / v16i1 -> i16
2866 // anyextend: i8 -> i32 / i16 -> i32
2867 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2868 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2869 if (ValLoc == MVT::i32)
2870 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2871 return ValToCopy;
2872 }
2873
2874 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2875 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2876 // One stage lowering is required
2877 // bitcast: v32i1 -> i32 / v64i1 -> i64
2878 return DAG.getBitcast(ValLoc, ValArg);
2879 }
2880
2881 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2882}
2883
2884 /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG
2885static void Passv64i1ArgInRegs(
2886 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2887 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2888 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2889 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2890 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2891 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2892 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2893        "The value should reside in two registers");
2894
2895 // Before splitting the value we cast it to i64
2896 Arg = DAG.getBitcast(MVT::i64, Arg);
2897
2898 // Splitting the value into two i32 types
2899 SDValue Lo, Hi;
2900 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2901 DAG.getConstant(0, Dl, MVT::i32));
2902 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2903 DAG.getConstant(1, Dl, MVT::i32));
2904
2905 // Attach the two i32 types into corresponding registers
2906 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2907 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2908}
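A scalar-level illustration of the same split (a standalone sketch, not DAG code): the v64i1 is bitcast to i64, the low 32 bits go to the first register and the high 32 bits to the next one, and getv64i1Argument further down reassembles them in the same order.

#include <cstdint>

static void splitMask64(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Mask);        // EXTRACT_ELEMENT index 0
  Hi = static_cast<uint32_t>(Mask >> 32);  // EXTRACT_ELEMENT index 1
}

static uint64_t joinMask64(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // CONCAT_VECTORS of two v32i1
}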
2909
2910SDValue
2911X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2912 bool isVarArg,
2913 const SmallVectorImpl<ISD::OutputArg> &Outs,
2914 const SmallVectorImpl<SDValue> &OutVals,
2915 const SDLoc &dl, SelectionDAG &DAG) const {
2916 MachineFunction &MF = DAG.getMachineFunction();
2917 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2918
2919 // In some cases we need to disable registers from the default CSR list.
2920 // For example, when they are used for argument passing.
2921 bool ShouldDisableCalleeSavedRegister =
2922 CallConv == CallingConv::X86_RegCall ||
2923 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2924
2925 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2926 report_fatal_error("X86 interrupts may not return any value");
2927
2928 SmallVector<CCValAssign, 16> RVLocs;
2929 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2930 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2931
2932 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2933 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2934 ++I, ++OutsIndex) {
2935 CCValAssign &VA = RVLocs[I];
2936 assert(VA.isRegLoc() && "Can only return in registers!");
2937
2938 // Add the register to the CalleeSaveDisableRegs list.
2939 if (ShouldDisableCalleeSavedRegister)
2940 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2941
2942 SDValue ValToCopy = OutVals[OutsIndex];
2943 EVT ValVT = ValToCopy.getValueType();
2944
2945 // Promote values to the appropriate types.
2946 if (VA.getLocInfo() == CCValAssign::SExt)
2947 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2948 else if (VA.getLocInfo() == CCValAssign::ZExt)
2949 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2950 else if (VA.getLocInfo() == CCValAssign::AExt) {
2951 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2952 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2953 else
2954 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2955 }
2956 else if (VA.getLocInfo() == CCValAssign::BCvt)
2957 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2958
2959 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2960        "Unexpected FP-extend for return value.");
2961
2962 // Report an error if we have attempted to return a value via an XMM
2963 // register and SSE was disabled.
2964 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2965 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2966 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2967 } else if (!Subtarget.hasSSE2() &&
2968 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2969 ValVT == MVT::f64) {
2970 // When returning a double via an XMM register, report an error if SSE2 is
2971 // not enabled.
2972 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2973 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2974 }
2975
2976 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2977 // the RET instruction and handled by the FP Stackifier.
2978 if (VA.getLocReg() == X86::FP0 ||
2979 VA.getLocReg() == X86::FP1) {
2980 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2981 // change the value to the FP stack register class.
2982 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2983 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2984 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2985 // Don't emit a copytoreg.
2986 continue;
2987 }
2988
2989 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2990 // which is returned in RAX / RDX.
2991 if (Subtarget.is64Bit()) {
2992 if (ValVT == MVT::x86mmx) {
2993 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2994 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2995 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2996 ValToCopy);
2997 // If we don't have SSE2 available, convert to v4f32 so the generated
2998 // register is legal.
2999 if (!Subtarget.hasSSE2())
3000 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3001 }
3002 }
3003 }
3004
3005 if (VA.needsCustom()) {
3006 assert(VA.getValVT() == MVT::v64i1 &&
3007        "Currently the only custom case is when we split v64i1 to 2 regs");
3008
3009 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3010 Subtarget);
3011
3012 // Add the second register to the CalleeSaveDisableRegs list.
3013 if (ShouldDisableCalleeSavedRegister)
3014 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3015 } else {
3016 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3017 }
3018 }
3019
3020 SDValue Flag;
3021 SmallVector<SDValue, 6> RetOps;
3022 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3023 // Operand #1 = Bytes To Pop
3024 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3025 MVT::i32));
3026
3027 // Copy the result values into the output registers.
3028 for (auto &RetVal : RetVals) {
3029 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3030 RetOps.push_back(RetVal.second);
3031 continue; // Don't emit a copytoreg.
3032 }
3033
3034 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
3035 Flag = Chain.getValue(1);
3036 RetOps.push_back(
3037 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3038 }
3039
3040 // Swift calling convention does not require we copy the sret argument
3041 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3042
3043 // All x86 ABIs require that for returning structs by value we copy
3044 // the sret argument into %rax/%eax (depending on ABI) for the return.
3045 // We saved the argument into a virtual register in the entry block,
3046 // so now we copy the value out and into %rax/%eax.
3047 //
3048 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3049 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3050 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3051 // either case FuncInfo->setSRetReturnReg() will have been called.
3052 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3053 // When we have both sret and another return value, we should use the
3054 // original Chain stored in RetOps[0], instead of the current Chain updated
3055 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3056
3057 // For the case of sret and another return value, we have
3058 // Chain_0 at the function entry
3059 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3060 // If we use Chain_1 in getCopyFromReg, we will have
3061 // Val = getCopyFromReg(Chain_1)
3062 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3063
3064 // getCopyToReg(Chain_0) will be glued together with
3065 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3066 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3067 // Data dependency from Unit B to Unit A due to usage of Val in
3068 // getCopyToReg(Chain_1, Val)
3069 // Chain dependency from Unit A to Unit B
3070
3071 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3072 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3073 getPointerTy(MF.getDataLayout()));
3074
3075 Register RetValReg
3076 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3077 X86::RAX : X86::EAX;
3078 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3079 Flag = Chain.getValue(1);
3080
3081 // RAX/EAX now acts like a return value.
3082 RetOps.push_back(
3083 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3084
3085 // Add the returned register to the CalleeSaveDisableRegs list.
3086 if (ShouldDisableCalleeSavedRegister)
3087 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3088 }
3089
3090 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3091 const MCPhysReg *I =
3092 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3093 if (I) {
3094 for (; *I; ++I) {
3095 if (X86::GR64RegClass.contains(*I))
3096 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3097 else
3098 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3099 }
3100 }
3101
3102 RetOps[0] = Chain; // Update chain.
3103
3104 // Add the flag if we have it.
3105 if (Flag.getNode())
3106 RetOps.push_back(Flag);
3107
3108 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3109 if (CallConv == CallingConv::X86_INTR)
3110 opcode = X86ISD::IRET;
3111 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3112}
3113
3114bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3115 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3116 return false;
3117
3118 SDValue TCChain = Chain;
3119 SDNode *Copy = *N->use_begin();
3120 if (Copy->getOpcode() == ISD::CopyToReg) {
3121 // If the copy has a glue operand, we conservatively assume it isn't safe to
3122 // perform a tail call.
3123 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3124 return false;
3125 TCChain = Copy->getOperand(0);
3126 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3127 return false;
3128
3129 bool HasRet = false;
3130 for (const SDNode *U : Copy->uses()) {
3131 if (U->getOpcode() != X86ISD::RET_FLAG)
3132 return false;
3133 // If we are returning more than one value, we can definitely
3134 // not make a tail call; see PR19530.
3135 if (U->getNumOperands() > 4)
3136 return false;
3137 if (U->getNumOperands() == 4 &&
3138 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3139 return false;
3140 HasRet = true;
3141 }
3142
3143 if (!HasRet)
3144 return false;
3145
3146 Chain = TCChain;
3147 return true;
3148}
3149
3150EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3151 ISD::NodeType ExtendKind) const {
3152 MVT ReturnMVT = MVT::i32;
3153
3154 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3155 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3156 // The ABI does not require i1, i8 or i16 to be extended.
3157 //
3158 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3159 // always extending i8/i16 return values, so keep doing that for now.
3160 // (PR26665).
3161 ReturnMVT = MVT::i8;
3162 }
3163
3164 EVT MinVT = getRegisterType(Context, ReturnMVT);
3165 return VT.bitsLT(MinVT) ? MinVT : VT;
3166}
3167
3168/// Reads two 32 bit registers and creates a 64 bit mask value.
3169 /// \param VA The current 32 bit value that needs to be assigned.
3170 /// \param NextVA The next 32 bit value that needs to be assigned.
3171/// \param Root The parent DAG node.
3172 /// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
3173 ///        glue purposes. If the DAG is already using a
3174 ///        physical register instead of a virtual one, we should glue
3175 ///        our new SDValue to the InFlag SDValue.
3176 /// \return a new SDValue of size 64 bits.
3177static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3178 SDValue &Root, SelectionDAG &DAG,
3179 const SDLoc &Dl, const X86Subtarget &Subtarget,
3180 SDValue *InFlag = nullptr) {
3181 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3182 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3183 assert(VA.getValVT() == MVT::v64i1 &&
3184        "Expecting first location of 64 bit width type");
3185 assert(NextVA.getValVT() == VA.getValVT() &&
3186        "The locations should have the same type");
3187 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3188        "The values should reside in two registers");
3189
3190 SDValue Lo, Hi;
3191 SDValue ArgValueLo, ArgValueHi;
3192
3193 MachineFunction &MF = DAG.getMachineFunction();
3194 const TargetRegisterClass *RC = &X86::GR32RegClass;
3195
3196 // Read a 32 bit value from the registers.
3197 if (nullptr == InFlag) {
3198 // When no physical register is present,
3199 // create an intermediate virtual register.
3200 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3201 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3202 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3203 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3204 } else {
3205 // When a physical register is available read the value from it and glue
3206 // the reads together.
3207 ArgValueLo =
3208 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3209 *InFlag = ArgValueLo.getValue(2);
3210 ArgValueHi =
3211 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3212 *InFlag = ArgValueHi.getValue(2);
3213 }
3214
3215 // Convert the i32 type into v32i1 type.
3216 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3217
3218 // Convert the i32 type into v32i1 type.
3219 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3220
3221 // Concatenate the two values together.
3222 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3223}
3224
3225/// The function will lower a register of various sizes (8/16/32/64)
3226/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3227 /// \returns a DAG node containing the operand after lowering to the mask type.
3228static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3229 const EVT &ValLoc, const SDLoc &Dl,
3230 SelectionDAG &DAG) {
3231 SDValue ValReturned = ValArg;
3232
3233 if (ValVT == MVT::v1i1)
3234 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3235
3236 if (ValVT == MVT::v64i1) {
3237 // On a 32 bit machine, this case is handled by getv64i1Argument
3238 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3239 // On a 64 bit machine, there is no need to truncate the value, only bitcast it.
3240 } else {
3241 MVT maskLen;
3242 switch (ValVT.getSimpleVT().SimpleTy) {
3243 case MVT::v8i1:
3244 maskLen = MVT::i8;
3245 break;
3246 case MVT::v16i1:
3247 maskLen = MVT::i16;
3248 break;
3249 case MVT::v32i1:
3250 maskLen = MVT::i32;
3251 break;
3252 default:
3253 llvm_unreachable("Expecting a vector of i1 types");
3254 }
3255
3256 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3257 }
3258 return DAG.getBitcast(ValVT, ValReturned);
3259}
3260
3261/// Lower the result values of a call into the
3262/// appropriate copies out of appropriate physical registers.
3263///
3264SDValue X86TargetLowering::LowerCallResult(
3265 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3266 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3267 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3268 uint32_t *RegMask) const {
3269
3270 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3271 // Assign locations to each value returned by this call.
3272 SmallVector<CCValAssign, 16> RVLocs;
3273 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3274 *DAG.getContext());
3275 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3276
3277 // Copy all of the result registers out of their specified physreg.
3278 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3279 ++I, ++InsIndex) {
3280 CCValAssign &VA = RVLocs[I];
3281 EVT CopyVT = VA.getLocVT();
3282
3283 // In some calling conventions we need to remove the used registers
3284 // from the register mask.
3285 if (RegMask) {
3286 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3287 SubRegs.isValid(); ++SubRegs)
3288 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3289 }
3290
3291 // Report an error if there was an attempt to return FP values via XMM
3292 // registers.
3293 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3294 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3295 if (VA.getLocReg() == X86::XMM1)
3296 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3297 else
3298 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3299 } else if (!Subtarget.hasSSE2() &&
3300 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3301 CopyVT == MVT::f64) {
3302 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3303 if (VA.getLocReg() == X86::XMM1)
3304 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3305 else
3306 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3307 }
3308
3309 // If we prefer to use the value in xmm registers, copy it out as f80 and
3310 // use a truncate to move it from fp stack reg to xmm reg.
3311 bool RoundAfterCopy = false;
3312 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3313 isScalarFPTypeInSSEReg(VA.getValVT())) {
3314 if (!Subtarget.hasX87())
3315 report_fatal_error("X87 register return with X87 disabled");
3316 CopyVT = MVT::f80;
3317 RoundAfterCopy = (CopyVT != VA.getLocVT());
3318 }
3319
3320 SDValue Val;
3321 if (VA.needsCustom()) {
3322 assert(VA.getValVT() == MVT::v64i1 &&
3323        "Currently the only custom case is when we split v64i1 to 2 regs");
3324 Val =
3325 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3326 } else {
3327 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3328 .getValue(1);
3329 Val = Chain.getValue(0);
3330 InFlag = Chain.getValue(2);
3331 }
3332
3333 if (RoundAfterCopy)
3334 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3335 // This truncation won't change the value.
3336 DAG.getIntPtrConstant(1, dl));
3337
3338 if (VA.isExtInLoc()) {
3339 if (VA.getValVT().isVector() &&
3340 VA.getValVT().getScalarType() == MVT::i1 &&
3341 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3342 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3343 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3344 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3345 } else
3346 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3347 }
3348
3349 if (VA.getLocInfo() == CCValAssign::BCvt)
3350 Val = DAG.getBitcast(VA.getValVT(), Val);
3351
3352 InVals.push_back(Val);
3353 }
3354
3355 return Chain;
3356}
3357
3358//===----------------------------------------------------------------------===//
3359// C & StdCall & Fast Calling Convention implementation
3360//===----------------------------------------------------------------------===//
3361// StdCall calling convention seems to be standard for many Windows' API
3362// routines and around. It differs from C calling convention just a little:
3363// callee should clean up the stack, not caller. Symbols should be also
3364// decorated in some fancy way :) It doesn't support any vector arguments.
3365// For info on fast calling convention see Fast Calling Convention (tail call)
3366// implementation LowerX86_32FastCCCallTo.
3367
3368/// Determines whether Args, either a set of outgoing arguments to a call, or a
3369/// set of incoming args of a call, contains an sret pointer that the callee
3370/// pops
3371template <typename T>
3372static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3373 const X86Subtarget &Subtarget) {
3374 // Not C++20 (yet), so no concepts available.
3375 static_assert(std::is_same<T, ISD::OutputArg>::value ||
3376 std::is_same<T, ISD::InputArg>::value,
3377 "requires ISD::OutputArg or ISD::InputArg");
3378
3379 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3380 // for most compilations.
3381 if (!Subtarget.is32Bit())
3382 return false;
3383
3384 if (Args.empty())
3385 return false;
3386
3387 // Most calls do not have an sret argument, check the arg next.
3388 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3389 if (!Flags.isSRet() || Flags.isInReg())
3390 return false;
3391
3392 // The MSVC ABI does not pop the sret.
3393 if (Subtarget.getTargetTriple().isOSMSVCRT())
3394 return false;
3395
3396 // MCUs don't pop the sret
3397 if (Subtarget.isTargetMCU())
3398 return false;
3399
3400 // Callee pops argument
3401 return true;
3402}
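The case this predicate detects, as a minimal hypothetical source-level example: on 32-bit non-MSVC, non-MCU targets the callee pops the hidden sret pointer itself (a "ret $4"), so the caller must not also clean it up.

struct BigSketch { int V[4]; };
BigSketch makeBigSketch();       // returned indirectly via a hidden sret slot
void useBigSketch() {
  BigSketch B = makeBigSketch(); // caller passes &B; callee pops that pointer
  (void)B;
}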
3403
3404/// Make a copy of an aggregate at address specified by "Src" to address
3405/// "Dst" with size and alignment information specified by the specific
3406/// parameter attribute. The copy will be passed as a byval function parameter.
3407static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3408 SDValue Chain, ISD::ArgFlagsTy Flags,
3409 SelectionDAG &DAG, const SDLoc &dl) {
3410 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3411
3412 return DAG.getMemcpy(
3413 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3414 /*isVolatile*/ false, /*AlwaysInline=*/true,
3415 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3416}
3417
3418/// Return true if the calling convention is one that we can guarantee TCO for.
3419static bool canGuaranteeTCO(CallingConv::ID CC) {
3420 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3421 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3422 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3423 CC == CallingConv::SwiftTail);
3424}
3425
3426/// Return true if we might ever do TCO for calls with this calling convention.
3427static bool mayTailCallThisCC(CallingConv::ID CC) {
3428 switch (CC) {
3429 // C calling conventions:
3430 case CallingConv::C:
3431 case CallingConv::Win64:
3432 case CallingConv::X86_64_SysV:
3433 // Callee pop conventions:
3434 case CallingConv::X86_ThisCall:
3435 case CallingConv::X86_StdCall:
3436 case CallingConv::X86_VectorCall:
3437 case CallingConv::X86_FastCall:
3438 // Swift:
3439 case CallingConv::Swift:
3440 return true;
3441 default:
3442 return canGuaranteeTCO(CC);
3443 }
3444}
3445
3446/// Return true if the function is being made into a tailcall target by
3447/// changing its ABI.
3448static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3449 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3450 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3451}
3452
3453bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3454 if (!CI->isTailCall())
3455 return false;
3456
3457 CallingConv::ID CalleeCC = CI->getCallingConv();
3458 if (!mayTailCallThisCC(CalleeCC))
3459 return false;
3460
3461 return true;
3462}
3463
3464SDValue
3465X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3466 const SmallVectorImpl<ISD::InputArg> &Ins,
3467 const SDLoc &dl, SelectionDAG &DAG,
3468 const CCValAssign &VA,
3469 MachineFrameInfo &MFI, unsigned i) const {
3470 // Create the nodes corresponding to a load from this parameter slot.
3471 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3472 bool AlwaysUseMutable = shouldGuaranteeTCO(
3473 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3474 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3475 EVT ValVT;
3476 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3477
3478 // If the value is passed by pointer, we have the address passed instead of the value
3479 // itself. No need to extend if the mask value and location share the same
3480 // absolute size.
3481 bool ExtendedInMem =
3482 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3483 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3484
3485 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3486 ValVT = VA.getLocVT();
3487 else
3488 ValVT = VA.getValVT();
3489
3490 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3491 // changed with more analysis.
3492 // In case of tail call optimization, mark all arguments mutable, since they
3493 // could be overwritten by the lowering of arguments in case of a tail call.
3494 if (Flags.isByVal()) {
3495 unsigned Bytes = Flags.getByValSize();
3496 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3497
3498 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3499 // can be improved with deeper analysis.
3500 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3501 /*isAliased=*/true);
3502 return DAG.getFrameIndex(FI, PtrVT);
3503 }
3504
3505 EVT ArgVT = Ins[i].ArgVT;
3506
3507 // If this is a vector that has been split into multiple parts, and the
3508 // scalar size of the parts doesn't match the vector element size, then we can't
3509 // elide the copy. The parts will have padding between them instead of being
3510 // packed like a vector.
3511 bool ScalarizedAndExtendedVector =
3512 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3513 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3514
3515 // This is an argument in memory. We might be able to perform copy elision.
3516 // If the argument is passed directly in memory without any extension, then we
3517 // can perform copy elision. Large vector types, for example, may be passed
3518 // indirectly by pointer.
3519 if (Flags.isCopyElisionCandidate() &&
3520 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3521 !ScalarizedAndExtendedVector) {
3522 SDValue PartAddr;
3523 if (Ins[i].PartOffset == 0) {
3524 // If this is a one-part value or the first part of a multi-part value,
3525 // create a stack object for the entire argument value type and return a
3526 // load from our portion of it. This assumes that if the first part of an
3527 // argument is in memory, the rest will also be in memory.
3528 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3529 /*IsImmutable=*/false);
3530 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3531 return DAG.getLoad(
3532 ValVT, dl, Chain, PartAddr,
3533 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3534 } else {
3535 // This is not the first piece of an argument in memory. See if there is
3536 // already a fixed stack object including this offset. If so, assume it
3537 // was created by the PartOffset == 0 branch above and create a load from
3538 // the appropriate offset into it.
3539 int64_t PartBegin = VA.getLocMemOffset();
3540 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3541 int FI = MFI.getObjectIndexBegin();
3542 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3543 int64_t ObjBegin = MFI.getObjectOffset(FI);
3544 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3545 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3546 break;
3547 }
3548 if (MFI.isFixedObjectIndex(FI)) {
3549 SDValue Addr =
3550 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3551 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3552 return DAG.getLoad(
3553 ValVT, dl, Chain, Addr,
3554 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3555 Ins[i].PartOffset));
3556 }
3557 }
3558 }
3559
3560 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3561 VA.getLocMemOffset(), isImmutable);
3562
3563 // Set SExt or ZExt flag.
3564 if (VA.getLocInfo() == CCValAssign::ZExt) {
3565 MFI.setObjectZExt(FI, true);
3566 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3567 MFI.setObjectSExt(FI, true);
3568 }
3569
3570 MaybeAlign Alignment;
3571 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3572 ValVT != MVT::f80)
3573 Alignment = MaybeAlign(4);
3574 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3575 SDValue Val = DAG.getLoad(
3576 ValVT, dl, Chain, FIN,
3577 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3578 Alignment);
3579 return ExtendedInMem
3580 ? (VA.getValVT().isVector()
3581 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3582 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3583 : Val;
3584}
3585
3586// FIXME: Get this from tablegen.
3587static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3588 const X86Subtarget &Subtarget) {
3589 assert(Subtarget.is64Bit());
3590
3591 if (Subtarget.isCallingConvWin64(CallConv)) {
3592 static const MCPhysReg GPR64ArgRegsWin64[] = {
3593 X86::RCX, X86::RDX, X86::R8, X86::R9
3594 };
3595 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3596 }
3597
3598 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3599 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3600 };
3601 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3602}
3603
3604// FIXME: Get this from tablegen.
3605static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3606 CallingConv::ID CallConv,
3607 const X86Subtarget &Subtarget) {
3608 assert(Subtarget.is64Bit());
3609 if (Subtarget.isCallingConvWin64(CallConv)) {
3610 // The XMM registers which might contain var arg parameters are shadowed
3611 // in their paired GPR. So we only need to save the GPR to their home
3612 // slots.
3613 // TODO: __vectorcall will change this.
3614 return None;
3615 }
3616
3617 bool isSoftFloat = Subtarget.useSoftFloat();
3618 if (isSoftFloat || !Subtarget.hasSSE1())
3619 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3620 // registers.
3621 return None;
3622
3623 static const MCPhysReg XMMArgRegs64Bit[] = {
3624 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3625 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3626 };
3627 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3628}
3629
3630#ifndef NDEBUG
3631static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3632 return llvm::is_sorted(
3633 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3634 return A.getValNo() < B.getValNo();
3635 });
3636}
3637#endif
3638
3639namespace {
3640/// This is a helper class for lowering variable arguments parameters.
3641class VarArgsLoweringHelper {
3642public:
3643 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3644 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3645 CallingConv::ID CallConv, CCState &CCInfo)
3646 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3647 TheMachineFunction(DAG.getMachineFunction()),
3648 TheFunction(TheMachineFunction.getFunction()),
3649 FrameInfo(TheMachineFunction.getFrameInfo()),
3650 FrameLowering(*Subtarget.getFrameLowering()),
3651 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3652 CCInfo(CCInfo) {}
3653
3654 // Lower variable arguments parameters.
3655 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3656
3657private:
3658 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3659
3660 void forwardMustTailParameters(SDValue &Chain);
3661
3662 bool is64Bit() const { return Subtarget.is64Bit(); }
3663 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3664
3665 X86MachineFunctionInfo *FuncInfo;
3666 const SDLoc &DL;
3667 SelectionDAG &DAG;
3668 const X86Subtarget &Subtarget;
3669 MachineFunction &TheMachineFunction;
3670 const Function &TheFunction;
3671 MachineFrameInfo &FrameInfo;
3672 const TargetFrameLowering &FrameLowering;
3673 const TargetLowering &TargLowering;
3674 CallingConv::ID CallConv;
3675 CCState &CCInfo;
3676};
3677} // namespace
3678
3679void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3680 SDValue &Chain, unsigned StackSize) {
3681 // If the function takes a variable number of arguments, make a frame index for
3682 // the start of the first vararg value... for expansion of llvm.va_start. We
3683 // can skip this if there are no va_start calls.
3684 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3685 CallConv != CallingConv::X86_ThisCall)) {
3686 FuncInfo->setVarArgsFrameIndex(
3687 FrameInfo.CreateFixedObject(1, StackSize, true));
3688 }
3689
3690 // 64-bit calling conventions support varargs and register parameters, so we
3691 // have to do extra work to spill them in the prologue.
3692 if (is64Bit()) {
3693 // Find the first unallocated argument registers.
3694 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3695 ArrayRef<MCPhysReg> ArgXMMs =
3696 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3697 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3698 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3699
3700 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3701 "SSE register cannot be used when SSE is disabled!");
3702
3703 if (isWin64()) {
3704 // Get to the caller-allocated home save location. Add 8 to account
3705 // for the return address.
3706 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3707 FuncInfo->setRegSaveFrameIndex(
3708 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3709 // Fix up the vararg frame index to point at the shadow area (4 x i64).
3710 if (NumIntRegs < 4)
3711 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3712 } else {
3713 // For X86-64, if there are vararg parameters that are passed via
3714 // registers, then we must store them to their spots on the stack so
3715 // they may be loaded by dereferencing the result of va_next.
3716 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3717 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3718 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3719 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3720 }
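// For example, with the SysV register sets above (6 GPRs, 8 XMMs), the
// register save area is 6 * 8 + 8 * 16 = 176 bytes. If two integer and one
// FP argument were already allocated to registers, VarArgsGPOffset is 16 and
// VarArgsFPOffset is 48 + 16 = 64, the gp_offset/fp_offset values that
// va_start later stores into the va_list. On Win64, the caller-allocated
// 4 x 8-byte home area handled above plays the same role instead.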
3721
3722 SmallVector<SDValue, 6>
3723 LiveGPRs; // list of SDValue for GPR registers keeping live input value
3724 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
3725 // keeping live input value
3726 SDValue ALVal; // if applicable keeps SDValue for %al register
3727
3728 // Gather all the live in physical registers.
3729 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3730 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3731 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3732 }
3733 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3734 if (!AvailableXmms.empty()) {
3735 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3736 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3737 for (MCPhysReg Reg : AvailableXmms) {
3738 // FastRegisterAllocator spills virtual registers at basic
3739 // block boundaries. That leads to uses of XMM registers
3740 // outside of the check for %al. Pass physical registers to
3741 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3742 TheMachineFunction.getRegInfo().addLiveIn(Reg);
3743 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3744 }
3745 }
3746
3747 // Store the integer parameter registers.
3748 SmallVector<SDValue, 8> MemOps;
3749 SDValue RSFIN =
3750 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3751 TargLowering.getPointerTy(DAG.getDataLayout()));
3752 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3753 for (SDValue Val : LiveGPRs) {
3754 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3755 TargLowering.getPointerTy(DAG.getDataLayout()),
3756 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3757 SDValue Store =
3758 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3759 MachinePointerInfo::getFixedStack(
3760 DAG.getMachineFunction(),
3761 FuncInfo->getRegSaveFrameIndex(), Offset));
3762 MemOps.push_back(Store);
3763 Offset += 8;
3764 }
3765
3766 // Now store the XMM (fp + vector) parameter registers.
3767 if (!LiveXMMRegs.empty()) {
3768 SmallVector<SDValue, 12> SaveXMMOps;
3769 SaveXMMOps.push_back(Chain);
3770 SaveXMMOps.push_back(ALVal);
3771 SaveXMMOps.push_back(RSFIN);
3772 SaveXMMOps.push_back(
3773 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3774 llvm::append_range(SaveXMMOps, LiveXMMRegs);
3775 MachineMemOperand *StoreMMO =
3776 DAG.getMachineFunction().getMachineMemOperand(
3777 MachinePointerInfo::getFixedStack(
3778 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
3779 Offset),
3780 MachineMemOperand::MOStore, 128, Align(16));
3781 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
3782 DL, DAG.getVTList(MVT::Other),
3783 SaveXMMOps, MVT::i8, StoreMMO));
3784 }
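// A rough sketch of what the pseudo above expands to later (in its custom
// inserter): if %al is non-zero, the remaining XMM argument registers are
// stored into the FP part of the register save area at RSFIN + FPOffset;
// if %al is zero, the stores are skipped entirely.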
3785
3786 if (!MemOps.empty())
3787 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3788 }
3789}
3790
3791void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3792 // Find the largest legal vector type.
3793 MVT VecVT = MVT::Other;
3794 // FIXME: Only some x86_32 calling conventions support AVX512.
3795 if (Subtarget.useAVX512Regs() &&
3796 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3797 CallConv == CallingConv::Intel_OCL_BI)))
3798 VecVT = MVT::v16f32;
3799 else if (Subtarget.hasAVX())
3800 VecVT = MVT::v8f32;
3801 else if (Subtarget.hasSSE2())
3802 VecVT = MVT::v4f32;
3803
3804 // We forward some GPRs and some vector types.
3805 SmallVector<MVT, 2> RegParmTypes;
3806 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3807 RegParmTypes.push_back(IntVT);
3808 if (VecVT != MVT::Other)
3809 RegParmTypes.push_back(VecVT);
3810
3811 // Compute the set of forwarded registers. The rest are scratch.
3812 SmallVectorImpl<ForwardedRegister> &Forwards =
3813 FuncInfo->getForwardedMustTailRegParms();
3814 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3815
3816 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3817 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3818 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3819 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3820 }
3821
3822 // Copy all forwards from physical to virtual registers.
3823 for (ForwardedRegister &FR : Forwards) {
3824 // FIXME: Can we use a less constrained schedule?
3825 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3826 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3827 TargLowering.getRegClassFor(FR.VT));
3828 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3829 }
3830}
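// For example, in a SysV x86-64 vararg function compiled with AVX that
// contains a musttail call, the still-unallocated argument GPRs, the YMM
// argument registers (modelled as v8f32 here) and %al are each copied into
// fresh virtual registers above, so the later tail call can pass them
// through unchanged.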
3831
3832void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3833 unsigned StackSize) {
3834 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3835 // If necessary, it will be set to the correct value later.
3836 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3837 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3838
3839 if (FrameInfo.hasVAStart())
3840 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3841
3842 if (FrameInfo.hasMustTailInVarArgFunc())
3843 forwardMustTailParameters(Chain);
3844}
3845
3846SDValue X86TargetLowering::LowerFormalArguments(
3847 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3848 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3849 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3850 MachineFunction &MF = DAG.getMachineFunction();
3851 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3852
3853 const Function &F = MF.getFunction();
3854 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3855 F.getName() == "main")
3856 FuncInfo->setForceFramePointer(true);
3857
3858 MachineFrameInfo &MFI = MF.getFrameInfo();
3859 bool Is64Bit = Subtarget.is64Bit();
3860 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3861
3862 assert(
3863 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3864 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3865
3866 // Assign locations to all of the incoming arguments.
3867 SmallVector<CCValAssign, 16> ArgLocs;
3868 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3869
3870 // Allocate shadow area for Win64.
3871 if (IsWin64)
3872 CCInfo.AllocateStack(32, Align(8));
3873
3874 CCInfo.AnalyzeArguments(Ins, CC_X86);
3875
3876 // In vectorcall calling convention a second pass is required for the HVA
3877 // types.
3878 if (CallingConv::X86_VectorCall == CallConv) {
3879 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3880 }
3881
3882 // The next loop assumes that the locations are in the same order as the
3883 // input arguments.
3884 assert(isSortedByValueNo(ArgLocs) &&
3885 "Argument Location list must be sorted before lowering");
3886
3887 SDValue ArgValue;
3888 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3889 ++I, ++InsIndex) {
3890 assert(InsIndex < Ins.size() && "Invalid Ins index");
3891 CCValAssign &VA = ArgLocs[I];
3892
3893 if (VA.isRegLoc()) {
3894 EVT RegVT = VA.getLocVT();
3895 if (VA.needsCustom()) {
3896 assert(
3897 VA.getValVT() == MVT::v64i1 &&
3898 "Currently the only custom case is when we split v64i1 to 2 regs");
3899
3900 // v64i1 values, in regcall calling convention, that are
3901 // compiled to 32 bit arch, are split up into two registers.
3902 ArgValue =
3903 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3904 } else {
3905 const TargetRegisterClass *RC;
3906 if (RegVT == MVT::i8)
3907 RC = &X86::GR8RegClass;
3908 else if (RegVT == MVT::i16)
3909 RC = &X86::GR16RegClass;
3910 else if (RegVT == MVT::i32)
3911 RC = &X86::GR32RegClass;
3912 else if (Is64Bit && RegVT == MVT::i64)
3913 RC = &X86::GR64RegClass;
3914 else if (RegVT == MVT::f16)
3915 RC = &X86::FR16XRegClass;
3916 else if (RegVT == MVT::f32)
3917 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3918 else if (RegVT == MVT::f64)
3919 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3920 else if (RegVT == MVT::f80)
3921 RC = &X86::RFP80RegClass;
3922 else if (RegVT == MVT::f128)
3923 RC = &X86::VR128RegClass;
3924 else if (RegVT.is512BitVector())
3925 RC = &X86::VR512RegClass;
3926 else if (RegVT.is256BitVector())
3927 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3928 else if (RegVT.is128BitVector())
3929 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3930 else if (RegVT == MVT::x86mmx)
3931 RC = &X86::VR64RegClass;
3932 else if (RegVT == MVT::v1i1)
3933 RC = &X86::VK1RegClass;
3934 else if (RegVT == MVT::v8i1)
3935 RC = &X86::VK8RegClass;
3936 else if (RegVT == MVT::v16i1)
3937 RC = &X86::VK16RegClass;
3938 else if (RegVT == MVT::v32i1)
3939 RC = &X86::VK32RegClass;
3940 else if (RegVT == MVT::v64i1)
3941 RC = &X86::VK64RegClass;
3942 else
3943 llvm_unreachable("Unknown argument type!");
3944
3945 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3946 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3947 }
3948
3949 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3950 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3951 // right size.
3952 if (VA.getLocInfo() == CCValAssign::SExt)
3953 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3954 DAG.getValueType(VA.getValVT()));
3955 else if (VA.getLocInfo() == CCValAssign::ZExt)
3956 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3957 DAG.getValueType(VA.getValVT()));
3958 else if (VA.getLocInfo() == CCValAssign::BCvt)
3959 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3960
3961 if (VA.isExtInLoc()) {
3962 // Handle MMX values passed in XMM regs.
3963 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3964 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3965 else if (VA.getValVT().isVector() &&
3966 VA.getValVT().getScalarType() == MVT::i1 &&
3967 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3968 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3969 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3970 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3971 } else
3972 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3973 }
3974 } else {
3975 assert(VA.isMemLoc());
3976 ArgValue =
3977 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3978 }
3979
3980 // If value is passed via pointer - do a load.
3981 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3982 ArgValue =
3983 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3984
3985 InVals.push_back(ArgValue);
3986 }
3987
3988 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3989 if (Ins[I].Flags.isSwiftAsync()) {
3990 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3991 if (Subtarget.is64Bit())
3992 X86FI->setHasSwiftAsyncContext(true);
3993 else {
3994 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3995 X86FI->setSwiftAsyncContextFrameIdx(FI);
3996 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3997 DAG.getFrameIndex(FI, MVT::i32),
3998 MachinePointerInfo::getFixedStack(MF, FI));
3999 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4000 }
4001 }
4002
4003 // Swift calling convention does not require we copy the sret argument
4004 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4005 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4006 continue;
4007
4008 // All x86 ABIs require that for returning structs by value we copy the
4009 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4010 // the argument into a virtual register so that we can access it from the
4011 // return points.
4012 if (Ins[I].Flags.isSRet()) {
4013 assert(!FuncInfo->getSRetReturnReg() &&
4014 "SRet return has already been set");
4015 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4016 Register Reg =
4017 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4018 FuncInfo->setSRetReturnReg(Reg);
4019 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4020 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4021 break;
4022 }
4023 }
4024
4025 unsigned StackSize = CCInfo.getNextStackOffset();
4026 // Align stack specially for tail calls.
4027 if (shouldGuaranteeTCO(CallConv,
4028 MF.getTarget().Options.GuaranteedTailCallOpt))
4029 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4030
4031 if (IsVarArg)
4032 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4033 .lowerVarArgsParameters(Chain, StackSize);
4034
4035 // Some CCs need callee pop.
4036 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4037 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4038 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4039 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4040 // X86 interrupts must pop the error code (and the alignment padding) if
4041 // present.
4042 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4043 } else {
4044 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4045 // If this is an sret function, the return should pop the hidden pointer.
4046 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4047 FuncInfo->setBytesToPopOnReturn(4);
4048 }
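// For example, a 32-bit stdcall function taking 12 bytes of arguments sets
// BytesToPopOnReturn to 12 (it returns with `ret $12`), while a cdecl
// function normally pops nothing and, on targets where the hidden sret
// pointer is callee-popped, pops just those 4 bytes.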
4049
4050 if (!Is64Bit) {
4051 // RegSaveFrameIndex is X86-64 only.
4052 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4053 }
4054
4055 FuncInfo->setArgumentStackSize(StackSize);
4056
4057 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4058 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4059 if (Personality == EHPersonality::CoreCLR) {
4060 assert(Is64Bit);
4061 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4062 // that we'd prefer this slot be allocated towards the bottom of the frame
4063 // (i.e. near the stack pointer after allocating the frame). Every
4064 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4065 // offset from the bottom of this and each funclet's frame must be the
4066 // same, so the size of funclets' (mostly empty) frames is dictated by
4067 // how far this slot is from the bottom (since they allocate just enough
4068 // space to accommodate holding this slot at the correct offset).
4069 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4070 EHInfo->PSPSymFrameIdx = PSPSymFI;
4071 }
4072 }
4073
4074 if (CallConv == CallingConv::X86_RegCall ||
4075 F.hasFnAttribute("no_caller_saved_registers")) {
4076 MachineRegisterInfo &MRI = MF.getRegInfo();
4077 for (std::pair<Register, Register> Pair : MRI.liveins())
4078 MRI.disableCalleeSavedRegister(Pair.first);
4079 }
4080
4081 return Chain;
4082}
4083
4084SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4085 SDValue Arg, const SDLoc &dl,
4086 SelectionDAG &DAG,
4087 const CCValAssign &VA,
4088 ISD::ArgFlagsTy Flags,
4089 bool isByVal) const {
4090 unsigned LocMemOffset = VA.getLocMemOffset();
4091 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4092 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4093 StackPtr, PtrOff);
4094 if (isByVal)
4095 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4096
4097 MaybeAlign Alignment;
4098 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4099 Arg.getSimpleValueType() != MVT::f80)
4100 Alignment = MaybeAlign(4);
4101 return DAG.getStore(
4102 Chain, dl, Arg, PtrOff,
4103 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4104 Alignment);
4105}
4106
4107/// Emit a load of return address if tail call
4108/// optimization is performed and it is required.
4109SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4110 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4111 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4112 // Adjust the Return address stack slot.
4113 EVT VT = getPointerTy(DAG.getDataLayout());
4114 OutRetAddr = getReturnAddressFrameIndex(DAG);
4115
4116 // Load the "old" Return address.
4117 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4118 return SDValue(OutRetAddr.getNode(), 1);
4119}
4120
4121/// Emit a store of the return address if tail call
4122/// optimization is performed and it is required (FPDiff!=0).
4123static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4124 SDValue Chain, SDValue RetAddrFrIdx,
4125 EVT PtrVT, unsigned SlotSize,
4126 int FPDiff, const SDLoc &dl) {
4127 // Store the return address to the appropriate stack slot.
4128 if (!FPDiff) return Chain;
4129 // Calculate the new stack slot for the return address.
4130 int NewReturnAddrFI =
4131 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4132 false);
4133 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4134 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4135 MachinePointerInfo::getFixedStack(
4136 DAG.getMachineFunction(), NewReturnAddrFI));
4137 return Chain;
4138}
4139
4140 /// Returns a vector_shuffle mask for a movs{s|d} or movd
4141/// operation of specified width.
4142static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4143 SDValue V2) {
4144 unsigned NumElems = VT.getVectorNumElements();
4145 SmallVector<int, 8> Mask;
4146 Mask.push_back(NumElems);
4147 for (unsigned i = 1; i != NumElems; ++i)
4148 Mask.push_back(i);
4149 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4150}
4151
4152SDValue
4153X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4154 SmallVectorImpl<SDValue> &InVals) const {
4155 SelectionDAG &DAG = CLI.DAG;
4156 SDLoc &dl = CLI.DL;
4157 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4158 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4159 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4160 SDValue Chain = CLI.Chain;
4161 SDValue Callee = CLI.Callee;
4162 CallingConv::ID CallConv = CLI.CallConv;
4163 bool &isTailCall = CLI.IsTailCall;
4164 bool isVarArg = CLI.IsVarArg;
4165 const auto *CB = CLI.CB;
4166
4167 MachineFunction &MF = DAG.getMachineFunction();
4168 bool Is64Bit = Subtarget.is64Bit();
4169 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4170 bool IsSibcall = false;
4171 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4172 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4173 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4174 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4175 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4176 CB->hasFnAttr("no_caller_saved_registers"));
4177 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4178 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4179 const Module *M = MF.getMMI().getModule();
4180 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4181
4182 MachineFunction::CallSiteInfo CSInfo;
4183 if (CallConv == CallingConv::X86_INTR)
4184 report_fatal_error("X86 interrupts may not be called directly");
4185
4186 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4187 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4188 // If we are using a GOT, disable tail calls to external symbols with
4189 // default visibility. Tail calling such a symbol requires using a GOT
4190 // relocation, which forces early binding of the symbol. This breaks code
4191 // that requires lazy function symbol resolution. Using musttail or
4192 // GuaranteedTailCallOpt will override this.
4193 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4194 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4195 G->getGlobal()->hasDefaultVisibility()))
4196 isTailCall = false;
4197 }
4198
4199 if (isTailCall && !IsMustTail) {
4200 // Check if it's really possible to do a tail call.
4201 isTailCall = IsEligibleForTailCallOptimization(
4202 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4203 Ins, DAG);
4204
4205 // Sibcalls are automatically detected tailcalls which do not require
4206 // ABI changes.
4207 if (!IsGuaranteeTCO && isTailCall)
4208 IsSibcall = true;
4209
4210 if (isTailCall)
4211 ++NumTailCalls;
4212 }
4213
4214 if (IsMustTail && !isTailCall)
4215 report_fatal_error("failed to perform tail call elimination on a call "
4216 "site marked musttail");
4217
4218 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4219 "Var args not supported with calling convention fastcc, ghc or hipe");
4220
4221 // Analyze operands of the call, assigning locations to each operand.
4222 SmallVector<CCValAssign, 16> ArgLocs;
4223 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4224
4225 // Allocate shadow area for Win64.
4226 if (IsWin64)
4227 CCInfo.AllocateStack(32, Align(8));
4228
4229 CCInfo.AnalyzeArguments(Outs, CC_X86);
4230
4231 // In vectorcall calling convention a second pass is required for the HVA
4232 // types.
4233 if (CallingConv::X86_VectorCall == CallConv) {
4234 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4235 }
4236
4237 // Get a count of how many bytes are to be pushed on the stack.
4238 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4239 if (IsSibcall)
4240 // This is a sibcall. The memory operands are available in caller's
4241 // own caller's stack.
4242 NumBytes = 0;
4243 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4244 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4245
4246 int FPDiff = 0;
4247 if (isTailCall &&
4248 shouldGuaranteeTCO(CallConv,
4249 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4250 // Lower arguments at fp - stackoffset + fpdiff.
4251 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4252
4253 FPDiff = NumBytesCallerPushed - NumBytes;
4254
4255 // Set the delta of movement of the returnaddr stackslot.
4256 // But only update it if the new delta is smaller (more negative) than the previous one.
4257 if (FPDiff < X86Info->getTCReturnAddrDelta())
4258 X86Info->setTCReturnAddrDelta(FPDiff);
4259 }
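// For example, if the caller was entered with 16 bytes of stack arguments
// (BytesToPopOnReturn == 16) and this tail call needs NumBytes == 48, then
// FPDiff == -32: the callee needs 32 more bytes of argument space, so the
// return address slot has to move, and the most negative delta seen so far
// is what gets recorded in TCReturnAddrDelta.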
4260
4261 unsigned NumBytesToPush = NumBytes;
4262 unsigned NumBytesToPop = NumBytes;
4263
4264 // If we have an inalloca argument, all stack space has already been allocated
4265 // for us and is right at the top of the stack. We don't support multiple
4266 // arguments passed in memory when using inalloca.
4267 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4268 NumBytesToPush = 0;
4269 if (!ArgLocs.back().isMemLoc())
4270 report_fatal_error("cannot use inalloca attribute on a register "
4271 "parameter");
4272 if (ArgLocs.back().getLocMemOffset() != 0)
4273 report_fatal_error("any parameter with the inalloca attribute must be "
4274 "the only memory argument");
4275 } else if (CLI.IsPreallocated) {
4276 assert(ArgLocs.back().isMemLoc() &&
4277 "cannot use preallocated attribute on a register "
4278 "parameter");
4279 SmallVector<size_t, 4> PreallocatedOffsets;
4280 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4281 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4282 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4283 }
4284 }
4285 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4286 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4287 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4288 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4289 NumBytesToPush = 0;
4290 }
4291
4292 if (!IsSibcall && !IsMustTail)
4293 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4294 NumBytes - NumBytesToPush, dl);
4295
4296 SDValue RetAddrFrIdx;
4297 // Load return address for tail calls.
4298 if (isTailCall && FPDiff)
4299 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4300 Is64Bit, FPDiff, dl);
4301
4302 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4303 SmallVector<SDValue, 8> MemOpChains;
4304 SDValue StackPtr;
4305
4306 // The next loop assumes that the locations are in the same order as the
4307 // input arguments.
4308 assert(isSortedByValueNo(ArgLocs) &&
4309 "Argument Location list must be sorted before lowering");
4310
4311 // Walk the register/memloc assignments, inserting copies/loads. In the case
4312 // of tail call optimization, arguments are handled later.
4313 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4314 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4315 ++I, ++OutIndex) {
4316 assert(OutIndex < Outs.size() && "Invalid Out index");
4317 // Skip inalloca/preallocated arguments, they have already been written.
4318 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4319 if (Flags.isInAlloca() || Flags.isPreallocated())
4320 continue;
4321
4322 CCValAssign &VA = ArgLocs[I];
4323 EVT RegVT = VA.getLocVT();
4324 SDValue Arg = OutVals[OutIndex];
4325 bool isByVal = Flags.isByVal();
4326
4327 // Promote the value if needed.
4328 switch (VA.getLocInfo()) {
4329 default: llvm_unreachable("Unknown loc info!");
4330 case CCValAssign::Full: break;
4331 case CCValAssign::SExt:
4332 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4333 break;
4334 case CCValAssign::ZExt:
4335 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4336 break;
4337 case CCValAssign::AExt:
4338 if (Arg.getValueType().isVector() &&
4339 Arg.getValueType().getVectorElementType() == MVT::i1)
4340 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4341 else if (RegVT.is128BitVector()) {
4342 // Special case: passing MMX values in XMM registers.
4343 Arg = DAG.getBitcast(MVT::i64, Arg);
4344 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4345 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4346 } else
4347 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4348 break;
4349 case CCValAssign::BCvt:
4350 Arg = DAG.getBitcast(RegVT, Arg);
4351 break;
4352 case CCValAssign::Indirect: {
4353 if (isByVal) {
4354 // Memcpy the argument to a temporary stack slot to prevent
4355 // the caller from seeing any modifications the callee may make
4356 // as guaranteed by the `byval` attribute.
4357 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4358 Flags.getByValSize(),
4359 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4360 SDValue StackSlot =
4361 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4362 Chain =
4363 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4364 // From now on treat this as a regular pointer
4365 Arg = StackSlot;
4366 isByVal = false;
4367 } else {
4368 // Store the argument.
4369 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4370 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4371 Chain = DAG.getStore(
4372 Chain, dl, Arg, SpillSlot,
4373 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4374 Arg = SpillSlot;
4375 }
4376 break;
4377 }
4378 }
4379
4380 if (VA.needsCustom()) {
4381 assert(VA.getValVT() == MVT::v64i1 &&
4382 "Currently the only custom case is when we split v64i1 to 2 regs");
4383 // Split v64i1 value into two registers
4384 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4385 } else if (VA.isRegLoc()) {
4386 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4387 const TargetOptions &Options = DAG.getTarget().Options;
4388 if (Options.EmitCallSiteInfo)
4389 CSInfo.emplace_back(VA.getLocReg(), I);
4390 if (isVarArg && IsWin64) {
4391 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4392 // shadow reg if callee is a varargs function.
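// For example, a double passed in XMM1 to a varargs callee is also copied
// into RDX, its paired GPR, so va_arg code that only walks the GPR home
// area still finds the value.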
4393 Register ShadowReg;
4394 switch (VA.getLocReg()) {
4395 case X86::XMM0: ShadowReg = X86::RCX; break;
4396 case X86::XMM1: ShadowReg = X86::RDX; break;
4397 case X86::XMM2: ShadowReg = X86::R8; break;
4398 case X86::XMM3: ShadowReg = X86::R9; break;
4399 }
4400 if (ShadowReg)
4401 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4402 }
4403 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4404 assert(VA.isMemLoc());
4405 if (!StackPtr.getNode())
4406 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4407 getPointerTy(DAG.getDataLayout()));
4408 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4409 dl, DAG, VA, Flags, isByVal));
4410 }
4411 }
4412
4413 if (!MemOpChains.empty())
4414 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4415
4416 if (Subtarget.isPICStyleGOT()) {
4417 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4418 // GOT pointer (except regcall).
4419 if (!isTailCall) {
4420 // An indirect call with the RegCall calling convention may use up all the
4421 // general registers, so it is not suitable to reserve the EBX register for
4422 // the GOT address; just let the register allocator handle it.
4423 if (CallConv != CallingConv::X86_RegCall)
4424 RegsToPass.push_back(std::make_pair(
4425 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4426 getPointerTy(DAG.getDataLayout()))));
4427 } else {
4428 // If we are tail calling and generating PIC/GOT style code load the
4429 // address of the callee into ECX. The value in ecx is used as target of
4430 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4431 // for tail calls on PIC/GOT architectures. Normally we would just put the
4432 // address of GOT into ebx and then call target@PLT. But for tail calls
4433 // ebx would be restored (since ebx is callee saved) before jumping to the
4434 // target@PLT.
4435
4436 // Note: The actual moving to ECX is done further down.
4437 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4438 if (G && !G->getGlobal()->hasLocalLinkage() &&
4439 G->getGlobal()->hasDefaultVisibility())
4440 Callee = LowerGlobalAddress(Callee, DAG);
4441 else if (isa<ExternalSymbolSDNode>(Callee))
4442 Callee = LowerExternalSymbol(Callee, DAG);
4443 }
4444 }
4445
4446 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4447 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4448 // From AMD64 ABI document:
4449 // For calls that may call functions that use varargs or stdargs
4450 // (prototype-less calls or calls to functions containing ellipsis (...) in
4451 // the declaration) %al is used as a hidden argument to specify the number
4452 // of SSE registers used. The contents of %al do not need to match exactly
4453 // the number of registers, but must be an upper bound on the number of SSE
4454 // registers used and must be in the range 0 - 8 inclusive.
4455
4456 // Count the number of XMM registers allocated.
4457 static const MCPhysReg XMMArgRegs[] = {
4458 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4459 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4460 };
4461 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4462 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4463 && "SSE registers cannot be used when SSE is disabled");
4464 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4465 DAG.getConstant(NumXMMRegs, dl,
4466 MVT::i8)));
4467 }
4468
4469 if (isVarArg && IsMustTail) {
4470 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4471 for (const auto &F : Forwards) {
4472 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4473 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4474 }
4475 }
4476
4477 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4478 // don't need this because the eligibility check rejects calls that require
4479 // shuffling arguments passed in memory.
4480 if (!IsSibcall && isTailCall) {
4481 // Force all the incoming stack arguments to be loaded from the stack
4482 // before any new outgoing arguments are stored to the stack, because the
4483 // outgoing stack slots may alias the incoming argument stack slots, and
4484 // the alias isn't otherwise explicit. This is slightly more conservative
4485 // than necessary, because it means that each store effectively depends
4486 // on every argument instead of just those arguments it would clobber.
4487 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4488
4489 SmallVector<SDValue, 8> MemOpChains2;
4490 SDValue FIN;
4491 int FI = 0;
4492 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4493 ++I, ++OutsIndex) {
4494 CCValAssign &VA = ArgLocs[I];
4495
4496 if (VA.isRegLoc()) {
4497 if (VA.needsCustom()) {
4498 assert((CallConv == CallingConv::X86_RegCall) &&
4499 "Expecting custom case only in regcall calling convention");
4500 // This means that we are in special case where one argument was
4501 // passed through two register locations - Skip the next location
4502 ++I;
4503 }
4504
4505 continue;
4506 }
4507
4508 assert(VA.isMemLoc());
4509 SDValue Arg = OutVals[OutsIndex];
4510 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4511 // Skip inalloca/preallocated arguments. They don't require any work.
4512 if (Flags.isInAlloca() || Flags.isPreallocated())
4513 continue;
4514 // Create frame index.
4515 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4516 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4517 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4518 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4519
4520 if (Flags.isByVal()) {
4521 // Copy relative to framepointer.
4522 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4523 if (!StackPtr.getNode())
4524 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4525 getPointerTy(DAG.getDataLayout()));
4526 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4527 StackPtr, Source);
4528
4529 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4530 ArgChain,
4531 Flags, DAG, dl));
4532 } else {
4533 // Store relative to framepointer.
4534 MemOpChains2.push_back(DAG.getStore(
4535 ArgChain, dl, Arg, FIN,
4536 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4537 }
4538 }
4539
4540 if (!MemOpChains2.empty())
4541 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4542
4543 // Store the return address to the appropriate stack slot.
4544 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4545 getPointerTy(DAG.getDataLayout()),
4546 RegInfo->getSlotSize(), FPDiff, dl);
4547 }
4548
4549 // Build a sequence of copy-to-reg nodes chained together with token chain
4550 // and flag operands which copy the outgoing args into registers.
4551 SDValue InFlag;
4552 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4553 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4554 RegsToPass[i].second, InFlag);
4555 InFlag = Chain.getValue(1);
4556 }
4557
4558 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4559 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4560 // In the 64-bit large code model, we have to make all calls
4561 // through a register, since the call instruction's 32-bit
4562 // pc-relative offset may not be large enough to hold the whole
4563 // address.
4564 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4565 Callee->getOpcode() == ISD::ExternalSymbol) {
4566 // Lower direct calls to global addresses and external symbols. Setting
4567 // ForCall to true here has the effect of removing WrapperRIP when possible
4568 // to allow direct calls to be selected without first materializing the
4569 // address into a register.
4570 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4571 } else if (Subtarget.isTarget64BitILP32() &&
4572 Callee.getValueType() == MVT::i32) {
4573 // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI.
4574 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4575 }
4576
4577 // Returns a chain & a flag for retval copy to use.
4578 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4579 SmallVector<SDValue, 8> Ops;
4580
4581 if (!IsSibcall && isTailCall && !IsMustTail) {
4582 Chain = DAG.getCALLSEQ_END(Chain,
4583 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4584 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4585 InFlag = Chain.getValue(1);
4586 }
4587
4588 Ops.push_back(Chain);
4589 Ops.push_back(Callee);
4590
4591 if (isTailCall)
4592 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4593
4594 // Add argument registers to the end of the list so that they are known live
4595 // into the call.
4596 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4597 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4598 RegsToPass[i].second.getValueType()));
4599
4600 // Add a register mask operand representing the call-preserved registers.
4601 const uint32_t *Mask = [&]() {
4602 auto AdaptedCC = CallConv;
4603 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4604 // use X86_INTR calling convention because it has the same CSR mask
4605 // (same preserved registers).
4606 if (HasNCSR)
4607 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4608 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4609 // to use the CSR_NoRegs_RegMask.
4610 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4611 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4612 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4613 }();
4614 assert(Mask && "Missing call preserved mask for calling convention");
4615
4616 // If this is an invoke in a 32-bit function using a funclet-based
4617 // personality, assume the function clobbers all registers. If an exception
4618 // is thrown, the runtime will not restore CSRs.
4619 // FIXME: Model this more precisely so that we can register allocate across
4620 // the normal edge and spill and fill across the exceptional edge.
4621 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4622 const Function &CallerFn = MF.getFunction();
4623 EHPersonality Pers =
4624 CallerFn.hasPersonalityFn()
4625 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4626 : EHPersonality::Unknown;
4627 if (isFuncletEHPersonality(Pers))
4628 Mask = RegInfo->getNoPreservedMask();
4629 }
4630
4631 // Define a new register mask from the existing mask.
4632 uint32_t *RegMask = nullptr;
4633
4634 // In some calling conventions we need to remove the used physical registers
4635 // from the reg mask.
4636 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4637 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4638
4639 // Allocate a new Reg Mask and copy Mask.
4640 RegMask = MF.allocateRegMask();
4641 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4642 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4643
4644 // Make sure all sub registers of the argument registers are reset
4645 // in the RegMask.
4646 for (auto const &RegPair : RegsToPass)
4647 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4648 SubRegs.isValid(); ++SubRegs)
4649 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
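// For example, a register numbered 57 lives in RegMask word 57 / 32 == 1 at
// bit 57 % 32 == 25; clearing that bit marks the register (and, via the
// loop, each of its sub-registers) as clobbered by this call instead of
// preserved.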
4650
4651 // Create the RegMask Operand according to our updated mask.
4652 Ops.push_back(DAG.getRegisterMask(RegMask));
4653 } else {
4654 // Create the RegMask Operand according to the static mask.
4655 Ops.push_back(DAG.getRegisterMask(Mask));
4656 }
4657
4658 if (InFlag.getNode())
4659 Ops.push_back(InFlag);
4660
4661 if (isTailCall) {
4662 // We used to do:
4663 //// If this is the first return lowered for this function, add the regs
4664 //// to the liveout set for the function.
4665 // This isn't right, although it's probably harmless on x86; liveouts
4666 // should be computed from returns not tail calls. Consider a void
4667 // function making a tail call to a function returning int.
4668 MF.getFrameInfo().setHasTailCall();
4669 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4670 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4671 return Ret;
4672 }
4673
4674 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4675 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4676 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4677 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4678 // expanded to the call, directly followed by a special marker sequence and
4679 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4680 assert(!isTailCall &&
4681 "tail calls cannot be marked with clang.arc.attachedcall");
4682 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4683
4684 // Add a target global address for the retainRV/claimRV runtime function
4685 // just before the call target.
4686 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4687 auto PtrVT = getPointerTy(DAG.getDataLayout());
4688 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4689 Ops.insert(Ops.begin() + 1, GA);
4690 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4691 } else {
4692 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4693 }
4694
4695 InFlag = Chain.getValue(1);
4696 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4697 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4698
4699 // Save heapallocsite metadata.
4700 if (CLI.CB)
4701 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4702 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4703
4704 // Create the CALLSEQ_END node.
4705 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
4706 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4707 DAG.getTarget().Options.GuaranteedTailCallOpt))
4708 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4709 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
4710 // If this call passes a struct-return pointer, the callee
4711 // pops that struct pointer.
4712 NumBytesForCalleeToPop = 4;
4713
4714 // Returns a flag for retval copy to use.
4715 if (!IsSibcall) {
4716 Chain = DAG.getCALLSEQ_END(Chain,
4717 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4718 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4719 true),
4720 InFlag, dl);
4721 InFlag = Chain.getValue(1);
4722 }
4723
4724 // Handle result values, copying them out of physregs into vregs that we
4725 // return.
4726 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4727 InVals, RegMask);
4728}
4729
4730//===----------------------------------------------------------------------===//
4731// Fast Calling Convention (tail call) implementation
4732//===----------------------------------------------------------------------===//
4733
4734 // Like stdcall, the callee cleans up the arguments, except that ECX is
4735 // reserved for storing the tail-called function's address. Only 2 registers are
4736// free for argument passing (inreg). Tail call optimization is performed
4737// provided:
4738// * tailcallopt is enabled
4739// * caller/callee are fastcc
4740// On X86_64 architecture with GOT-style position independent code only local
4741// (within module) calls are supported at the moment.
4742 // To keep the stack aligned according to the platform ABI, the function
4743 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4744 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
4745// If a tail called function callee has more arguments than the caller the
4746// caller needs to make sure that there is room to move the RETADDR to. This is
4747// achieved by reserving an area the size of the argument delta right after the
4748// original RETADDR, but before the saved framepointer or the spilled registers
4749// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4750// stack layout:
4751// arg1
4752// arg2
4753// RETADDR
4754// [ new RETADDR
4755// move area ]
4756// (possible EBP)
4757// ESI
4758// EDI
4759// local1 ..
4760
4761/// Align the stack size so that it is, e.g., 16n + 12 for a 16-byte alignment
4762/// requirement (so that adding the return-address slot keeps the stack aligned).
4763unsigned
4764X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4765 SelectionDAG &DAG) const {
4766 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4767 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4768 assert(StackSize % SlotSize == 0 &&
4769 "StackSize must be a multiple of SlotSize");
4770 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4771}
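
As a standalone sketch of the arithmetic above (not LLVM code; alignedArgStackSize is a hypothetical helper that assumes a power-of-two stack alignment and re-implements alignTo inline), the following program shows why a 4-byte slot and 16-byte alignment always produce a size of the form 16n + 12:

#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical standalone version of the computation above: round
// StackSize + SlotSize up to StackAlignment, then subtract SlotSize so that
// pushing the (moved) return address leaves the stack aligned.
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlignment) {
  assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize");
  uint64_t Rounded =
      (StackSize + SlotSize + StackAlignment - 1) & ~(StackAlignment - 1);
  return Rounded - SlotSize;
}

int main() {
  // With SlotSize = 4 and a 16-byte alignment every result is 16n + 12.
  for (uint64_t Size = 0; Size <= 48; Size += 4)
    std::cout << Size << " -> " << alignedArgStackSize(Size, 4, 16) << '\n';
  return 0;
}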
4772
4773/// Return true if the given stack call argument is already available in the
4774/// same position (relatively) of the caller's incoming argument stack.
4775static
4776bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4777 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4778 const X86InstrInfo *TII, const CCValAssign &VA) {
4779 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4780
4781 for (;;) {
4782 // Look through nodes that don't alter the bits of the incoming value.
4783 unsigned Op = Arg.getOpcode();
4784 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4785 Arg = Arg.getOperand(0);
4786 continue;
4787 }
4788 if (Op == ISD::TRUNCATE) {
4789 const SDValue &TruncInput = Arg.getOperand(0);
4790 if (TruncInput.getOpcode() == ISD::AssertZext &&
4791 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4792 Arg.getValueType()) {
4793 Arg = TruncInput.getOperand(0);
4794 continue;
4795 }
4796 }
4797 break;
4798 }
4799
4800 int FI = INT_MAX;
4801 if (Arg.getOpcode() == ISD::CopyFromReg) {
4802 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4803 if (!VR.isVirtual())
4804 return false;
4805 MachineInstr *Def = MRI->getVRegDef(VR);
4806 if (!Def)
4807 return false;
4808 if (!Flags.isByVal()) {
4809 if (!TII->isLoadFromStackSlot(*Def, FI))
4810 return false;
4811 } else {
4812 unsigned Opcode = Def->getOpcode();
4813 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4814 Opcode == X86::LEA64_32r) &&
4815 Def->getOperand(1).isFI()) {
4816 FI = Def->getOperand(1).getIndex();
4817 Bytes = Flags.getByValSize();
4818 } else
4819 return false;
4820 }
4821 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4822 if (Flags.isByVal())
4823 // ByVal argument is passed in as a pointer but it's now being
4824 // dereferenced. e.g.
4825 // define @foo(%struct.X* %A) {
4826 // tail call @bar(%struct.X* byval %A)
4827 // }
4828 return false;
4829 SDValue Ptr = Ld->getBasePtr();
4830 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4831 if (!FINode)
4832 return false;
4833 FI = FINode->getIndex();
4834 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4835 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4836 FI = FINode->getIndex();
4837 Bytes = Flags.getByValSize();
4838 } else
4839 return false;
4840
4841 assert(FI != INT_MAX);
4842 if (!MFI.isFixedObjectIndex(FI))
4843 return false;
4844
4845 if (Offset != MFI.getObjectOffset(FI))
4846 return false;
4847
4848 // If this is not byval, check that the argument stack object is immutable.
4849 // inalloca and argument copy elision can create mutable argument stack
4850 // objects. Byval objects can be mutated, but a byval call intends to pass the
4851 // mutated memory.
4852 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4853 return false;
4854
4855 if (VA.getLocVT().getFixedSizeInBits() >
4856 Arg.getValueSizeInBits().getFixedSize()) {
4857 // If the argument location is wider than the argument type, check that any
4858 // extension flags match.
4859 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4860 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4861 return false;
4862 }
4863 }
4864
4865 return Bytes == MFI.getObjectSize(FI);
4866}
4867
4868/// Check whether the call is eligible for tail call optimization. Targets
4869/// that want to do tail call optimization should implement this function.
4870bool X86TargetLowering::IsEligibleForTailCallOptimization(
4871 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
4872 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
4873 const SmallVectorImpl<SDValue> &OutVals,
4874 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4875 if (!mayTailCallThisCC(CalleeCC))
4876 return false;
4877
4878 // If -tailcallopt is specified, make fastcc functions tail-callable.
4879 MachineFunction &MF = DAG.getMachineFunction();
4880 const Function &CallerF = MF.getFunction();
4881
4882 // If the function return type is x86_fp80 and the callee return type is not,
4883 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4884 // perform a tailcall optimization here.
4885 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4886 return false;
4887
4888 CallingConv::ID CallerCC = CallerF.getCallingConv();
4889 bool CCMatch = CallerCC == CalleeCC;
4890 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4891 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4892 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4893 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4894
4895 // Win64 functions have extra shadow space for argument homing. Don't do the
4896 // sibcall if the caller and callee have mismatched expectations for this
4897 // space.
4898 if (IsCalleeWin64 != IsCallerWin64)
4899 return false;
4900
4901 if (IsGuaranteeTCO) {
4902 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4903 return true;
4904 return false;
4905 }
4906
4907 // Look for obvious safe cases to perform tail call optimization that do not
4908 // require ABI changes. This is what gcc calls sibcall.
4909
4910 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4911 // emit a special epilogue.
4912 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4913 if (RegInfo->hasStackRealignment(MF))
4914 return false;
4915
4916 // Also avoid sibcall optimization if we're an sret return fn and the callee
4917 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
4918 // insufficient.
4919 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
4920 // For a compatible tail call the callee must return our sret pointer. So it
4921 // needs to be (a) an sret function itself and (b) we pass our sret as its
4922 // sret. Condition #b is harder to determine.
4923 return false;
4924 } else if (IsCalleePopSRet)
4925 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
4926 // expect that.
4927 return false;
4928
4929 // Do not sibcall optimize vararg calls unless all arguments are passed via
4930 // registers.
4931 LLVMContext &C = *DAG.getContext();
4932 if (isVarArg && !Outs.empty()) {
4933 // Optimizing for varargs on Win64 is unlikely to be safe without
4934 // additional testing.
4935 if (IsCalleeWin64 || IsCallerWin64)
4936 return false;
4937
4938 SmallVector<CCValAssign, 16> ArgLocs;
4939 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4940
4941 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4942 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4943 if (!ArgLocs[i].isRegLoc())
4944 return false;
4945 }
4946
4947 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4948 // stack. Therefore, if it's not used by the call it is not safe to optimize
4949 // this into a sibcall.
4950 bool Unused = false;
4951 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4952 if (!Ins[i].Used) {
4953 Unused = true;
4954 break;
4955 }
4956 }
4957 if (Unused) {
4958 SmallVector<CCValAssign, 16> RVLocs;
4959 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4960 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4961 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4962 CCValAssign &VA = RVLocs[i];
4963 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4964 return false;
4965 }
4966 }
4967
4968 // Check that the call results are passed in the same way.
4969 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4970 RetCC_X86, RetCC_X86))
4971 return false;
4972 // The callee has to preserve all registers the caller needs to preserve.
4973 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4974 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4975 if (!CCMatch) {
4976 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4977 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4978 return false;
4979 }
4980
4981 unsigned StackArgsSize = 0;
4982
4983 // If the callee takes no arguments then go on to check the results of the
4984 // call.
4985 if (!Outs.empty()) {
4986 // Check if stack adjustment is needed. For now, do not do this if any
4987 // argument is passed on the stack.
4988 SmallVector<CCValAssign, 16> ArgLocs;
4989 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4990
4991 // Allocate shadow area for Win64
4992 if (IsCalleeWin64)
4993 CCInfo.AllocateStack(32, Align(8));
4994
4995 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4996 StackArgsSize = CCInfo.getNextStackOffset();
4997
4998 if (CCInfo.getNextStackOffset()) {
4999 // Check if the arguments are already laid out in the right way as
5000 // the caller's fixed stack objects.
5001 MachineFrameInfo &MFI = MF.getFrameInfo();
5002 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5003 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5004 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5005 CCValAssign &VA = ArgLocs[i];
5006 SDValue Arg = OutVals[i];
5007 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5008 if (VA.getLocInfo() == CCValAssign::Indirect)
5009 return false;
5010 if (!VA.isRegLoc()) {
5011 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5012 MFI, MRI, TII, VA))
5013 return false;
5014 }
5015 }
5016 }
5017
5018 bool PositionIndependent = isPositionIndependent();
5019 // If the tailcall address may be in a register, then make sure it's
5020 // possible to register allocate for it. In 32-bit, the call address can
5021 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5022 // callee-saved registers are restored. These happen to be the same
5023 // registers used to pass 'inreg' arguments so watch out for those.
5024 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5025 !isa<ExternalSymbolSDNode>(Callee)) ||
5026 PositionIndependent)) {
5027 unsigned NumInRegs = 0;
5028 // In PIC we need an extra register to formulate the address computation
5029 // for the callee.
5030 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5031
5032 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5033 CCValAssign &VA = ArgLocs[i];
5034 if (!VA.isRegLoc())
5035 continue;
5036 Register Reg = VA.getLocReg();
5037 switch (Reg) {
5038 default: break;
5039 case X86::EAX: case X86::EDX: case X86::ECX:
5040 if (++NumInRegs == MaxInRegs)
5041 return false;
5042 break;
5043 }
5044 }
5045 }
5046
5047 const MachineRegisterInfo &MRI = MF.getRegInfo();
5048 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5049 return false;
5050 }
5051
5052 bool CalleeWillPop =
5053 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5054 MF.getTarget().Options.GuaranteedTailCallOpt);
5055
5056 if (unsigned BytesToPop =
5057 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5058 // If we have bytes to pop, the callee must pop them.
5059 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5060 if (!CalleePopMatches)
5061 return false;
5062 } else if (CalleeWillPop && StackArgsSize > 0) {
5063 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5064 return false;
5065 }
5066
5067 return true;
5068}
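
A minimal sketch of the final callee-pop compatibility check above, restated as a standalone predicate (calleePopCompatible is a hypothetical name, not an LLVM API):

#include <iostream>

// Hypothetical restatement of the check above: if the caller's own return
// already pops bytes, the callee must pop exactly the same amount; if it pops
// nothing, the callee must not pop stack arguments either.
static bool calleePopCompatible(bool CalleeWillPop, unsigned BytesToPop,
                                unsigned StackArgsSize) {
  if (BytesToPop)
    return CalleeWillPop && BytesToPop == StackArgsSize;
  return !(CalleeWillPop && StackArgsSize > 0);
}

int main() {
  std::cout << calleePopCompatible(true, 8, 8) << '\n';  // 1: amounts match
  std::cout << calleePopCompatible(true, 8, 4) << '\n';  // 0: mismatch
  std::cout << calleePopCompatible(true, 0, 4) << '\n';  // 0: unexpected pop
  std::cout << calleePopCompatible(false, 0, 4) << '\n'; // 1: nobody pops
  return 0;
}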
5069
5070FastISel *
5071X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5072 const TargetLibraryInfo *libInfo) const {
5073 return X86::createFastISel(funcInfo, libInfo);
5074}
5075
5076//===----------------------------------------------------------------------===//
5077// Other Lowering Hooks
5078//===----------------------------------------------------------------------===//
5079
5080bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5081 bool AssumeSingleUse) {
5082 if (!AssumeSingleUse && !Op.hasOneUse())
5083 return false;
5084 if (!ISD::isNormalLoad(Op.getNode()))
5085 return false;
5086
5087 // If this is an unaligned vector, make sure the target supports folding it.
5088 auto *Ld = cast<LoadSDNode>(Op.getNode());
5089 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5090 Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
5091 return false;
5092
5093 // TODO: If this is a non-temporal load and the target has an instruction
5094 // for it, it should not be folded. See "useNonTemporalLoad()".
5095
5096 return true;
5097}
5098
5099bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5100 const X86Subtarget &Subtarget,
5101 bool AssumeSingleUse) {
5102 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5103 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5104 return false;
5105
5106 // We can not replace a wide volatile load with a broadcast-from-memory,
5107 // because that would narrow the load, which isn't legal for volatiles.
5108 auto *Ld = cast<LoadSDNode>(Op.getNode());
5109 return !Ld->isVolatile() ||
5110 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5111}
5112
5113bool X86::mayFoldIntoStore(SDValue Op) {
5114 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5115}
5116
5117bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5118 if (Op.hasOneUse()) {
5119 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5120 return (ISD::ZERO_EXTEND == Opcode);
5121 }
5122 return false;
5123}
5124
5125static bool isTargetShuffle(unsigned Opcode) {
5126 switch(Opcode) {
5127 default: return false;
5128 case X86ISD::BLENDI:
5129 case X86ISD::PSHUFB:
5130 case X86ISD::PSHUFD:
5131 case X86ISD::PSHUFHW:
5132 case X86ISD::PSHUFLW:
5133 case X86ISD::SHUFP:
5134 case X86ISD::INSERTPS:
5135 case X86ISD::EXTRQI:
5136 case X86ISD::INSERTQI:
5137 case X86ISD::VALIGN:
5138 case X86ISD::PALIGNR:
5139 case X86ISD::VSHLDQ:
5140 case X86ISD::VSRLDQ:
5141 case X86ISD::MOVLHPS:
5142 case X86ISD::MOVHLPS:
5143 case X86ISD::MOVSHDUP:
5144 case X86ISD::MOVSLDUP:
5145 case X86ISD::MOVDDUP:
5146 case X86ISD::MOVSS:
5147 case X86ISD::MOVSD:
5148 case X86ISD::MOVSH:
5149 case X86ISD::UNPCKL:
5150 case X86ISD::UNPCKH:
5151 case X86ISD::VBROADCAST:
5152 case X86ISD::VPERMILPI:
5153 case X86ISD::VPERMILPV:
5154 case X86ISD::VPERM2X128:
5155 case X86ISD::SHUF128:
5156 case X86ISD::VPERMIL2:
5157 case X86ISD::VPERMI:
5158 case X86ISD::VPPERM:
5159 case X86ISD::VPERMV:
5160 case X86ISD::VPERMV3:
5161 case X86ISD::VZEXT_MOVL:
5162 return true;
5163 }
5164}
5165
5166static bool isTargetShuffleVariableMask(unsigned Opcode) {
5167 switch (Opcode) {
5168 default: return false;
5169 // Target Shuffles.
5170 case X86ISD::PSHUFB:
5171 case X86ISD::VPERMILPV:
5172 case X86ISD::VPERMIL2:
5173 case X86ISD::VPPERM:
5174 case X86ISD::VPERMV:
5175 case X86ISD::VPERMV3:
5176 return true;
5177 // 'Faux' Target Shuffles.
5178 case ISD::OR:
5179 case ISD::AND:
5180 case X86ISD::ANDNP:
5181 return true;
5182 }
5183}
5184
5185static bool isTargetShuffleSplat(SDValue Op) {
5186 unsigned Opcode = Op.getOpcode();
5187 if (Opcode == ISD::EXTRACT_SUBVECTOR)
5188 return isTargetShuffleSplat(Op.getOperand(0));
5189 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
5190}
5191
5192SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5193 MachineFunction &MF = DAG.getMachineFunction();
5194 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5195 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5196 int ReturnAddrIndex = FuncInfo->getRAIndex();
5197
5198 if (ReturnAddrIndex == 0) {
5199 // Set up a frame object for the return address.
5200 unsigned SlotSize = RegInfo->getSlotSize();
5201 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5202 -(int64_t)SlotSize,
5203 false);
5204 FuncInfo->setRAIndex(ReturnAddrIndex);
5205 }
5206
5207 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5208}
5209
5210bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5211 bool hasSymbolicDisplacement) {
5212 // Offset should fit into 32 bit immediate field.
5213 if (!isInt<32>(Offset))
5214 return false;
5215
5216 // If we don't have a symbolic displacement - we don't have any extra
5217 // restrictions.
5218 if (!hasSymbolicDisplacement)
5219 return true;
5220
5221 // FIXME: Some tweaks might be needed for medium code model.
5222 if (M != CodeModel::Small && M != CodeModel::Kernel)
5223 return false;
5224
5225 // For the small code model we assume that the last object ends at least 16MB
5226 // before the 2^31 boundary. We may also accept fairly large negative constants,
5227 // knowing that all objects are in the positive half of the address space.
5228 if (M == CodeModel::Small && Offset < 16*1024*1024)
5229 return true;
5230
5231 // For the kernel code model we know that all objects reside in the negative
5232 // half of the 32-bit address space. We may not accept negative offsets, since
5233 // they may be just out of range, but we may accept fairly large positive ones.
5234 if (M == CodeModel::Kernel && Offset >= 0)
5235 return true;
5236
5237 return false;
5238}
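
As a rough standalone sketch of the decision above when a symbolic displacement is present (the enum and function names are hypothetical; the real code also handles the no-symbol case and rejects other code models):

#include <cstdint>
#include <iostream>

// Hypothetical mirror of the symbolic-displacement checks above: the offset
// must fit a signed 32-bit immediate; the small model additionally requires
// it to stay below 16MB, the kernel model requires it to be non-negative.
enum class Model { Small, Kernel };

static bool offsetOkWithSymbol(int64_t Offset, Model M) {
  if (Offset < INT32_MIN || Offset > INT32_MAX)
    return false;
  if (M == Model::Small)
    return Offset < 16 * 1024 * 1024;
  return Offset >= 0; // Model::Kernel
}

int main() {
  std::cout << offsetOkWithSymbol(15 * 1024 * 1024, Model::Small) << '\n'; // 1
  std::cout << offsetOkWithSymbol(-4096, Model::Small) << '\n';            // 1
  std::cout << offsetOkWithSymbol(-1, Model::Kernel) << '\n';              // 0
  std::cout << offsetOkWithSymbol(1 << 20, Model::Kernel) << '\n';         // 1
  return 0;
}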
5239
5240/// Determines whether the callee is required to pop its own arguments.
5241/// Callee pop is necessary to support tail calls.
5242bool X86::isCalleePop(CallingConv::ID CallingConv,
5243 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5244 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5245 // can guarantee TCO.
5246 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5247 return true;
5248
5249 switch (CallingConv) {
5250 default:
5251 return false;
5252 case CallingConv::X86_StdCall:
5253 case CallingConv::X86_FastCall:
5254 case CallingConv::X86_ThisCall:
5255 case CallingConv::X86_VectorCall:
5256 return !is64Bit;
5257 }
5258}
5259
5260/// Return true if the condition is a signed comparison operation.
5261static bool isX86CCSigned(unsigned X86CC) {
5262 switch (X86CC) {
5263 default:
5264 llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5264)
;
5265 case X86::COND_E:
5266 case X86::COND_NE:
5267 case X86::COND_B:
5268 case X86::COND_A:
5269 case X86::COND_BE:
5270 case X86::COND_AE:
5271 return false;
5272 case X86::COND_G:
5273 case X86::COND_GE:
5274 case X86::COND_L:
5275 case X86::COND_LE:
5276 return true;
5277 }
5278}
5279
5280static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5281 switch (SetCCOpcode) {
5282 default: llvm_unreachable("Invalid integer condition!");
5283 case ISD::SETEQ: return X86::COND_E;
5284 case ISD::SETGT: return X86::COND_G;
5285 case ISD::SETGE: return X86::COND_GE;
5286 case ISD::SETLT: return X86::COND_L;
5287 case ISD::SETLE: return X86::COND_LE;
5288 case ISD::SETNE: return X86::COND_NE;
5289 case ISD::SETULT: return X86::COND_B;
5290 case ISD::SETUGT: return X86::COND_A;
5291 case ISD::SETULE: return X86::COND_BE;
5292 case ISD::SETUGE: return X86::COND_AE;
5293 }
5294}
5295
5296/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5297/// condition code, returning the condition code and the LHS/RHS of the
5298/// comparison to make.
5299static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5300 bool isFP, SDValue &LHS, SDValue &RHS,
5301 SelectionDAG &DAG) {
5302 if (!isFP) {
5303 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5304 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5305 // X > -1 -> X == 0, jump !sign.
5306 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5307 return X86::COND_NS;
5308 }
5309 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5310 // X < 0 -> X == 0, jump on sign.
5311 return X86::COND_S;
5312 }
5313 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5314 // X >= 0 -> X == 0, jump on !sign.
5315 return X86::COND_NS;
5316 }
5317 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5318 // X < 1 -> X <= 0
5319 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5320 return X86::COND_LE;
5321 }
5322 }
5323
5324 return TranslateIntegerX86CC(SetCCOpcode);
5325 }
5326
5327 // First determine if it is required or is profitable to flip the operands.
5328
5329 // If LHS is a foldable load, but RHS is not, flip the condition.
5330 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5331 !ISD::isNON_EXTLoad(RHS.getNode())) {
5332 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5333 std::swap(LHS, RHS);
5334 }
5335
5336 switch (SetCCOpcode) {
5337 default: break;
5338 case ISD::SETOLT:
5339 case ISD::SETOLE:
5340 case ISD::SETUGT:
5341 case ISD::SETUGE:
5342 std::swap(LHS, RHS);
5343 break;
5344 }
5345
5346 // On a floating point condition, the flags are set as follows:
5347 // ZF PF CF op
5348 // 0 | 0 | 0 | X > Y
5349 // 0 | 0 | 1 | X < Y
5350 // 1 | 0 | 0 | X == Y
5351 // 1 | 1 | 1 | unordered
5352 switch (SetCCOpcode) {
5353 default: llvm_unreachable("Condcode should be pre-legalized away");
5354 case ISD::SETUEQ:
5355 case ISD::SETEQ: return X86::COND_E;
5356 case ISD::SETOLT: // flipped
5357 case ISD::SETOGT:
5358 case ISD::SETGT: return X86::COND_A;
5359 case ISD::SETOLE: // flipped
5360 case ISD::SETOGE:
5361 case ISD::SETGE: return X86::COND_AE;
5362 case ISD::SETUGT: // flipped
5363 case ISD::SETULT:
5364 case ISD::SETLT: return X86::COND_B;
5365 case ISD::SETUGE: // flipped
5366 case ISD::SETULE:
5367 case ISD::SETLE: return X86::COND_BE;
5368 case ISD::SETONE:
5369 case ISD::SETNE: return X86::COND_NE;
5370 case ISD::SETUO: return X86::COND_P;
5371 case ISD::SETO: return X86::COND_NP;
5372 case ISD::SETOEQ:
5373 case ISD::SETUNE: return X86::COND_INVALID;
5374 }
5375}
5376
5377/// Is there a floating point cmov for the specific X86 condition code?
5378/// Current x86 isa includes the following FP cmov instructions:
5379/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5380static bool hasFPCMov(unsigned X86CC) {
5381 switch (X86CC) {
5382 default:
5383 return false;
5384 case X86::COND_B:
5385 case X86::COND_BE:
5386 case X86::COND_E:
5387 case X86::COND_P:
5388 case X86::COND_A:
5389 case X86::COND_AE:
5390 case X86::COND_NE:
5391 case X86::COND_NP:
5392 return true;
5393 }
5394}
5395
5396static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5397 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5398 VT.is512BitVector();
5399}
5400
5401bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5402 const CallInst &I,
5403 MachineFunction &MF,
5404 unsigned Intrinsic) const {
5405 Info.flags = MachineMemOperand::MONone;
5406 Info.offset = 0;
5407
5408 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5409 if (!IntrData) {
5410 switch (Intrinsic) {
5411 case Intrinsic::x86_aesenc128kl:
5412 case Intrinsic::x86_aesdec128kl:
5413 Info.opc = ISD::INTRINSIC_W_CHAIN;
5414 Info.ptrVal = I.getArgOperand(1);
5415 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5416 Info.align = Align(1);
5417 Info.flags |= MachineMemOperand::MOLoad;
5418 return true;
5419 case Intrinsic::x86_aesenc256kl:
5420 case Intrinsic::x86_aesdec256kl:
5421 Info.opc = ISD::INTRINSIC_W_CHAIN;
5422 Info.ptrVal = I.getArgOperand(1);
5423 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5424 Info.align = Align(1);
5425 Info.flags |= MachineMemOperand::MOLoad;
5426 return true;
5427 case Intrinsic::x86_aesencwide128kl:
5428 case Intrinsic::x86_aesdecwide128kl:
5429 Info.opc = ISD::INTRINSIC_W_CHAIN;
5430 Info.ptrVal = I.getArgOperand(0);
5431 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5432 Info.align = Align(1);
5433 Info.flags |= MachineMemOperand::MOLoad;
5434 return true;
5435 case Intrinsic::x86_aesencwide256kl:
5436 case Intrinsic::x86_aesdecwide256kl:
5437 Info.opc = ISD::INTRINSIC_W_CHAIN;
5438 Info.ptrVal = I.getArgOperand(0);
5439 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5440 Info.align = Align(1);
5441 Info.flags |= MachineMemOperand::MOLoad;
5442 return true;
5443 case Intrinsic::x86_atomic_bts:
5444 case Intrinsic::x86_atomic_btc:
5445 case Intrinsic::x86_atomic_btr: {
5446 Info.opc = ISD::INTRINSIC_W_CHAIN;
5447 Info.ptrVal = I.getArgOperand(0);
5448 unsigned Size = I.getType()->getScalarSizeInBits();
5449 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5450 Info.align = Align(Size);
5451 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5452 MachineMemOperand::MOVolatile;
5453 return true;
5454 }
5455 }
5456 return false;
5457 }
5458
5459 switch (IntrData->Type) {
5460 case TRUNCATE_TO_MEM_VI8:
5461 case TRUNCATE_TO_MEM_VI16:
5462 case TRUNCATE_TO_MEM_VI32: {
5463 Info.opc = ISD::INTRINSIC_VOID;
5464 Info.ptrVal = I.getArgOperand(0);
5465 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5466 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5467 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5468 ScalarVT = MVT::i8;
5469 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5470 ScalarVT = MVT::i16;
5471 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5472 ScalarVT = MVT::i32;
5473
5474 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5475 Info.align = Align(1);
5476 Info.flags |= MachineMemOperand::MOStore;
5477 break;
5478 }
5479 case GATHER:
5480 case GATHER_AVX2: {
5481 Info.opc = ISD::INTRINSIC_W_CHAIN;
5482 Info.ptrVal = nullptr;
5483 MVT DataVT = MVT::getVT(I.getType());
5484 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5485 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5486 IndexVT.getVectorNumElements());
5487 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5488 Info.align = Align(1);
5489 Info.flags |= MachineMemOperand::MOLoad;
5490 break;
5491 }
5492 case SCATTER: {
5493 Info.opc = ISD::INTRINSIC_VOID;
5494 Info.ptrVal = nullptr;
5495 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5496 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5497 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5498 IndexVT.getVectorNumElements());
5499 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5500 Info.align = Align(1);
5501 Info.flags |= MachineMemOperand::MOStore;
5502 break;
5503 }
5504 default:
5505 return false;
5506 }
5507
5508 return true;
5509}
5510
5511/// Returns true if the target can instruction select the
5512/// specified FP immediate natively. If false, the legalizer will
5513/// materialize the FP immediate as a load from a constant pool.
5514bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5515 bool ForCodeSize) const {
5516 for (const APFloat &FPImm : LegalFPImmediates)
5517 if (Imm.bitwiseIsEqual(FPImm))
5518 return true;
5519 return false;
5520}
5521
5522bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5523 ISD::LoadExtType ExtTy,
5524 EVT NewVT) const {
5525 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow")(static_cast <bool> (cast<LoadSDNode>(Load)->isSimple
() && "illegal to narrow") ? void (0) : __assert_fail
("cast<LoadSDNode>(Load)->isSimple() && \"illegal to narrow\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5525, __extension__
__PRETTY_FUNCTION__))
;
5526
5527 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5528 // relocation targets a movq or addq instruction: don't let the load shrink.
5529 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5530 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5531 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5532 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5533
5534 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5535 // those uses are extracted directly into a store, then the extract + store
5536 // can be store-folded. Therefore, it's probably not worth splitting the load.
5537 EVT VT = Load->getValueType(0);
5538 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5539 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5540 // Skip uses of the chain value. Result 0 of the node is the load value.
5541 if (UI.getUse().getResNo() != 0)
5542 continue;
5543
5544 // If this use is not an extract + store, it's probably worth splitting.
5545 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5546 UI->use_begin()->getOpcode() != ISD::STORE)
5547 return true;
5548 }
5549 // All non-chain uses are extract + store.
5550 return false;
5551 }
5552
5553 return true;
5554}
5555
5556/// Returns true if it is beneficial to convert a load of a constant
5557/// to just the constant itself.
5558bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5559 Type *Ty) const {
5560 assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) :
__assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5560, __extension__ __PRETTY_FUNCTION__))
;
5561
5562 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5563 if (BitSize == 0 || BitSize > 64)
5564 return false;
5565 return true;
5566}
5567
5568bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5569 // If we are using XMM registers in the ABI and the condition of the select is
5570 // a floating-point compare and we have blendv or conditional move, then it is
5571 // cheaper to select instead of doing a cross-register move and creating a
5572 // load that depends on the compare result.
5573 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5574 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5575}
5576
5577bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5578 // TODO: It might be a win to ease or lift this restriction, but the generic
5579 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5580 if (VT.isVector() && Subtarget.hasAVX512())
5581 return false;
5582
5583 return true;
5584}
5585
5586bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5587 SDValue C) const {
5588 // TODO: We handle scalars using custom code, but generic combining could make
5589 // that unnecessary.
5590 APInt MulC;
5591 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5592 return false;
5593
5594 // Find the type this will be legalized to. Otherwise we might prematurely
5595 // convert this to shl+add/sub and then still have to type legalize those ops.
5596 // Another choice would be to defer the decision for illegal types until
5597 // after type legalization. But constant splat vectors of i64 can't make it
5598 // through type legalization on 32-bit targets so we would need to special
5599 // case vXi64.
5600 while (getTypeAction(Context, VT) != TypeLegal)
5601 VT = getTypeToTransformTo(Context, VT);
5602
5603 // If vector multiply is legal, assume that's faster than shl + add/sub.
5604 // Multiply is a complex op with higher latency and lower throughput in
5605 // most implementations, sub-vXi32 vector multiplies are always fast,
5606 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
5607 // is always going to be slow.
5608 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5609 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5610 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5611 return false;
5612
5613 // shl+add, shl+sub, shl+add+neg
5614 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5615 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5616}
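
A minimal sketch of the final power-of-two test above on a plain 64-bit constant (isPow2 and decomposableMul are hypothetical helper names; the real code works on an APInt splat):

#include <cstdint>
#include <iostream>

// A multiply by C can be rewritten as shl+add / shl+sub / shl+add+neg when
// C+1, C-1, 1-C or -(C+1) is a power of two, e.g. 5 = 4+1, 7 = 8-1, -3 = 1-4.
static bool isPow2(int64_t V) { return V > 0 && (V & (V - 1)) == 0; }

static bool decomposableMul(int64_t C) {
  return isPow2(C + 1) || isPow2(C - 1) || isPow2(1 - C) || isPow2(-(C + 1));
}

int main() {
  for (int64_t C : {3, 5, 7, 9, 10, -3, -9, 22})
    std::cout << C << " -> " << decomposableMul(C) << '\n';
  return 0;
}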
5617
5618bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5619 unsigned Index) const {
5620 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5621 return false;
5622
5623 // Mask vectors support all subregister combinations and operations that
5624 // extract half of vector.
5625 if (ResVT.getVectorElementType() == MVT::i1)
5626 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5627 (Index == ResVT.getVectorNumElements()));
5628
5629 return (Index % ResVT.getVectorNumElements()) == 0;
5630}
5631
5632bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5633 unsigned Opc = VecOp.getOpcode();
5634
5635 // Assume target opcodes can't be scalarized.
5636 // TODO - do we have any exceptions?
5637 if (Opc >= ISD::BUILTIN_OP_END)
5638 return false;
5639
5640 // If the vector op is not supported, try to convert to scalar.
5641 EVT VecVT = VecOp.getValueType();
5642 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5643 return true;
5644
5645 // If the vector op is supported, but the scalar op is not, the transform may
5646 // not be worthwhile.
5647 EVT ScalarVT = VecVT.getScalarType();
5648 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5649}
5650
5651bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5652 bool) const {
5653 // TODO: Allow vectors?
5654 if (VT.isVector())
5655 return false;
5656 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5657}
5658
5659bool X86TargetLowering::isCheapToSpeculateCttz() const {
5660 // Speculate cttz only if we can directly use TZCNT.
5661 return Subtarget.hasBMI();
5662}
5663
5664bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5665 // Speculate ctlz only if we can directly use LZCNT.
5666 return Subtarget.hasLZCNT();
5667}
5668
5669bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
5670 return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
5671 (VT == MVT::f16 && Subtarget.hasFP16());
5672}
5673
5674bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
5675 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
5676 // expensive than a straight movsd. On the other hand, it's important to
5677 // shrink long double fp constant since fldt is very slow.
5678 return !Subtarget.hasSSE2() || VT == MVT::f80;
5679}
5680
5681bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
5682 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
5683 (VT == MVT::f32 && Subtarget.hasSSE1()) ||
5684 (VT == MVT::f16 && Subtarget.hasFP16());
5685}
5686
5687bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5688 const SelectionDAG &DAG,
5689 const MachineMemOperand &MMO) const {
5690 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5691 BitcastVT.getVectorElementType() == MVT::i1)
5692 return false;
5693
5694 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5695 return false;
5696
5697 // If both types are legal vectors, it's always ok to convert them.
5698 if (LoadVT.isVector() && BitcastVT.isVector() &&
5699 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5700 return true;
5701
5702 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5703}
5704
5705bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5706 const MachineFunction &MF) const {
5707 // Do not merge up to a float value size (128 bits) if the NoImplicitFloat
5708 // attribute is set.
5709 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
5710
5711 if (NoFloat) {
5712 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5713 return (MemVT.getSizeInBits() <= MaxIntSize);
5714 }
5715 // Make sure we don't merge greater than our preferred vector
5716 // width.
5717 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5718 return false;
5719
5720 return true;
5721}
5722
5723bool X86TargetLowering::isCtlzFast() const {
5724 return Subtarget.hasFastLZCNT();
5725}
5726
5727bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5728 const Instruction &AndI) const {
5729 return true;
5730}
5731
5732bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5733 EVT VT = Y.getValueType();
5734
5735 if (VT.isVector())
5736 return false;
5737
5738 if (!Subtarget.hasBMI())
5739 return false;
5740
5741 // There are only 32-bit and 64-bit forms for 'andn'.
5742 if (VT != MVT::i32 && VT != MVT::i64)
5743 return false;
5744
5745 return !isa<ConstantSDNode>(Y);
5746}
5747
5748bool X86TargetLowering::hasAndNot(SDValue Y) const {
5749 EVT VT = Y.getValueType();
5750
5751 if (!VT.isVector())
5752 return hasAndNotCompare(Y);
5753
5754 // Vector.
5755
5756 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5757 return false;
5758
5759 if (VT == MVT::v4i32)
5760 return true;
5761
5762 return Subtarget.hasSSE2();
5763}
5764
5765bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5766 return X.getValueType().isScalarInteger(); // 'bt'
5767}
5768
5769bool X86TargetLowering::
5770 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5771 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5772 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5773 SelectionDAG &DAG) const {
5774 // Does baseline recommend not to perform the fold by default?
5775 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5776 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5777 return false;
5778 // For scalars this transform is always beneficial.
5779 if (X.getValueType().isScalarInteger())
5780 return true;
5781 // If all the shift amounts are identical, then transform is beneficial even
5782 // with rudimentary SSE2 shifts.
5783 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5784 return true;
5786 // If we have AVX2 with its powerful shift operations, then it's also good.
5786 if (Subtarget.hasAVX2())
5787 return true;
5788 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5789 return NewShiftOpcode == ISD::SHL;
5790}
5791
5792bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5793 const SDNode *N, CombineLevel Level) const {
5794 assert(((N->getOpcode() == ISD::SHL &&(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5795 N->getOperand(0).getOpcode() == ISD::SRL) ||(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5796 (N->getOpcode() == ISD::SRL &&(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5797 N->getOperand(0).getOpcode() == ISD::SHL)) &&(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5798 "Expected shift-shift mask")(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
;
5799 EVT VT = N->getValueType(0);
5800 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5801 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5802 // Only fold if the shift values are equal - so it folds to AND.
5803 // TODO - we should fold if either is a non-uniform vector but we don't do
5804 // the fold for non-splats yet.
5805 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5806 }
5807 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5808}
5809
5810bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5811 EVT VT = Y.getValueType();
5812
5813 // For vectors, we don't have a preference, but we probably want a mask.
5814 if (VT.isVector())
5815 return false;
5816
5817 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5818 if (VT == MVT::i64 && !Subtarget.is64Bit())
5819 return false;
5820
5821 return true;
5822}
5823
5824bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5825 SDNode *N) const {
5826 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5827 !Subtarget.isOSWindows())
5828 return false;
5829 return true;
5830}
5831
5832bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5833 // Any legal vector type can be splatted more efficiently than
5834 // loading/spilling from memory.
5835 return isTypeLegal(VT);
5836}
5837
5838MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5839 MVT VT = MVT::getIntegerVT(NumBits);
5840 if (isTypeLegal(VT))
5841 return VT;
5842
5843 // PMOVMSKB can handle this.
5844 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5845 return MVT::v16i8;
5846
5847 // VPMOVMSKB can handle this.
5848 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5849 return MVT::v32i8;
5850
5851 // TODO: Allow 64-bit type for 32-bit target.
5852 // TODO: 512-bit types should be allowed, but make sure that those
5853 // cases are handled in combineVectorSizedSetCCEquality().
5854
5855 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5856}
5857
5858/// Val is the undef sentinel value or equal to the specified value.
5859static bool isUndefOrEqual(int Val, int CmpVal) {
5860 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5861}
5862
5863/// Return true if every element in Mask is the undef sentinel value or equal to
5864/// the specified value.
5865static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5866 return llvm::all_of(Mask, [CmpVal](int M) {
5867 return (M == SM_SentinelUndef) || (M == CmpVal);
5868 });
5869}
5870
5871/// Val is either the undef or zero sentinel value.
5872static bool isUndefOrZero(int Val) {
5873 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5874}
5875
5876/// Return true if every element in Mask, beginning from position Pos and ending
5877/// in Pos+Size is the undef sentinel value.
5878static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5879 return llvm::all_of(Mask.slice(Pos, Size),
5880 [](int M) { return M == SM_SentinelUndef; });
5881}
5882
5883/// Return true if the mask creates a vector whose lower half is undefined.
5884static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5885 unsigned NumElts = Mask.size();
5886 return isUndefInRange(Mask, 0, NumElts / 2);
5887}
5888
5889/// Return true if the mask creates a vector whose upper half is undefined.
5890static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5891 unsigned NumElts = Mask.size();
5892 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5893}
5894
5895/// Return true if Val falls within the specified range [Low, Hi).
5896static bool isInRange(int Val, int Low, int Hi) {
5897 return (Val >= Low && Val < Hi);
5898}
5899
5900/// Return true if the value of any element in Mask falls within the specified
5901/// range [Low, Hi).
5902static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5903 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5904}
5905
5906/// Return true if the value of any element in Mask is the zero sentinel value.
5907static bool isAnyZero(ArrayRef<int> Mask) {
5908 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5909}
5910
5911/// Return true if the value of any element in Mask is the zero or undef
5912/// sentinel values.
5913static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5914 return llvm::any_of(Mask, [](int M) {
5915 return M == SM_SentinelZero || M == SM_SentinelUndef;
5916 });
5917}
5918
5919/// Return true if Val is undef or if its value falls within the
5920/// specified range [Low, Hi).
5921static bool isUndefOrInRange(int Val, int Low, int Hi) {
5922 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5923}
5924
5925/// Return true if every element in Mask is undef or if its value
5926/// falls within the specified range [Low, Hi).
5927static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5928 return llvm::all_of(
5929 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5930}
5931
5932/// Return true if Val is undef, zero or if its value falls within the
5933/// specified range [Low, Hi).
5934static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5935 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5936}
5937
5938/// Return true if every element in Mask is undef, zero or if its value
5939/// falls within the specified range [Low, Hi).
5940static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5941 return llvm::all_of(
5942 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5943}
5944
5945/// Return true if every element in Mask, beginning
5946/// from position Pos and ending in Pos + Size, falls within the specified
5947/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5948static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5949 unsigned Size, int Low, int Step = 1) {
5950 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5951 if (!isUndefOrEqual(Mask[i], Low))
5952 return false;
5953 return true;
5954}
5955
5956/// Return true if every element in Mask, beginning
5957/// from position Pos and ending in Pos+Size, falls within the specified
5958/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), is undef, or is zero.
5959static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5960 unsigned Size, int Low,
5961 int Step = 1) {
5962 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5963 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5964 return false;
5965 return true;
5966}
5967
5968/// Return true if every element in Mask, beginning
5969/// from position Pos and ending in Pos+Size is undef or is zero.
5970static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5971 unsigned Size) {
5972 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
5973}
5974
5975/// Helper function to test whether a shuffle mask could be
5976/// simplified by widening the elements being shuffled.
5977///
5978/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5979/// leaves it in an unspecified state.
5980///
5981/// NOTE: This must handle normal vector shuffle masks and *target* vector
5982/// shuffle masks. The latter have the special property of a '-2' representing
5983/// a zero-ed lane of a vector.
5984static bool canWidenShuffleElements(ArrayRef<int> Mask,
5985 SmallVectorImpl<int> &WidenedMask) {
5986 WidenedMask.assign(Mask.size() / 2, 0);
5987 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5988 int M0 = Mask[i];
5989 int M1 = Mask[i + 1];
5990
5991 // If both elements are undef, it's trivial.
5992 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5993 WidenedMask[i / 2] = SM_SentinelUndef;
5994 continue;
5995 }
5996
5997 // Check for an undef mask and a mask value properly aligned to fit with
5998 // a pair of values. If we find such a case, use the non-undef mask's value.
5999 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6000 WidenedMask[i / 2] = M1 / 2;
6001 continue;
6002 }
6003 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6004 WidenedMask[i / 2] = M0 / 2;
6005 continue;
6006 }
6007
6008 // When zeroing, we need to spread the zeroing across both lanes to widen.
6009 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6010 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6011 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6012 WidenedMask[i / 2] = SM_SentinelZero;
6013 continue;
6014 }
6015 return false;
6016 }
6017
6018 // Finally check if the two mask values are adjacent and aligned with
6019 // a pair.
6020 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6021 WidenedMask[i / 2] = M0 / 2;
6022 continue;
6023 }
6024
6025 // Otherwise we can't safely widen the elements used in this shuffle.
6026 return false;
6027 }
6028 assert(WidenedMask.size() == Mask.size() / 2 &&
6029 "Incorrect size of mask after widening the elements!");
6030
6031 return true;
6032}
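
For intuition, here is a rough standalone version of the pairing logic above, using -1 for the undef sentinel and -2 for the zero sentinel (widenMask is a hypothetical name, not the LLVM routine, and it ignores the negative-one-element edge cases):

#include <iostream>
#include <vector>

// Pairs of adjacent mask elements are merged into one wider element when they
// are both undef, both zeroable, or form an aligned (even, even+1) pair.
static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 < 0 && M1 < 0) {                 // only sentinels in this pair
      Out[i / 2] = (M0 == -2 || M1 == -2) ? -2 : -1;
      continue;
    }
    if (M0 == -1 && M1 >= 0 && (M1 % 2) == 1) { Out[i / 2] = M1 / 2; continue; }
    if (M1 == -1 && M0 >= 0 && (M0 % 2) == 0) { Out[i / 2] = M0 / 2; continue; }
    if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) { Out[i / 2] = M0 / 2; continue; }
    return false;                           // elements do not pair up
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  std::cout << widenMask({0, 1, 6, 7}, Wide) << '\n';  // 1 -> widened to {0, 3}
  std::cout << widenMask({-1, 3, 4, 5}, Wide) << '\n'; // 1 -> widened to {1, 2}
  std::cout << widenMask({1, 0, 2, 3}, Wide) << '\n';  // 0 -> {1, 0} cannot widen
  return 0;
}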
6033
6034static bool canWidenShuffleElements(ArrayRef<int> Mask,
6035 const APInt &Zeroable,
6036 bool V2IsZero,
6037 SmallVectorImpl<int> &WidenedMask) {
6038 // Create an alternative mask with info about zeroable elements.
6039 // Here we do not set undef elements as zeroable.
6040 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
6041 if (V2IsZero) {
6042 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6043 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6044 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6045 ZeroableMask[i] = SM_SentinelZero;
6046 }
6047 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6048}
6049
6050static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6051 SmallVector<int, 32> WidenedMask;
6052 return canWidenShuffleElements(Mask, WidenedMask);
6053}
6054
6055// Attempt to narrow/widen shuffle mask until it matches the target number of
6056// elements.
6057static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6058 SmallVectorImpl<int> &ScaledMask) {
6059 unsigned NumSrcElts = Mask.size();
6060 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6061 "Illegal shuffle scale factor");
6062
6063 // Narrowing is guaranteed to work.
6064 if (NumDstElts >= NumSrcElts) {
6065 int Scale = NumDstElts / NumSrcElts;
6066 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6067 return true;
6068 }
6069
6070 // We have to repeat the widening until we reach the target size, but we can
6071 // split out the first widening as it sets up ScaledMask for us.
6072 if (canWidenShuffleElements(Mask, ScaledMask)) {
6073 while (ScaledMask.size() > NumDstElts) {
6074 SmallVector<int, 16> WidenedMask;
6075 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6076 return false;
6077 ScaledMask = std::move(WidenedMask);
6078 }
6079 return true;
6080 }
6081
6082 return false;
6083}
6084
6085/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6086bool X86::isZeroNode(SDValue Elt) {
6087 return isNullConstant(Elt) || isNullFPConstant(Elt);
6088}
6089
6090// Build a vector of constants.
6091// Use an UNDEF node if MaskElt == -1.
6092// Split 64-bit constants in the 32-bit mode.
6093static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6094 const SDLoc &dl, bool IsMask = false) {
6095
6096 SmallVector<SDValue, 32> Ops;
6097 bool Split = false;
6098
6099 MVT ConstVecVT = VT;
6100 unsigned NumElts = VT.getVectorNumElements();
6101 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6102 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6103 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6104 Split = true;
6105 }
6106
6107 MVT EltVT = ConstVecVT.getVectorElementType();
6108 for (unsigned i = 0; i < NumElts; ++i) {
6109 bool IsUndef = Values[i] < 0 && IsMask;
6110 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6111 DAG.getConstant(Values[i], dl, EltVT);
6112 Ops.push_back(OpNode);
6113 if (Split)
6114 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6115 DAG.getConstant(0, dl, EltVT));
6116 }
6117 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6118 if (Split)
6119 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6120 return ConstsNode;
6121}
6122
6123static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6124 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6125 assert(Bits.size() == Undefs.getBitWidth() &&
6126 "Unequal constant and undef arrays");
6127 SmallVector<SDValue, 32> Ops;
6128 bool Split = false;
6129
6130 MVT ConstVecVT = VT;
6131 unsigned NumElts = VT.getVectorNumElements();
6132 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6133 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6134 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6135 Split = true;
6136 }
6137
6138 MVT EltVT = ConstVecVT.getVectorElementType();
6139 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6140 if (Undefs[i]) {
6141 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6142 continue;
6143 }
6144 const APInt &V = Bits[i];
6145 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6146 if (Split) {
6147 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6148 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6149 } else if (EltVT == MVT::f32) {
6150 APFloat FV(APFloat::IEEEsingle(), V);
6151 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6152 } else if (EltVT == MVT::f64) {
6153 APFloat FV(APFloat::IEEEdouble(), V);
6154 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6155 } else {
6156 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6157 }
6158 }
6159
6160 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6161 return DAG.getBitcast(VT, ConstsNode);
6162}
6163
6164/// Returns a vector of specified type with all zero elements.
6165static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6166 SelectionDAG &DAG, const SDLoc &dl) {
6167 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6168 VT.getVectorElementType() == MVT::i1) &&
6169 "Unexpected vector type");
6170
6171 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6172 // type. This ensures they get CSE'd. But if the integer type is not
6173 // available, use a floating-point +0.0 instead.
6174 SDValue Vec;
6175 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6176 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6177 } else if (VT.isFloatingPoint()) {
6178 Vec = DAG.getConstantFP(+0.0, dl, VT);
6179 } else if (VT.getVectorElementType() == MVT::i1) {
6180 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6181 "Unexpected vector type");
6182 Vec = DAG.getConstant(0, dl, VT);
6183 } else {
6184 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6185 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6186 }
6187 return DAG.getBitcast(VT, Vec);
6188}
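// [Illustrative example, not part of the original source] With SSE2 available,
// an integer zero such as v4i64 is emitted as an all-zero v8i32 and bitcast
// back, so repeated requests CSE to the same node:
//   getZeroVector(MVT::v4i64, Subtarget, DAG, dl)
//     -> bitcast v4i64 (build_vector v8i32 zero)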
6189
6190// Helper to determine if all the extracted subvector ops come from a
6191// single source. If we allow commute they don't have to be in order (Lo/Hi).
6192static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6193 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6194 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6195 LHS.getValueType() != RHS.getValueType() ||
6196 LHS.getOperand(0) != RHS.getOperand(0))
6197 return SDValue();
6198
6199 SDValue Src = LHS.getOperand(0);
6200 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6201 return SDValue();
6202
6203 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6204 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6205 RHS.getConstantOperandAPInt(1) == NumElts) ||
6206 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6207 LHS.getConstantOperandAPInt(1) == NumElts))
6208 return Src;
6209
6210 return SDValue();
6211}
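// [Illustrative example, not part of the original source] For a v8i32 source X:
//   LHS = extract_subvector(X, 0), RHS = extract_subvector(X, 4)  -> returns X
//   LHS = extract_subvector(X, 4), RHS = extract_subvector(X, 0)  -> returns X
//     only when AllowCommute is true; otherwise SDValue() is returned.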
6212
6213static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6214 const SDLoc &dl, unsigned vectorWidth) {
6215 EVT VT = Vec.getValueType();
6216 EVT ElVT = VT.getVectorElementType();
6217 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6218 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6219 VT.getVectorNumElements() / Factor);
6220
6221 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6222 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6223 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6224
6225 // This is the index of the first element of the vectorWidth-bit chunk
6226 // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6227 IdxVal &= ~(ElemsPerChunk - 1);
6228
6229 // If the input is a buildvector just emit a smaller one.
6230 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6231 return DAG.getBuildVector(ResultVT, dl,
6232 Vec->ops().slice(IdxVal, ElemsPerChunk));
6233
6234 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6235 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6236}
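// [Illustrative example, not part of the original source] IdxVal is rounded
// down to a whole chunk, so asking for element 5 of a v8i32 with a 128-bit
// width extracts the upper half:
//   extractSubVector(v8i32 Vec, /*IdxVal*/5, DAG, dl, /*vectorWidth*/128)
//     -> ElemsPerChunk = 4, IdxVal &= ~3 -> 4
//     -> extract_subvector(Vec, 4) : v4i32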
6237
6238/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6239/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6240/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6241/// instructions or a simple subregister reference. Idx is an index in the
6242/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6243/// lowering EXTRACT_VECTOR_ELT operations easier.
6244static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6245 SelectionDAG &DAG, const SDLoc &dl) {
6246 assert((Vec.getValueType().is256BitVector() ||
6247 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6248 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6249}
6250
6251/// Generate a DAG to grab 256-bits from a 512-bit vector.
6252static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6253 SelectionDAG &DAG, const SDLoc &dl) {
6254 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6255 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6256}
6257
6258static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6259 SelectionDAG &DAG, const SDLoc &dl,
6260 unsigned vectorWidth) {
6261 assert((vectorWidth == 128 || vectorWidth == 256) &&
6262 "Unsupported vector width");
6263 // Inserting UNDEF leaves Result unchanged.
6264 if (Vec.isUndef())
6265 return Result;
6266 EVT VT = Vec.getValueType();
6267 EVT ElVT = VT.getVectorElementType();
6268 EVT ResultVT = Result.getValueType();
6269
6270 // Insert the relevant vectorWidth bits.
6271 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6272 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6273
6274 // This is the index of the first element of the vectorWidth-bit chunk
6275 // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6276 IdxVal &= ~(ElemsPerChunk - 1);
6277
6278 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6279 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6280}
6281
6282/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6283/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6284/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6285/// simple superregister reference. Idx is an index in the 128 bits
6286/// we want. It need not be aligned to a 128-bit boundary. That makes
6287/// lowering INSERT_VECTOR_ELT operations easier.
6288static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6289 SelectionDAG &DAG, const SDLoc &dl) {
6290 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!")(static_cast <bool> (Vec.getValueType().is128BitVector(
) && "Unexpected vector size!") ? void (0) : __assert_fail
("Vec.getValueType().is128BitVector() && \"Unexpected vector size!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 6290, __extension__
__PRETTY_FUNCTION__))
;
6291 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6292}
6293
6294/// Widen a vector to a larger size with the same scalar type, with the new
6295/// elements either zero or undef.
6296static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6297 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6298 const SDLoc &dl) {
6299 assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
6300 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6301 "Unsupported vector widening type");
6302 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6303 : DAG.getUNDEF(VT);
6304 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6305 DAG.getIntPtrConstant(0, dl));
6306}
6307
6308/// Widen a vector to a larger size with the same scalar type, with the new
6309/// elements either zero or undef.
6310static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6311 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6312 const SDLoc &dl, unsigned WideSizeInBits) {
6313 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6314 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6315 "Unsupported vector widening type");
6316 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6317 MVT SVT = Vec.getSimpleValueType().getScalarType();
6318 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6319 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6320}
6321
6322// Helper function to collect subvector ops that are concatenated together,
6323// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6324// The subvectors in Ops are guaranteed to be the same type.
6325static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
6326 assert(Ops.empty() && "Expected an empty ops vector");
6327
6328 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6329 Ops.append(N->op_begin(), N->op_end());
6330 return true;
6331 }
6332
6333 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6334 SDValue Src = N->getOperand(0);
6335 SDValue Sub = N->getOperand(1);
6336 const APInt &Idx = N->getConstantOperandAPInt(2);
6337 EVT VT = Src.getValueType();
6338 EVT SubVT = Sub.getValueType();
6339
6340 // TODO - Handle more general insert_subvector chains.
6341 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6342 Idx == (VT.getVectorNumElements() / 2)) {
6343 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6344 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6345 Src.getOperand(1).getValueType() == SubVT &&
6346 isNullConstant(Src.getOperand(2))) {
6347 Ops.push_back(Src.getOperand(1));
6348 Ops.push_back(Sub);
6349 return true;
6350 }
6351 // insert_subvector(x, extract_subvector(x, lo), hi)
6352 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6353 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6354 Ops.append(2, Sub);
6355 return true;
6356 }
6357 }
6358 }
6359
6360 return false;
6361}
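// [Illustrative example, not part of the original source] Patterns recognised
// above, for a v8i32 result built from v4i32 halves x and y:
//   concat_vectors(x, y)                                   -> Ops = {x, y}
//   insert_subvector(insert_subvector(undef, x, 0), y, 4)  -> Ops = {x, y}
//   insert_subvector(x, extract_subvector(x, 0), 4)
//     -> Ops = {extract_subvector(x, 0), extract_subvector(x, 0)}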
6362
6363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6364 const SDLoc &dl) {
6365 EVT VT = Op.getValueType();
6366 unsigned NumElems = VT.getVectorNumElements();
6367 unsigned SizeInBits = VT.getSizeInBits();
6368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6369 "Can't split odd sized vector");
6370
6371 // If this is a splat value (with no-undefs) then use the lower subvector,
6372 // which should be a free extraction.
6373 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6374 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6375 return std::make_pair(Lo, Lo);
6376
6377 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6378 return std::make_pair(Lo, Hi);
6379}
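// [Illustrative example, not part of the original source] Splitting a v8i32:
//   splitVector(Op, DAG, dl)
//     -> Lo = extract_subvector(Op, 0) : v4i32
//        Hi = extract_subvector(Op, 4) : v4i32
// If Op is a no-undef splat, (Lo, Lo) is returned so both halves share the one
// (free) low-subvector extraction.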
6380
6381/// Break an operation into 2 half sized ops and then concatenate the results.
6382static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6383 unsigned NumOps = Op.getNumOperands();
6384 EVT VT = Op.getValueType();
6385 SDLoc dl(Op);
6386
6387 // Extract the LHS Lo/Hi vectors
6388 SmallVector<SDValue> LoOps(NumOps, SDValue());
6389 SmallVector<SDValue> HiOps(NumOps, SDValue());
6390 for (unsigned I = 0; I != NumOps; ++I) {
6391 SDValue SrcOp = Op.getOperand(I);
6392 if (!SrcOp.getValueType().isVector()) {
6393 LoOps[I] = HiOps[I] = SrcOp;
6394 continue;
6395 }
6396 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6397 }
6398
6399 EVT LoVT, HiVT;
6400 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6401 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6402 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6403 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6404}
6405
6406/// Break a unary integer operation into 2 half sized ops and then
6407/// concatenate the result back.
6408static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6409 // Make sure we only try to split 256/512-bit types to avoid creating
6410 // narrow vectors.
6411 EVT VT = Op.getValueType();
6412 (void)VT;
6413 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6414 Op.getOperand(0).getValueType().is512BitVector()) &&
6415 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6416 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6417 VT.getVectorNumElements() &&
6418 "Unexpected VTs!");
6419 return splitVectorOp(Op, DAG);
6420}
6421
6422/// Break a binary integer operation into 2 half sized ops and then
6423/// concatenate the result back.
6424static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6425 // Assert that all the types match.
6426 EVT VT = Op.getValueType();
6427 (void)VT;
6428 assert(Op.getOperand(0).getValueType() == VT &&
6429 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6430 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6431 return splitVectorOp(Op, DAG);
6432}
6433
6434// Helper for splitting operands of an operation to legal target size and
6435// apply a function on each part.
6436// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6437// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6438// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6439// The argument Builder is a function that will be applied on each split part:
6440// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6441template <typename F>
6442SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6443 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6444 F Builder, bool CheckBWI = true) {
6445 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6446 unsigned NumSubs = 1;
6447 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6448 (!CheckBWI && Subtarget.useAVX512Regs())) {
6449 if (VT.getSizeInBits() > 512) {
6450 NumSubs = VT.getSizeInBits() / 512;
6451 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6452 }
6453 } else if (Subtarget.hasAVX2()) {
6454 if (VT.getSizeInBits() > 256) {
6455 NumSubs = VT.getSizeInBits() / 256;
6456 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6457 }
6458 } else {
6459 if (VT.getSizeInBits() > 128) {
6460 NumSubs = VT.getSizeInBits() / 128;
6461 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6462 }
6463 }
6464
6465 if (NumSubs == 1)
6466 return Builder(DAG, DL, Ops);
6467
6468 SmallVector<SDValue, 4> Subs;
6469 for (unsigned i = 0; i != NumSubs; ++i) {
6470 SmallVector<SDValue, 2> SubOps;
6471 for (SDValue Op : Ops) {
6472 EVT OpVT = Op.getValueType();
6473 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6474 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6475 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6476 }
6477 Subs.push_back(Builder(DAG, DL, SubOps));
6478 }
6479 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6480}
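// [Illustrative usage sketch, not part of the original source] A hypothetical
// caller lowering a wide add on an SSE2-only target would pass a Builder that
// emits the per-piece node; SplitOpsAndApply handles the splitting and the
// final CONCAT_VECTORS:
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue Res =
//       SplitOpsAndApply(DAG, Subtarget, DL, VT, {Op0, Op1}, AddBuilder);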
6481
6482// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6483// targets.
6484static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6485 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6486 const X86Subtarget &Subtarget) {
6487 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6488 MVT SVT = VT.getScalarType();
6489
6490 // If we have a 32/64 splatted constant, splat it to DstTy to
6491 // encourage a foldable broadcast'd operand.
6492 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6493 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6494 // AVX512 broadcasts 32/64-bit operands.
6495 // TODO: Support float once getAVX512Node is used by fp-ops.
6496 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6497 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6498 return SDValue();
6499 // If we're not widening, don't bother if we're not bitcasting.
6500 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6501 return SDValue();
6502 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6503 APInt SplatValue, SplatUndef;
6504 unsigned SplatBitSize;
6505 bool HasAnyUndefs;
6506 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6507 HasAnyUndefs, OpEltSizeInBits) &&
6508 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6509 return DAG.getConstant(SplatValue, DL, DstVT);
6510 }
6511 return SDValue();
6512 };
6513
6514 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6515
6516 MVT DstVT = VT;
6517 if (Widen)
6518 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6519
6520 // Canonicalize src operands.
6521 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6522 for (SDValue &Op : SrcOps) {
6523 MVT OpVT = Op.getSimpleValueType();
6524 // Just pass through scalar operands.
6525 if (!OpVT.isVector())
6526 continue;
6527 assert(OpVT == VT && "Vector type mismatch");
6528
6529 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6530 Op = BroadcastOp;
6531 continue;
6532 }
6533
6534 // Just widen the subvector by inserting into an undef wide vector.
6535 if (Widen)
6536 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6537 }
6538
6539 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6540
6541 // Perform the 512-bit op then extract the bottom subvector.
6542 if (Widen)
6543 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6544 return Res;
6545}
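// [Illustrative example, not part of the original source] On an AVX512F target
// without VLX, a 128-bit op is widened so the 512-bit instruction form can be
// used, then the low subvector is extracted again (Opc is a placeholder):
//   getAVX512Node(Opc, DL, MVT::v4i32, {A, B}, DAG, Subtarget)
//     -> widen A, B to v16i32 (undef upper), emit Opc : v16i32,
//        extract_subvector(result, 0) : v4i32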
6546
6547/// Insert i1-subvector to i1-vector.
6548static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6549 const X86Subtarget &Subtarget) {
6550
6551 SDLoc dl(Op);
6552 SDValue Vec = Op.getOperand(0);
6553 SDValue SubVec = Op.getOperand(1);
6554 SDValue Idx = Op.getOperand(2);
6555 unsigned IdxVal = Op.getConstantOperandVal(2);
6556
6557 // Inserting undef is a nop. We can just return the original vector.
6558 if (SubVec.isUndef())
6559 return Vec;
6560
6561 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6562 return Op;
6563
6564 MVT OpVT = Op.getSimpleValueType();
6565 unsigned NumElems = OpVT.getVectorNumElements();
6566 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6567
6568 // Extend to natively supported kshift.
6569 MVT WideOpVT = OpVT;
6570 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6571 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6572
6573 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6574 // if necessary.
6575 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6576 // May need to promote to a legal type.
6577 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6578 DAG.getConstant(0, dl, WideOpVT),
6579 SubVec, Idx);
6580 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6581 }
6582
6583 MVT SubVecVT = SubVec.getSimpleValueType();
6584 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6585 assert(IdxVal + SubVecNumElems <= NumElems &&
6586 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6587 "Unexpected index value in INSERT_SUBVECTOR");
6588
6589 SDValue Undef = DAG.getUNDEF(WideOpVT);
6590
6591 if (IdxVal == 0) {
6592 // Zero lower bits of the Vec
6593 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6594 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6595 ZeroIdx);
6596 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6597 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6598 // Merge them together, SubVec should be zero extended.
6599 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6600 DAG.getConstant(0, dl, WideOpVT),
6601 SubVec, ZeroIdx);
6602 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6603 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6604 }
6605
6606 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6607 Undef, SubVec, ZeroIdx);
6608
6609 if (Vec.isUndef()) {
6610 assert(IdxVal != 0 && "Unexpected index");
6611 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6612 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6614 }
6615
6616 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6617 assert(IdxVal != 0 && "Unexpected index");
6618 // If upper elements of Vec are known undef, then just shift into place.
6619 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6620 [](SDValue V) { return V.isUndef(); })) {
6621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6623 } else {
6624 NumElems = WideOpVT.getVectorNumElements();
6625 unsigned ShiftLeft = NumElems - SubVecNumElems;
6626 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6628 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6629 if (ShiftRight != 0)
6630 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6631 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6632 }
6633 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6634 }
6635
6636 // Simple case when we put subvector in the upper part
6637 if (IdxVal + SubVecNumElems == NumElems) {
6638 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6639 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6640 if (SubVecNumElems * 2 == NumElems) {
6641 // Special case, use legal zero extending insert_subvector. This allows
6642 // isel to optimize when bits are known zero.
6643 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6644 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6645 DAG.getConstant(0, dl, WideOpVT),
6646 Vec, ZeroIdx);
6647 } else {
6648 // Otherwise use explicit shifts to zero the bits.
6649 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6650 Undef, Vec, ZeroIdx);
6651 NumElems = WideOpVT.getVectorNumElements();
6652 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6653 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6654 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6655 }
6656 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6657 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6658 }
6659
6660 // Inserting into the middle is more complicated.
6661
6662 NumElems = WideOpVT.getVectorNumElements();
6663
6664 // Widen the vector if needed.
6665 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6666
6667 unsigned ShiftLeft = NumElems - SubVecNumElems;
6668 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6669
6670 // Do an optimization for the most frequently used types.
6671 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6672 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6673 Mask0.flipAllBits();
6674 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6675 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6676 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6677 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6678 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6679 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6680 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6681 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6682
6683 // Reduce to original width if needed.
6684 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6685 }
6686
6687 // Clear the upper bits of the subvector and move it to its insert position.
6688 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6689 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6690 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6691 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6692
6693 // Isolate the bits below the insertion point.
6694 unsigned LowShift = NumElems - IdxVal;
6695 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6696 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6697 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6698 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6699
6700 // Isolate the bits after the last inserted bit.
6701 unsigned HighShift = IdxVal + SubVecNumElems;
6702 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6703 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6704 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6705 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6706
6707 // Now OR all 3 pieces together.
6708 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6709 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6710
6711 // Reduce to original width if needed.
6712 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6713}
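// [Illustrative example, not part of the original source] Middle insertion with
// the mask-and-shift path above, assuming AVX512DQ so WideOpVT stays v8i1:
// inserting a v2i1 SubVec at IdxVal = 2 into a v8i1 Vec (bit i = element i):
//   Mask0      = 1111 0011   -> clears Vec elements 2..3
//   ShiftLeft  = 6, ShiftRight = 4
//   SubVec << 6 >> 4 places its two bits at positions 3..2
//   result = (Vec & Mask0) | shifted SubVec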
6714
6715static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6716 const SDLoc &dl) {
6717 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6718 EVT SubVT = V1.getValueType();
6719 EVT SubSVT = SubVT.getScalarType();
6720 unsigned SubNumElts = SubVT.getVectorNumElements();
6721 unsigned SubVectorWidth = SubVT.getSizeInBits();
6722 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6723 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6724 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6725}
6726
6727/// Returns a vector of specified type with all bits set.
6728/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6729/// Then bitcast to their original type, ensuring they get CSE'd.
6730static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6731 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6732 "Expected a 128/256/512-bit vector type");
6733
6734 APInt Ones = APInt::getAllOnes(32);
6735 unsigned NumElts = VT.getSizeInBits() / 32;
6736 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6737 return DAG.getBitcast(VT, Vec);
6738}
6739
6740// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6741static unsigned getOpcode_EXTEND(unsigned Opcode) {
6742 switch (Opcode) {
6743 case ISD::ANY_EXTEND:
6744 case ISD::ANY_EXTEND_VECTOR_INREG:
6745 return ISD::ANY_EXTEND;
6746 case ISD::ZERO_EXTEND:
6747 case ISD::ZERO_EXTEND_VECTOR_INREG:
6748 return ISD::ZERO_EXTEND;
6749 case ISD::SIGN_EXTEND:
6750 case ISD::SIGN_EXTEND_VECTOR_INREG:
6751 return ISD::SIGN_EXTEND;
6752 }
6753 llvm_unreachable("Unknown opcode");
6754}
6755
6756// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6757static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6758 switch (Opcode) {
6759 case ISD::ANY_EXTEND:
6760 case ISD::ANY_EXTEND_VECTOR_INREG:
6761 return ISD::ANY_EXTEND_VECTOR_INREG;
6762 case ISD::ZERO_EXTEND:
6763 case ISD::ZERO_EXTEND_VECTOR_INREG:
6764 return ISD::ZERO_EXTEND_VECTOR_INREG;
6765 case ISD::SIGN_EXTEND:
6766 case ISD::SIGN_EXTEND_VECTOR_INREG:
6767 return ISD::SIGN_EXTEND_VECTOR_INREG;
6768 }
6769 llvm_unreachable("Unknown opcode");
6770}
6771
6772static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6773 SDValue In, SelectionDAG &DAG) {
6774 EVT InVT = In.getValueType();
6775 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6776 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6777 ISD::ZERO_EXTEND == Opcode) &&
6778 "Unknown extension opcode");
6779
6780 // For 256-bit vectors, we only need the lower (128-bit) input half.
6781 // For 512-bit vectors, we only need the lower input half or quarter.
6782 if (InVT.getSizeInBits() > 128) {
6783 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6784 "Expected VTs to be the same size!");
6785 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6786 In = extractSubVector(In, 0, DAG, DL,
6787 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6788 InVT = In.getValueType();
6789 }
6790
6791 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6792 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6793
6794 return DAG.getNode(Opcode, DL, VT, In);
6795}
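// [Illustrative example, not part of the original source] Sign-extending the
// low half of a 256-bit input to a 256-bit result:
//   getEXTEND_VECTOR_INREG(ISD::SIGN_EXTEND, DL, MVT::v8i32, In /*v16i16*/, DAG)
//     -> In is shrunk to its low v8i16 half (max(128, 256/2) = 128 bits),
//        element counts now match, so a plain ISD::SIGN_EXTEND is emitted.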
6796
6797// Match (xor X, -1) -> X.
6798// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6799// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6800static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6801 V = peekThroughBitcasts(V);
6802 if (V.getOpcode() == ISD::XOR &&
6803 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6804 return V.getOperand(0);
6805 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6806 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6807 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6808 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6810 Not, V.getOperand(1));
6811 }
6812 }
6813 SmallVector<SDValue, 2> CatOps;
6814 if (collectConcatOps(V.getNode(), CatOps)) {
6815 for (SDValue &CatOp : CatOps) {
6816 SDValue NotCat = IsNOT(CatOp, DAG);
6817 if (!NotCat) return SDValue();
6818 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6819 }
6820 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6821 }
6822 return SDValue();
6823}
6824
6825void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6826 bool Lo, bool Unary) {
6827 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6828 "Illegal vector type to unpack");
6829 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6830 int NumElts = VT.getVectorNumElements();
6831 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6832 for (int i = 0; i < NumElts; ++i) {
6833 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6834 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6835 Pos += (Unary ? 0 : NumElts * (i % 2));
6836 Pos += (Lo ? 0 : NumEltsInLane / 2);
6837 Mask.push_back(Pos);
6838 }
6839}
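// [Illustrative example, not part of the original source] For VT = v8i32 the
// masks produced above match the per-128-bit-lane unpack semantics:
//   Lo, binary (Unary=false): <0, 8, 1, 9, 4, 12, 5, 13>    (vpunpckldq)
//   Hi, binary (Unary=false): <2, 10, 3, 11, 6, 14, 7, 15>  (vpunpckhdq)
//   Lo, unary  (Unary=true):  <0, 0, 1, 1, 4, 4, 5, 5>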
6840
6841/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6842/// imposed by AVX and specific to the unary pattern. Example:
6843/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6844/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6845void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6846 bool Lo) {
6847 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6848 int NumElts = VT.getVectorNumElements();
6849 for (int i = 0; i < NumElts; ++i) {
6850 int Pos = i / 2;
6851 Pos += (Lo ? 0 : NumElts / 2);
6852 Mask.push_back(Pos);
6853 }
6854}
6855
6856// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
6857static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
6858 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
6859 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
6860 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
6861 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
6862 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
6863 int M = Mask[I];
6864 if (M < 0)
6865 continue;
6866 SDValue V = (M < NumElts) ? V1 : V2;
6867 if (V.isUndef())
6868 continue;
6869 Ops[I] = V.getOperand(M % NumElts);
6870 }
6871 return DAG.getBuildVector(VT, dl, Ops);
6872 }
6873
6874 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6875}
6876
6877/// Returns a vector_shuffle node for an unpackl operation.
6878static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6879 SDValue V1, SDValue V2) {
6880 SmallVector<int, 8> Mask;
6881 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6882 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6883}
6884
6885/// Returns a vector_shuffle node for an unpackh operation.
6886static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6887 SDValue V1, SDValue V2) {
6888 SmallVector<int, 8> Mask;
6889 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6890 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6891}
6892
6893/// Returns a node that packs the LHS + RHS nodes together at half width.
6894/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
6895/// TODO: Add subvector splitting if/when we have a need for it.
6896static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6897 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
6898 bool PackHiHalf = false) {
6899 MVT OpVT = LHS.getSimpleValueType();
6900 unsigned EltSizeInBits = VT.getScalarSizeInBits();
6901 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
6902 assert(OpVT == RHS.getSimpleValueType() &&
6903 VT.getSizeInBits() == OpVT.getSizeInBits() &&
6904 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
6905 "Unexpected PACK operand types");
6906 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
6907 "Unexpected PACK result type");
6908
6909 // Rely on vector shuffles for vXi64 -> vXi32 packing.
6910 if (EltSizeInBits == 32) {
6911 SmallVector<int> PackMask;
6912 int Offset = PackHiHalf ? 1 : 0;
6913 int NumElts = VT.getVectorNumElements();
6914 for (int I = 0; I != NumElts; I += 4) {
6915 PackMask.push_back(I + Offset);
6916 PackMask.push_back(I + Offset + 2);
6917 PackMask.push_back(I + Offset + NumElts);
6918 PackMask.push_back(I + Offset + NumElts + 2);
6919 }
6920 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
6921 DAG.getBitcast(VT, RHS), PackMask);
6922 }
6923
6924 // See if we already have sufficient leading bits for PACKSS/PACKUS.
6925 if (!PackHiHalf) {
6926 if (UsePackUS &&
6927 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
6928 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
6929 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6930
6931 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
6932 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
6933 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6934 }
6935
6936 // Fallback to sign/zero extending the requested half and pack.
6937 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
6938 if (UsePackUS) {
6939 if (PackHiHalf) {
6940 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
6941 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
6942 } else {
6943 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
6944 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
6945 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
6946 };
6947 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6948 };
6949
6950 if (!PackHiHalf) {
6951 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
6952 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
6953 }
6954 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
6955 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
6956 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6957}
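// [Illustrative example, not part of the original source] Packing v4i64 LHS/RHS
// into a v8i32 result takes the shuffle path above; with PackHiHalf=false the
// mask selects the even (low) dwords of each 128-bit lane:
//   PackMask = <0, 2, 8, 10, 4, 6, 12, 14>
// i.e. { lo32(L0), lo32(L1), lo32(R0), lo32(R1), lo32(L2), lo32(L3), ... }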
6958
6959/// Return a vector_shuffle of the specified vector and a zero or undef vector.
6960/// This produces a shuffle where the low element of V2 is swizzled into the
6961/// zero/undef vector, landing at element Idx.
6962/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6963static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6964 bool IsZero,
6965 const X86Subtarget &Subtarget,
6966 SelectionDAG &DAG) {
6967 MVT VT = V2.getSimpleValueType();
6968 SDValue V1 = IsZero
6969 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6970 int NumElems = VT.getVectorNumElements();
6971 SmallVector<int, 16> MaskVec(NumElems);
6972 for (int i = 0; i != NumElems; ++i)
6973 // If this is the insertion idx, put the low elt of V2 here.
6974 MaskVec[i] = (i == Idx) ? NumElems : i;
6975 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6976}
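Illustration (plain C++, standalone): the masks described in the comment above, computed for NumElems == 4.

// Standalone sketch of the mask getShuffleVectorZeroOrUndef builds for 4 elements.
#include <cstdio>

int main() {
  const int NumElems = 4;
  for (int Idx = 0; Idx != NumElems; ++Idx) {
    std::printf("Idx=%d:", Idx);
    for (int i = 0; i != NumElems; ++i)
      std::printf(" %d", i == Idx ? NumElems : i);  // Idx=0 -> 4 1 2 3, Idx=3 -> 0 1 2 4
    std::printf("\n");
  }
}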
6977
6978static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6979 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6980 Ptr.getOpcode() == X86ISD::WrapperRIP)
6981 Ptr = Ptr.getOperand(0);
6982
6983 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6984 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6985 return nullptr;
6986
6987 return CNode->getConstVal();
6988}
6989
6990static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6991 if (!Load || !ISD::isNormalLoad(Load))
6992 return nullptr;
6993 return getTargetConstantFromBasePtr(Load->getBasePtr());
6994}
6995
6996static const Constant *getTargetConstantFromNode(SDValue Op) {
6997 Op = peekThroughBitcasts(Op);
6998 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6999}
7000
7001const Constant *
7002X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7003 assert(LD && "Unexpected null LoadSDNode");
7004 return getTargetConstantFromNode(LD);
7005}
7006
7007// Extract raw constant bits from constant pools.
7008static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7009 APInt &UndefElts,
7010 SmallVectorImpl<APInt> &EltBits,
7011 bool AllowWholeUndefs = true,
7012 bool AllowPartialUndefs = true) {
7013 assert(EltBits.empty() && "Expected an empty EltBits vector");
7014
7015 Op = peekThroughBitcasts(Op);
7016
7017 EVT VT = Op.getValueType();
7018 unsigned SizeInBits = VT.getSizeInBits();
7019 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7020 unsigned NumElts = SizeInBits / EltSizeInBits;
7021
7022 // Bitcast a source array of element bits to the target size.
7023 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7024 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7025 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7026 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7027 "Constant bit sizes don't match");
7028
7029 // Don't split if we don't allow undef bits.
7030 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7031 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7032 return false;
7033
7034 // If we're already the right size, don't bother bitcasting.
7035 if (NumSrcElts == NumElts) {
7036 UndefElts = UndefSrcElts;
7037 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7038 return true;
7039 }
7040
7041 // Extract all the undef/constant element data and pack into single bitsets.
7042 APInt UndefBits(SizeInBits, 0);
7043 APInt MaskBits(SizeInBits, 0);
7044
7045 for (unsigned i = 0; i != NumSrcElts; ++i) {
7046 unsigned BitOffset = i * SrcEltSizeInBits;
7047 if (UndefSrcElts[i])
7048 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7049 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7050 }
7051
7052 // Split the undef/constant single bitset data into the target elements.
7053 UndefElts = APInt(NumElts, 0);
7054 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7055
7056 for (unsigned i = 0; i != NumElts; ++i) {
7057 unsigned BitOffset = i * EltSizeInBits;
7058 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7059
7060 // Only treat an element as UNDEF if all bits are UNDEF.
7061 if (UndefEltBits.isAllOnes()) {
7062 if (!AllowWholeUndefs)
7063 return false;
7064 UndefElts.setBit(i);
7065 continue;
7066 }
7067
7068 // If only some bits are UNDEF then treat them as zero (or bail if not
7069 // supported).
7070 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7071 return false;
7072
7073 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7074 }
7075 return true;
7076 };
7077
7078 // Collect constant bits and insert into mask/undef bit masks.
7079 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7080 unsigned UndefBitIndex) {
7081 if (!Cst)
7082 return false;
7083 if (isa<UndefValue>(Cst)) {
7084 Undefs.setBit(UndefBitIndex);
7085 return true;
7086 }
7087 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7088 Mask = CInt->getValue();
7089 return true;
7090 }
7091 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7092 Mask = CFP->getValueAPF().bitcastToAPInt();
7093 return true;
7094 }
7095 return false;
7096 };
7097
7098 // Handle UNDEFs.
7099 if (Op.isUndef()) {
7100 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7101 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7102 return CastBitData(UndefSrcElts, SrcEltBits);
7103 }
7104
7105 // Extract scalar constant bits.
7106 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7107 APInt UndefSrcElts = APInt::getZero(1);
7108 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7109 return CastBitData(UndefSrcElts, SrcEltBits);
7110 }
7111 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7112 APInt UndefSrcElts = APInt::getZero(1);
7113 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7114 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7115 return CastBitData(UndefSrcElts, SrcEltBits);
7116 }
7117
7118 // Extract constant bits from build vector.
7119 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7120 BitVector Undefs;
7121 SmallVector<APInt> SrcEltBits;
7122 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7123 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7124 APInt UndefSrcElts = APInt::getNullValue(SrcEltBits.size());
7125 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7126 if (Undefs[I])
7127 UndefSrcElts.setBit(I);
7128 return CastBitData(UndefSrcElts, SrcEltBits);
7129 }
7130 }
7131
7132 // Extract constant bits from constant pool vector.
7133 if (auto *Cst = getTargetConstantFromNode(Op)) {
7134 Type *CstTy = Cst->getType();
7135 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7136 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7137 return false;
7138
7139 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7140 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7141
7142 APInt UndefSrcElts(NumSrcElts, 0);
7143 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7144 for (unsigned i = 0; i != NumSrcElts; ++i)
7145 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7146 UndefSrcElts, i))
7147 return false;
7148
7149 return CastBitData(UndefSrcElts, SrcEltBits);
7150 }
7151
7152 // Extract constant bits from a broadcasted constant pool scalar.
7153 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7154 EltSizeInBits <= VT.getScalarSizeInBits()) {
7155 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7156 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7157 return false;
7158
7159 SDValue Ptr = MemIntr->getBasePtr();
7160 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7161 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7162 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7163
7164 APInt UndefSrcElts(NumSrcElts, 0);
7165 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7166 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7167 if (UndefSrcElts[0])
7168 UndefSrcElts.setBits(0, NumSrcElts);
7169 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7170 return CastBitData(UndefSrcElts, SrcEltBits);
7171 }
7172 }
7173 }
7174
7175 // Extract constant bits from a subvector broadcast.
7176 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7177 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7178 SDValue Ptr = MemIntr->getBasePtr();
7179 // The source constant may be larger than the subvector broadcast;
7180 // ensure we extract the correct subvector constants.
7181 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7182 Type *CstTy = Cst->getType();
7183 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7184 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7185 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7186 (SizeInBits % SubVecSizeInBits) != 0)
7187 return false;
7188 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7189 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7190 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7191 APInt UndefSubElts(NumSubElts, 0);
7192 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7193 APInt(CstEltSizeInBits, 0));
7194 for (unsigned i = 0; i != NumSubElts; ++i) {
7195 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7196 UndefSubElts, i))
7197 return false;
7198 for (unsigned j = 1; j != NumSubVecs; ++j)
7199 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7200 }
7201 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7202 UndefSubElts);
7203 return CastBitData(UndefSubElts, SubEltBits);
7204 }
7205 }
7206
7207 // Extract a rematerialized scalar constant insertion.
7208 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7209 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7210 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7211 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7212 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7213
7214 APInt UndefSrcElts(NumSrcElts, 0);
7215 SmallVector<APInt, 64> SrcEltBits;
7216 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7217 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7218 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7219 return CastBitData(UndefSrcElts, SrcEltBits);
7220 }
7221
7222 // Insert constant bits from base and subvector sources.
7223 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7224 // If we bitcast to larger elements we might lose track of undefs - don't
7225 // allow any, to be safe.
7226 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7227 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7228
7229 APInt UndefSrcElts, UndefSubElts;
7230 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7231 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7232 UndefSubElts, EltSubBits,
7233 AllowWholeUndefs && AllowUndefs,
7234 AllowPartialUndefs && AllowUndefs) &&
7235 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7236 UndefSrcElts, EltSrcBits,
7237 AllowWholeUndefs && AllowUndefs,
7238 AllowPartialUndefs && AllowUndefs)) {
7239 unsigned BaseIdx = Op.getConstantOperandVal(2);
7240 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7241 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7242 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7243 return CastBitData(UndefSrcElts, EltSrcBits);
7244 }
7245 }
7246
7247 // Extract constant bits from a subvector's source.
7248 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7249 // TODO - support extract_subvector through bitcasts.
7250 if (EltSizeInBits != VT.getScalarSizeInBits())
7251 return false;
7252
7253 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7254 UndefElts, EltBits, AllowWholeUndefs,
7255 AllowPartialUndefs)) {
7256 EVT SrcVT = Op.getOperand(0).getValueType();
7257 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7258 unsigned NumSubElts = VT.getVectorNumElements();
7259 unsigned BaseIdx = Op.getConstantOperandVal(1);
7260 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7261 if ((BaseIdx + NumSubElts) != NumSrcElts)
7262 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7263 if (BaseIdx != 0)
7264 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7265 return true;
7266 }
7267 }
7268
7269 // Extract constant bits from shuffle node sources.
7270 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7271 // TODO - support shuffle through bitcasts.
7272 if (EltSizeInBits != VT.getScalarSizeInBits())
7273 return false;
7274
7275 ArrayRef<int> Mask = SVN->getMask();
7276 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7277 llvm::any_of(Mask, [](int M) { return M < 0; }))
7278 return false;
7279
7280 APInt UndefElts0, UndefElts1;
7281 SmallVector<APInt, 32> EltBits0, EltBits1;
7282 if (isAnyInRange(Mask, 0, NumElts) &&
7283 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7284 UndefElts0, EltBits0, AllowWholeUndefs,
7285 AllowPartialUndefs))
7286 return false;
7287 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7288 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7289 UndefElts1, EltBits1, AllowWholeUndefs,
7290 AllowPartialUndefs))
7291 return false;
7292
7293 UndefElts = APInt::getZero(NumElts);
7294 for (int i = 0; i != (int)NumElts; ++i) {
7295 int M = Mask[i];
7296 if (M < 0) {
7297 UndefElts.setBit(i);
7298 EltBits.push_back(APInt::getZero(EltSizeInBits));
7299 } else if (M < (int)NumElts) {
7300 if (UndefElts0[M])
7301 UndefElts.setBit(i);
7302 EltBits.push_back(EltBits0[M]);
7303 } else {
7304 if (UndefElts1[M - NumElts])
7305 UndefElts.setBit(i);
7306 EltBits.push_back(EltBits1[M - NumElts]);
7307 }
7308 }
7309 return true;
7310 }
7311
7312 return false;
7313}
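Illustration (plain C++; the constants are made up): the repacking that the CastBitData lambda above performs when the requested element size differs from the source, here 2 x i64 source constants re-split into 4 x i32 elements.

// Standalone sketch: concatenate the source elements into one wide bit string
// and re-split it into EltSizeInBits-wide pieces.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t Src[2] = {0x0000000200000001ULL, 0x0000000400000003ULL};
  std::vector<uint32_t> EltBits;
  for (uint64_t S : Src) {
    EltBits.push_back(static_cast<uint32_t>(S));        // low 32 bits
    EltBits.push_back(static_cast<uint32_t>(S >> 32));  // high 32 bits
  }
  for (uint32_t E : EltBits)
    std::printf("%u ", E);  // prints: 1 2 3 4
  std::printf("\n");
}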
7314
7315namespace llvm {
7316namespace X86 {
7317bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7318 APInt UndefElts;
7319 SmallVector<APInt, 16> EltBits;
7320 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7321 UndefElts, EltBits, true,
7322 AllowPartialUndefs)) {
7323 int SplatIndex = -1;
7324 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7325 if (UndefElts[i])
7326 continue;
7327 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7328 SplatIndex = -1;
7329 break;
7330 }
7331 SplatIndex = i;
7332 }
7333 if (0 <= SplatIndex) {
7334 SplatVal = EltBits[SplatIndex];
7335 return true;
7336 }
7337 }
7338
7339 return false;
7340}
7341} // namespace X86
7342} // namespace llvm
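Illustration (plain C++; Undef/EltBits values are hypothetical): the splat scan used by X86::isConstantSplat, which skips undef lanes and requires all remaining defined lanes to carry identical bits.

// Standalone sketch of the splat detection loop above.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint64_t> EltBits = {42, 0, 42, 42};
  std::vector<bool> UndefElts   = {false, true, false, false};

  int SplatIndex = -1;
  for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
    if (UndefElts[i])
      continue;                 // undef lanes do not disqualify the splat
    if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
      SplatIndex = -1;
      break;
    }
    SplatIndex = i;
  }
  if (SplatIndex >= 0)
    std::printf("splat of %llu\n", (unsigned long long)EltBits[SplatIndex]);
  else
    std::printf("not a splat\n");
}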
7343
7344static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7345 unsigned MaskEltSizeInBits,
7346 SmallVectorImpl<uint64_t> &RawMask,
7347 APInt &UndefElts) {
7348 // Extract the raw target constant bits.
7349 SmallVector<APInt, 64> EltBits;
7350 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7351 EltBits, /* AllowWholeUndefs */ true,
7352 /* AllowPartialUndefs */ false))
7353 return false;
7354
7355 // Insert the extracted elements into the mask.
7356 for (const APInt &Elt : EltBits)
7357 RawMask.push_back(Elt.getZExtValue());
7358
7359 return true;
7360}
7361
7362/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7363/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7364/// Note: This ignores saturation, so inputs must be checked first.
7365static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7366 bool Unary, unsigned NumStages = 1) {
7367 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7368 unsigned NumElts = VT.getVectorNumElements();
7369 unsigned NumLanes = VT.getSizeInBits() / 128;
7370 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7371 unsigned Offset = Unary ? 0 : NumElts;
7372 unsigned Repetitions = 1u << (NumStages - 1);
7373 unsigned Increment = 1u << NumStages;
7374 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7375
7376 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7377 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7378 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7379 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7380 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7381 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7382 }
7383 }
7384}
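Illustration (plain C++, standalone): the mask createPackShuffleMask produces for a single-stage binary v16i8 PACK, namely the even elements of the first operand followed by the even elements of the second, per 128-bit lane.

// Standalone sketch of the single-stage, binary v16i8 pack shuffle mask.
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElts = 16, NumLanes = 1, NumEltsPerLane = 16, NumStages = 1;
  const unsigned Offset = NumElts;              // binary (not unary) pack
  const unsigned Repetitions = 1u << (NumStages - 1);
  const unsigned Increment = 1u << NumStages;

  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane);
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);
    }
  for (int M : Mask)
    std::printf("%d ", M);  // prints: 0 2 4 ... 14 16 18 ... 30
  std::printf("\n");
}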
7385
7386// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7387static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7388 APInt &DemandedLHS, APInt &DemandedRHS) {
7389 int NumLanes = VT.getSizeInBits() / 128;
7390 int NumElts = DemandedElts.getBitWidth();
7391 int NumInnerElts = NumElts / 2;
7392 int NumEltsPerLane = NumElts / NumLanes;
7393 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7394
7395 DemandedLHS = APInt::getZero(NumInnerElts);
7396 DemandedRHS = APInt::getZero(NumInnerElts);
7397
7398 // Map DemandedElts to the packed operands.
7399 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7400 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7401 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7402 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7403 if (DemandedElts[OuterIdx])
7404 DemandedLHS.setBit(InnerIdx);
7405 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7406 DemandedRHS.setBit(InnerIdx);
7407 }
7408 }
7409}
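Illustration (plain C++, 16-bit masks standing in for APInt): how getPackDemandedElts maps demanded elements of a v16i8 PACK result back to its two v8i16 operands.

// Standalone sketch: demanded result elements 3 and 11 map to element 3 of the
// LHS and element 3 of the RHS respectively.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumLanes = 1, NumElts = 16;
  const int NumInnerElts = NumElts / 2;
  const int NumEltsPerLane = NumElts / NumLanes;
  const int NumInnerEltsPerLane = NumInnerElts / NumLanes;

  uint16_t DemandedElts = (1u << 3) | (1u << 11);
  uint16_t DemandedLHS = 0, DemandedRHS = 0;
  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = Lane * NumEltsPerLane + Elt;
      int InnerIdx = Lane * NumInnerEltsPerLane + Elt;
      if (DemandedElts & (1u << OuterIdx))
        DemandedLHS |= 1u << InnerIdx;
      if (DemandedElts & (1u << (OuterIdx + NumInnerEltsPerLane)))
        DemandedRHS |= 1u << InnerIdx;
    }
  std::printf("LHS=0x%x RHS=0x%x\n", DemandedLHS, DemandedRHS);  // LHS=0x8 RHS=0x8
}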
7410
7411// Split the demanded elts of a HADD/HSUB node between its operands.
7412static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7413 APInt &DemandedLHS, APInt &DemandedRHS) {
7414 int NumLanes = VT.getSizeInBits() / 128;
7415 int NumElts = DemandedElts.getBitWidth();
7416 int NumEltsPerLane = NumElts / NumLanes;
7417 int HalfEltsPerLane = NumEltsPerLane / 2;
7418
7419 DemandedLHS = APInt::getZero(NumElts);
7420 DemandedRHS = APInt::getZero(NumElts);
7421
7422 // Map DemandedElts to the horizontal operands.
7423 for (int Idx = 0; Idx != NumElts; ++Idx) {
7424 if (!DemandedElts[Idx])
7425 continue;
7426 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7427 int LocalIdx = Idx % NumEltsPerLane;
7428 if (LocalIdx < HalfEltsPerLane) {
7429 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7430 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7431 } else {
7432 LocalIdx -= HalfEltsPerLane;
7433 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7434 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7435 }
7436 }
7437}
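Illustration (plain C++, 8-bit masks standing in for APInt): the HADD/HSUB mapping for a v4i32 node, where result element 1 reads LHS elements 2..3 and result element 2 reads RHS elements 0..1.

// Standalone sketch of getHorizDemandedElts for a single 128-bit lane.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumLanes = 1, NumElts = 4;
  const int NumEltsPerLane = NumElts / NumLanes;
  const int HalfEltsPerLane = NumEltsPerLane / 2;

  uint8_t DemandedElts = (1u << 1) | (1u << 2);  // result elements 1 and 2
  uint8_t DemandedLHS = 0, DemandedRHS = 0;
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!(DemandedElts & (1u << Idx)))
      continue;
    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    if (LocalIdx < HalfEltsPerLane) {
      DemandedLHS |= 3u << (LaneIdx + 2 * LocalIdx);   // two adjacent LHS elements
    } else {
      LocalIdx -= HalfEltsPerLane;
      DemandedRHS |= 3u << (LaneIdx + 2 * LocalIdx);   // two adjacent RHS elements
    }
  }
  std::printf("LHS=0x%x RHS=0x%x\n", DemandedLHS, DemandedRHS);  // LHS=0xc RHS=0x3
}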
7438
7439/// Calculates the shuffle mask corresponding to the target-specific opcode.
7440/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7441/// operands in \p Ops, and returns true.
7442/// Sets \p IsUnary to true if only one source is used. Note that this will set
7443/// IsUnary for shuffles which use a single input multiple times, and in those
7444/// cases it will adjust the mask to only have indices within that single input.
7445/// It is an error to call this with non-empty Mask/Ops vectors.
7446static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7447 SmallVectorImpl<SDValue> &Ops,
7448 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7449 unsigned NumElems = VT.getVectorNumElements();
7450 unsigned MaskEltSize = VT.getScalarSizeInBits();
7451 SmallVector<uint64_t, 32> RawMask;
7452 APInt RawUndefs;
7453 uint64_t ImmN;
7454
7455 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7456 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7457
7458 IsUnary = false;
7459 bool IsFakeUnary = false;
7460 switch (N->getOpcode()) {
7461 case X86ISD::BLENDI:
7462 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7463 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7464 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7465 DecodeBLENDMask(NumElems, ImmN, Mask);
7466 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7467 break;
7468 case X86ISD::SHUFP:
7469 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7470 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7471 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7472 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7473 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7474 break;
7475 case X86ISD::INSERTPS:
7476 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7477 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7478 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7479 DecodeINSERTPSMask(ImmN, Mask);
7480 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7481 break;
7482 case X86ISD::EXTRQI:
7483 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7484 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7485 isa<ConstantSDNode>(N->getOperand(2))) {
7486 int BitLen = N->getConstantOperandVal(1);
7487 int BitIdx = N->getConstantOperandVal(2);
7488 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7489 IsUnary = true;
7490 }
7491 break;
7492 case X86ISD::INSERTQI:
7493 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7494 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7495 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7496 isa<ConstantSDNode>(N->getOperand(3))) {
7497 int BitLen = N->getConstantOperandVal(2);
7498 int BitIdx = N->getConstantOperandVal(3);
7499 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7500 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7501 }
7502 break;
7503 case X86ISD::UNPCKH:
7504 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7505 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7506 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7507 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7508 break;
7509 case X86ISD::UNPCKL:
7510 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7511 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7512 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7513 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7514 break;
7515 case X86ISD::MOVHLPS:
7516 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7517 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7518 DecodeMOVHLPSMask(NumElems, Mask);
7519 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7520 break;
7521 case X86ISD::MOVLHPS:
7522 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7523 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7524 DecodeMOVLHPSMask(NumElems, Mask);
7525 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7526 break;
7527 case X86ISD::VALIGN:
7528 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7529 "Only 32-bit and 64-bit elements are supported!");
7530 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7531 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7532 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7533 DecodeVALIGNMask(NumElems, ImmN, Mask);
7534 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7535 Ops.push_back(N->getOperand(1));
7536 Ops.push_back(N->getOperand(0));
7537 break;
7538 case X86ISD::PALIGNR:
7539 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7540 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7541 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7542 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7543 DecodePALIGNRMask(NumElems, ImmN, Mask);
7544 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7545 Ops.push_back(N->getOperand(1));
7546 Ops.push_back(N->getOperand(0));
7547 break;
7548 case X86ISD::VSHLDQ:
7549 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7550 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7551 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7552 DecodePSLLDQMask(NumElems, ImmN, Mask);
7553 IsUnary = true;
7554 break;
7555 case X86ISD::VSRLDQ:
7556 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7557 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7558 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7559 DecodePSRLDQMask(NumElems, ImmN, Mask);
7560 IsUnary = true;
7561 break;
7562 case X86ISD::PSHUFD:
7563 case X86ISD::VPERMILPI:
7564 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7565 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7566 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7567 IsUnary = true;
7568 break;
7569 case X86ISD::PSHUFHW:
7570 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7571 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7572 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7573 IsUnary = true;
7574 break;
7575 case X86ISD::PSHUFLW:
7576 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7577 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7578 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7579 IsUnary = true;
7580 break;
7581 case X86ISD::VZEXT_MOVL:
7582 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7583 DecodeZeroMoveLowMask(NumElems, Mask);
7584 IsUnary = true;
7585 break;
7586 case X86ISD::VBROADCAST:
7587 // We only decode broadcasts of same-sized vectors; peeking through to
7588 // extracted subvectors is likely to cause hasOneUse issues with
7589 // SimplifyDemandedBits etc.
7590 if (N->getOperand(0).getValueType() == VT) {
7591 DecodeVectorBroadcast(NumElems, Mask);
7592 IsUnary = true;
7593 break;
7594 }
7595 return false;
7596 case X86ISD::VPERMILPV: {
7597 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7598 IsUnary = true;
7599 SDValue MaskNode = N->getOperand(1);
7600 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7601 RawUndefs)) {
7602 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7603 break;
7604 }
7605 return false;
7606 }
7607 case X86ISD::PSHUFB: {
7608 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7609 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7610 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7611 IsUnary = true;
7612 SDValue MaskNode = N->getOperand(1);
7613 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7614 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7615 break;
7616 }
7617 return false;
7618 }
7619 case X86ISD::VPERMI:
7620 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7621 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7622 DecodeVPERMMask(NumElems, ImmN, Mask);
7623 IsUnary = true;
7624 break;
7625 case X86ISD::MOVSS:
7626 case X86ISD::MOVSD:
7627 case X86ISD::MOVSH:
7628 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7629 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7630 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7631 break;
7632 case X86ISD::VPERM2X128:
7633 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7634 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7635 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7636 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7637 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7638 break;
7639 case X86ISD::SHUF128:
7640 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7641 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7642 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7643 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7644 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7645 break;
7646 case X86ISD::MOVSLDUP:
7647 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7648 DecodeMOVSLDUPMask(NumElems, Mask);
7649 IsUnary = true;
7650 break;
7651 case X86ISD::MOVSHDUP:
7652 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7653 DecodeMOVSHDUPMask(NumElems, Mask);
7654 IsUnary = true;
7655 break;
7656 case X86ISD::MOVDDUP:
7657 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7658 DecodeMOVDDUPMask(NumElems, Mask);
7659 IsUnary = true;
7660 break;
7661 case X86ISD::VPERMIL2: {
7662 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7663 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7664 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7665 SDValue MaskNode = N->getOperand(2);
7666 SDValue CtrlNode = N->getOperand(3);
7667 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7668 unsigned CtrlImm = CtrlOp->getZExtValue();
7669 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7670 RawUndefs)) {
7671 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7672 Mask);
7673 break;
7674 }
7675 }
7676 return false;
7677 }
7678 case X86ISD::VPPERM: {
7679 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7680 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7681 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7682 SDValue MaskNode = N->getOperand(2);
7683 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7684 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7685 break;
7686 }
7687 return false;
7688 }
7689 case X86ISD::VPERMV: {
7690 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7691 IsUnary = true;
7692 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7693 Ops.push_back(N->getOperand(1));
7694 SDValue MaskNode = N->getOperand(0);
7695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7696 RawUndefs)) {
7697 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7698 break;
7699 }
7700 return false;
7701 }
7702 case X86ISD::VPERMV3: {
7703 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7704 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7705 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7706 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7707 Ops.push_back(N->getOperand(0));
7708 Ops.push_back(N->getOperand(2));
7709 SDValue MaskNode = N->getOperand(1);
7710 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7711 RawUndefs)) {
7712 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7713 break;
7714 }
7715 return false;
7716 }
7717 default: llvm_unreachable("unknown target shuffle node");
7718 }
7719
7720 // Empty mask indicates the decode failed.
7721 if (Mask.empty())
7722 return false;
7723
7724 // Check if we're getting a shuffle mask with zero'd elements.
7725 if (!AllowSentinelZero && isAnyZero(Mask))
7726 return false;
7727
7728 // If we have a fake unary shuffle, the shuffle mask is spread across two
7729 // inputs that are actually the same node. Re-map the mask to always point
7730 // into the first input.
7731 if (IsFakeUnary)
7732 for (int &M : Mask)
7733 if (M >= (int)Mask.size())
7734 M -= Mask.size();
7735
7736 // If we didn't already add operands in the opcode-specific code, default to
7737 // adding 1 or 2 operands starting at 0.
7738 if (Ops.empty()) {
7739 Ops.push_back(N->getOperand(0));
7740 if (!IsUnary || IsFakeUnary)
7741 Ops.push_back(N->getOperand(1));
7742 }
7743
7744 return true;
7745}
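Illustration (plain C++, standalone): the IsFakeUnary remap performed at the end of getTargetShuffleMask, folding indices that point at the duplicated second operand back into the first.

// Standalone sketch: an UNPCKL-style mask over two identical operands.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {0, 4, 1, 5};       // 0..3 = first input, 4..7 = second
  const bool IsFakeUnary = true;              // operand 0 == operand 1
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= (int)Mask.size();
  for (int M : Mask)
    std::printf("%d ", M);                    // prints: 0 0 1 1
  std::printf("\n");
}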
7746
7747 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
7748static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7749 SmallVectorImpl<SDValue> &Ops,
7750 SmallVectorImpl<int> &Mask) {
7751 bool IsUnary;
7752 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7753}
7754
7755/// Compute whether each element of a shuffle is zeroable.
7756///
7757/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7758/// Either it is an undef element in the shuffle mask, the element of the input
7759/// referenced is undef, or the element of the input referenced is known to be
7760/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7761/// as many lanes with this technique as possible to simplify the remaining
7762/// shuffle.
7763static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7764 SDValue V1, SDValue V2,
7765 APInt &KnownUndef, APInt &KnownZero) {
7766 int Size = Mask.size();
7767 KnownUndef = KnownZero = APInt::getZero(Size);
7768
7769 V1 = peekThroughBitcasts(V1);
7770 V2 = peekThroughBitcasts(V2);
7771
7772 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7773 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7774
7775 int VectorSizeInBits = V1.getValueSizeInBits();
7776 int ScalarSizeInBits = VectorSizeInBits / Size;
7777 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7778
7779 for (int i = 0; i < Size; ++i) {
7780 int M = Mask[i];
7781 // Handle the easy cases.
7782 if (M < 0) {
7783 KnownUndef.setBit(i);
7784 continue;
7785 }
7786 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7787 KnownZero.setBit(i);
7788 continue;
7789 }
7790
7791 // Determine shuffle input and normalize the mask.
7792 SDValue V = M < Size ? V1 : V2;
7793 M %= Size;
7794
7795 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7796 if (V.getOpcode() != ISD::BUILD_VECTOR)
7797 continue;
7798
7799 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
7800 // the (larger) source element must be UNDEF/ZERO.
7801 if ((Size % V.getNumOperands()) == 0) {
7802 int Scale = Size / V->getNumOperands();
7803 SDValue Op = V.getOperand(M / Scale);
7804 if (Op.isUndef())
7805 KnownUndef.setBit(i);
7806 if (X86::isZeroNode(Op))
7807 KnownZero.setBit(i);
7808 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7809 APInt Val = Cst->getAPIntValue();
7810 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7811 if (Val == 0)
7812 KnownZero.setBit(i);
7813 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7814 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7815 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7816 if (Val == 0)
7817 KnownZero.setBit(i);
7818 }
7819 continue;
7820 }
7821
7822 // If the BUILD_VECTOR has more elements, then all the (smaller) source
7823 // elements must be UNDEF or ZERO.
7824 if ((V.getNumOperands() % Size) == 0) {
7825 int Scale = V->getNumOperands() / Size;
7826 bool AllUndef = true;
7827 bool AllZero = true;
7828 for (int j = 0; j < Scale; ++j) {
7829 SDValue Op = V.getOperand((M * Scale) + j);
7830 AllUndef &= Op.isUndef();
7831 AllZero &= X86::isZeroNode(Op);
7832 }
7833 if (AllUndef)
7834 KnownUndef.setBit(i);
7835 if (AllZero)
7836 KnownZero.setBit(i);
7837 continue;
7838 }
7839 }
7840}
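Illustration (plain C++; the mask is made up): the easy cases of computeZeroableShuffleElements, where undef mask entries become KnownUndef and entries reading an all-zeros operand become KnownZero.

// Standalone sketch using 8-bit masks in place of APInt.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;
  std::vector<int> Mask = {-1, 0, 5, 2};  // -1 = undef, 0..3 = V1, 4..7 = V2
  const bool V1IsZero = false, V2IsZero = true;

  uint8_t KnownUndef = 0, KnownZero = 0;
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      KnownUndef |= 1u << i;
    else if ((M < Size && V1IsZero) || (M >= Size && V2IsZero))
      KnownZero |= 1u << i;
  }
  std::printf("Undef=0x%x Zero=0x%x\n", KnownUndef, KnownZero);  // Undef=0x1 Zero=0x4
}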
7841
7842/// Decode a target shuffle mask and inputs and see if any values are
7843/// known to be undef or zero from their inputs.
7844/// Returns true if the target shuffle mask was decoded.
7845/// FIXME: Merge this with computeZeroableShuffleElements?
7846static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7847 SmallVectorImpl<SDValue> &Ops,
7848 APInt &KnownUndef, APInt &KnownZero) {
7849 bool IsUnary;
7850 if (!isTargetShuffle(N.getOpcode()))
7851 return false;
7852
7853 MVT VT = N.getSimpleValueType();
7854 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7855 return false;
7856
7857 int Size = Mask.size();
7858 SDValue V1 = Ops[0];
7859 SDValue V2 = IsUnary ? V1 : Ops[1];
7860 KnownUndef = KnownZero = APInt::getZero(Size);
7861
7862 V1 = peekThroughBitcasts(V1);
7863 V2 = peekThroughBitcasts(V2);
7864
7865 assert((VT.getSizeInBits() % Size) == 0 &&
7866 "Illegal split of shuffle value type");
7867 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7868
7869 // Extract known constant input data.
7870 APInt UndefSrcElts[2];
7871 SmallVector<APInt, 32> SrcEltBits[2];
7872 bool IsSrcConstant[2] = {
7873 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7874 SrcEltBits[0], true, false),
7875 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7876 SrcEltBits[1], true, false)};
7877
7878 for (int i = 0; i < Size; ++i) {
7879 int M = Mask[i];
7880
7881 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7882 if (M < 0) {
7883 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7884 if (SM_SentinelUndef == M)
7885 KnownUndef.setBit(i);
7886 if (SM_SentinelZero == M)
7887 KnownZero.setBit(i);
7888 continue;
7889 }
7890
7891 // Determine shuffle input and normalize the mask.
7892 unsigned SrcIdx = M / Size;
7893 SDValue V = M < Size ? V1 : V2;
7894 M %= Size;
7895
7896 // We are referencing an UNDEF input.
7897 if (V.isUndef()) {
7898 KnownUndef.setBit(i);
7899 continue;
7900 }
7901
7902 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7903 // TODO: We currently only set UNDEF for integer types - floats use the same
7904 // registers as vectors and many of the scalar folded loads rely on the
7905 // SCALAR_TO_VECTOR pattern.
7906 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7907 (Size % V.getValueType().getVectorNumElements()) == 0) {
7908 int Scale = Size / V.getValueType().getVectorNumElements();
7909 int Idx = M / Scale;
7910 if (Idx != 0 && !VT.isFloatingPoint())
7911 KnownUndef.setBit(i);
7912 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7913 KnownZero.setBit(i);
7914 continue;
7915 }
7916
7917 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7918 // base vectors.
7919 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7920 SDValue Vec = V.getOperand(0);
7921 int NumVecElts = Vec.getValueType().getVectorNumElements();
7922 if (Vec.isUndef() && Size == NumVecElts) {
7923 int Idx = V.getConstantOperandVal(2);
7924 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7925 if (M < Idx || (Idx + NumSubElts) <= M)
7926 KnownUndef.setBit(i);
7927 }
7928 continue;
7929 }
7930
7931 // Attempt to extract from the source's constant bits.
7932 if (IsSrcConstant[SrcIdx]) {
7933 if (UndefSrcElts[SrcIdx][M])
7934 KnownUndef.setBit(i);
7935 else if (SrcEltBits[SrcIdx][M] == 0)
7936 KnownZero.setBit(i);
7937 }
7938 }
7939
7940 assert(VT.getVectorNumElements() == (unsigned)Size &&
7941 "Different mask size from vector size!");
7942 return true;
7943}
7944
7945// Replace target shuffle mask elements with known undef/zero sentinels.
7946static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7947 const APInt &KnownUndef,
7948 const APInt &KnownZero,
7949 bool ResolveKnownZeros= true) {
7950 unsigned NumElts = Mask.size();
7951 assert(KnownUndef.getBitWidth() == NumElts &&
7952 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7953
7954 for (unsigned i = 0; i != NumElts; ++i) {
7955 if (KnownUndef[i])
7956 Mask[i] = SM_SentinelUndef;
7957 else if (ResolveKnownZeros && KnownZero[i])
7958 Mask[i] = SM_SentinelZero;
7959 }
7960}
7961
7962// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7963static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7964 APInt &KnownUndef,
7965 APInt &KnownZero) {
7966 unsigned NumElts = Mask.size();
7967 KnownUndef = KnownZero = APInt::getZero(NumElts);
7968
7969 for (unsigned i = 0; i != NumElts; ++i) {
7970 int M = Mask[i];
7971 if (SM_SentinelUndef == M)
7972 KnownUndef.setBit(i);
7973 if (SM_SentinelZero == M)
7974 KnownZero.setBit(i);
7975 }
7976}
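
The two helpers above are inverses of each other: one folds known-undef/known-zero bits back into the mask as sentinel values, the other extracts the sentinels into bitmasks. As a rough illustration only, here is a standalone sketch (not part of the analyzed file) using plain std::vector and a 64-bit bitmask in place of SmallVectorImpl/APInt; the sentinel values -1 and -2 are assumed to correspond to SM_SentinelUndef and SM_SentinelZero.

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr int SentinelUndef = -1; // assumed stand-in for SM_SentinelUndef
constexpr int SentinelZero  = -2; // assumed stand-in for SM_SentinelZero

// Mirrors resolveZeroablesFromTargetShuffle: sentinels -> known bitmasks.
void sentinelsToBitmasks(const std::vector<int> &Mask, uint64_t &KnownUndef,
                         uint64_t &KnownZero) {
  KnownUndef = KnownZero = 0;
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SentinelUndef) KnownUndef |= (1ull << i);
    if (Mask[i] == SentinelZero)  KnownZero  |= (1ull << i);
  }
}

// Mirrors resolveTargetShuffleFromZeroables: known bitmasks -> sentinels.
void bitmasksToSentinels(std::vector<int> &Mask, uint64_t KnownUndef,
                         uint64_t KnownZero, bool ResolveKnownZeros = true) {
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (KnownUndef & (1ull << i))
      Mask[i] = SentinelUndef;
    else if (ResolveKnownZeros && (KnownZero & (1ull << i)))
      Mask[i] = SentinelZero;
  }
}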
7977
7978// Forward declaration (for getFauxShuffleMask recursive check).
7979static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7980 SmallVectorImpl<int> &Mask,
7981 const SelectionDAG &DAG, unsigned Depth,
7982 bool ResolveKnownElts);
7983
7984// Attempt to decode ops that could be represented as a shuffle mask.
7985// The decoded shuffle mask may contain a different number of elements to the
7986// destination value type.
7987// TODO: Merge into getTargetShuffleInputs()
7988static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7989 SmallVectorImpl<int> &Mask,
7990 SmallVectorImpl<SDValue> &Ops,
7991 const SelectionDAG &DAG, unsigned Depth,
7992 bool ResolveKnownElts) {
7993 Mask.clear();
7994 Ops.clear();
7995
7996 MVT VT = N.getSimpleValueType();
7997 unsigned NumElts = VT.getVectorNumElements();
7998 unsigned NumSizeInBits = VT.getSizeInBits();
7999 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8000 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8001 return false;
8002 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8003 unsigned NumSizeInBytes = NumSizeInBits / 8;
8004 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8005
8006 unsigned Opcode = N.getOpcode();
8007 switch (Opcode) {
8008 case ISD::VECTOR_SHUFFLE: {
8009 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8010 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8011 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8012 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8013 Ops.push_back(N.getOperand(0));
8014 Ops.push_back(N.getOperand(1));
8015 return true;
8016 }
8017 return false;
8018 }
8019 case ISD::AND:
8020 case X86ISD::ANDNP: {
8021 // Attempt to decode as a per-byte mask.
8022 APInt UndefElts;
8023 SmallVector<APInt, 32> EltBits;
8024 SDValue N0 = N.getOperand(0);
8025 SDValue N1 = N.getOperand(1);
8026 bool IsAndN = (X86ISD::ANDNP == Opcode);
8027 uint64_t ZeroMask = IsAndN ? 255 : 0;
8028 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8029 return false;
8030 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8031 if (UndefElts[i]) {
8032 Mask.push_back(SM_SentinelUndef);
8033 continue;
8034 }
8035 const APInt &ByteBits = EltBits[i];
8036 if (ByteBits != 0 && ByteBits != 255)
8037 return false;
8038 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8039 }
8040 Ops.push_back(IsAndN ? N1 : N0);
8041 return true;
8042 }
8043 case ISD::OR: {
8044 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8045 // is a valid shuffle index.
8046 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8047 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8048 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8049 return false;
8050 SmallVector<int, 64> SrcMask0, SrcMask1;
8051 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8052 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
8053 true) ||
8054 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
8055 true))
8056 return false;
8057
8058 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8059 SmallVector<int, 64> Mask0, Mask1;
8060 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8061 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8062 for (int i = 0; i != (int)MaskSize; ++i) {
8063 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8064 // loops converting between OR and BLEND shuffles due to
8065 // canWidenShuffleElements merging away undef elements, meaning we
8066 // fail to recognise the OR as the undef element isn't known zero.
8067 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8068 Mask.push_back(SM_SentinelZero);
8069 else if (Mask1[i] == SM_SentinelZero)
8070 Mask.push_back(i);
8071 else if (Mask0[i] == SM_SentinelZero)
8072 Mask.push_back(i + MaskSize);
8073 else
8074 return false;
8075 }
8076 Ops.push_back(N0);
8077 Ops.push_back(N1);
8078 return true;
8079 }
8080 case ISD::INSERT_SUBVECTOR: {
8081 SDValue Src = N.getOperand(0);
8082 SDValue Sub = N.getOperand(1);
8083 EVT SubVT = Sub.getValueType();
8084 unsigned NumSubElts = SubVT.getVectorNumElements();
8085 if (!N->isOnlyUserOf(Sub.getNode()))
8086 return false;
8087 uint64_t InsertIdx = N.getConstantOperandVal(2);
8088 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8089 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8090 Sub.getOperand(0).getValueType() == VT) {
8091 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8092 for (int i = 0; i != (int)NumElts; ++i)
8093 Mask.push_back(i);
8094 for (int i = 0; i != (int)NumSubElts; ++i)
8095 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8096 Ops.push_back(Src);
8097 Ops.push_back(Sub.getOperand(0));
8098 return true;
8099 }
8100 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8101 SmallVector<int, 64> SubMask;
8102 SmallVector<SDValue, 2> SubInputs;
8103 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
8104 SubMask, DAG, Depth + 1, ResolveKnownElts))
8105 return false;
8106
8107 // Subvector shuffle inputs must not be larger than the subvector.
8108 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8109 return SubVT.getFixedSizeInBits() <
8110 SubInput.getValueSizeInBits().getFixedSize();
8111 }))
8112 return false;
8113
8114 if (SubMask.size() != NumSubElts) {
8115 assert(((SubMask.size() % NumSubElts) == 0 ||
8116 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8117 if ((NumSubElts % SubMask.size()) == 0) {
8118 int Scale = NumSubElts / SubMask.size();
8119 SmallVector<int,64> ScaledSubMask;
8120 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8121 SubMask = ScaledSubMask;
8122 } else {
8123 int Scale = SubMask.size() / NumSubElts;
8124 NumSubElts = SubMask.size();
8125 NumElts *= Scale;
8126 InsertIdx *= Scale;
8127 }
8128 }
8129 Ops.push_back(Src);
8130 Ops.append(SubInputs.begin(), SubInputs.end());
8131 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8132 Mask.append(NumElts, SM_SentinelZero);
8133 else
8134 for (int i = 0; i != (int)NumElts; ++i)
8135 Mask.push_back(i);
8136 for (int i = 0; i != (int)NumSubElts; ++i) {
8137 int M = SubMask[i];
8138 if (0 <= M) {
8139 int InputIdx = M / NumSubElts;
8140 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8141 }
8142 Mask[i + InsertIdx] = M;
8143 }
8144 return true;
8145 }
8146 case X86ISD::PINSRB:
8147 case X86ISD::PINSRW:
8148 case ISD::SCALAR_TO_VECTOR:
8149 case ISD::INSERT_VECTOR_ELT: {
8150 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8151 // vector, for matching src/dst vector types.
8152 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8153
8154 unsigned DstIdx = 0;
8155 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8156 // Check we have an in-range constant insertion index.
8157 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8158 N.getConstantOperandAPInt(2).uge(NumElts))
8159 return false;
8160 DstIdx = N.getConstantOperandVal(2);
8161
8162 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8163 if (X86::isZeroNode(Scl)) {
8164 Ops.push_back(N.getOperand(0));
8165 for (unsigned i = 0; i != NumElts; ++i)
8166 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8167 return true;
8168 }
8169 }
8170
8171 // Peek through trunc/aext/zext.
8172 // TODO: aext shouldn't require SM_SentinelZero padding.
8173 // TODO: handle shift of scalars.
8174 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8175 while (Scl.getOpcode() == ISD::TRUNCATE ||
8176 Scl.getOpcode() == ISD::ANY_EXTEND ||
8177 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8178 Scl = Scl.getOperand(0);
8179 MinBitsPerElt =
8180 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8181 }
8182 if ((MinBitsPerElt % 8) != 0)
8183 return false;
8184
8185 // Attempt to find the source vector the scalar was extracted from.
8186 SDValue SrcExtract;
8187 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8188 Scl.getOpcode() == X86ISD::PEXTRW ||
8189 Scl.getOpcode() == X86ISD::PEXTRB) &&
8190 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8191 SrcExtract = Scl;
8192 }
8193 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8194 return false;
8195
8196 SDValue SrcVec = SrcExtract.getOperand(0);
8197 EVT SrcVT = SrcVec.getValueType();
8198 if (!SrcVT.getScalarType().isByteSized())
8199 return false;
8200 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8201 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8202 unsigned DstByte = DstIdx * NumBytesPerElt;
8203 MinBitsPerElt =
8204 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8205
8206 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8207 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8208 Ops.push_back(SrcVec);
8209 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8210 } else {
8211 Ops.push_back(SrcVec);
8212 Ops.push_back(N.getOperand(0));
8213 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8214 Mask.push_back(NumSizeInBytes + i);
8215 }
8216
8217 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8218 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8219 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8220 Mask[DstByte + i] = SrcByte + i;
8221 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8222 Mask[DstByte + i] = SM_SentinelZero;
8223 return true;
8224 }
8225 case X86ISD::PACKSS:
8226 case X86ISD::PACKUS: {
8227 SDValue N0 = N.getOperand(0);
8228 SDValue N1 = N.getOperand(1);
8229 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8230 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8231 "Unexpected input value type");
8232
8233 APInt EltsLHS, EltsRHS;
8234 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8235
8236 // If we know input saturation won't happen (or we don't care for particular
8237 // lanes), we can treat this as a truncation shuffle.
8238 bool Offset0 = false, Offset1 = false;
8239 if (Opcode == X86ISD::PACKSS) {
8240 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8241 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8242 (!(N1.isUndef() || EltsRHS.isZero()) &&
8243 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8244 return false;
8245 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8246 // PACKSS then it was likely being used for sign-extension for a
8247 // truncation, so just peek through and adjust the mask accordingly.
8248 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8249 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8250 Offset0 = true;
8251 N0 = N0.getOperand(0);
8252 }
8253 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8254 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8255 Offset1 = true;
8256 N1 = N1.getOperand(0);
8257 }
8258 } else {
8259 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8260 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8261 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8262 (!(N1.isUndef() || EltsRHS.isZero()) &&
8263 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8264 return false;
8265 }
8266
8267 bool IsUnary = (N0 == N1);
8268
8269 Ops.push_back(N0);
8270 if (!IsUnary)
8271 Ops.push_back(N1);
8272
8273 createPackShuffleMask(VT, Mask, IsUnary);
8274
8275 if (Offset0 || Offset1) {
8276 for (int &M : Mask)
8277 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8278 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8279 ++M;
8280 }
8281 return true;
8282 }
8283 case X86ISD::VTRUNC: {
8284 SDValue Src = N.getOperand(0);
8285 EVT SrcVT = Src.getValueType();
8286 // Truncated source must be a simple vector.
8287 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8288 (SrcVT.getScalarSizeInBits() % 8) != 0)
8289 return false;
8290 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8291 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8292 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8293 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8294 for (unsigned i = 0; i != NumSrcElts; ++i)
8295 Mask.push_back(i * Scale);
8296 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8297 Ops.push_back(Src);
8298 return true;
8299 }
8300 case X86ISD::VSHLI:
8301 case X86ISD::VSRLI: {
8302 uint64_t ShiftVal = N.getConstantOperandVal(1);
8303 // Out of range bit shifts are guaranteed to be zero.
8304 if (NumBitsPerElt <= ShiftVal) {
8305 Mask.append(NumElts, SM_SentinelZero);
8306 return true;
8307 }
8308
8309 // We can only decode 'whole byte' bit shifts as shuffles.
8310 if ((ShiftVal % 8) != 0)
8311 break;
8312
8313 uint64_t ByteShift = ShiftVal / 8;
8314 Ops.push_back(N.getOperand(0));
8315
8316 // Clear mask to all zeros and insert the shifted byte indices.
8317 Mask.append(NumSizeInBytes, SM_SentinelZero);
8318
8319 if (X86ISD::VSHLI == Opcode) {
8320 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8321 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8322 Mask[i + j] = i + j - ByteShift;
8323 } else {
8324 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8325 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8326 Mask[i + j - ByteShift] = i + j;
8327 }
8328 return true;
8329 }
8330 case X86ISD::VROTLI:
8331 case X86ISD::VROTRI: {
8332 // We can only decode 'whole byte' bit rotates as shuffles.
8333 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8334 if ((RotateVal % 8) != 0)
8335 return false;
8336 Ops.push_back(N.getOperand(0));
8337 int Offset = RotateVal / 8;
8338 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8339 for (int i = 0; i != (int)NumElts; ++i) {
8340 int BaseIdx = i * NumBytesPerElt;
8341 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8342 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8343 }
8344 }
8345 return true;
8346 }
8347 case X86ISD::VBROADCAST: {
8348 SDValue Src = N.getOperand(0);
8349 if (!Src.getSimpleValueType().isVector()) {
8350 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8351 !isNullConstant(Src.getOperand(1)) ||
8352 Src.getOperand(0).getValueType().getScalarType() !=
8353 VT.getScalarType())
8354 return false;
8355 Src = Src.getOperand(0);
8356 }
8357 Ops.push_back(Src);
8358 Mask.append(NumElts, 0);
8359 return true;
8360 }
8361 case ISD::ZERO_EXTEND:
8362 case ISD::ANY_EXTEND:
8363 case ISD::ZERO_EXTEND_VECTOR_INREG:
8364 case ISD::ANY_EXTEND_VECTOR_INREG: {
8365 SDValue Src = N.getOperand(0);
8366 EVT SrcVT = Src.getValueType();
8367
8368 // Extended source must be a simple vector.
8369 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8370 (SrcVT.getScalarSizeInBits() % 8) != 0)
8371 return false;
8372
8373 bool IsAnyExtend =
8374 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8375 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8376 IsAnyExtend, Mask);
8377 Ops.push_back(Src);
8378 return true;
8379 }
8380 }
8381
8382 return false;
8383}
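
getFauxShuffleMask decodes each supported node kind into a byte- or element-level mask; the X86ISD::VSHLI/VSRLI case above is the easiest one to picture, since shifted-in bytes simply become zero sentinels within each element lane. As a rough illustration only, here is a standalone sketch (not part of the analyzed file) that builds the same per-lane byte mask with plain std::vector; -2 stands in for SM_SentinelZero (assumed).

#include <vector>

// Build the byte-level mask for a whole-byte per-element shift, mirroring the
// VSHLI/VSRLI loops above. ByteShift is the shift amount in bytes.
std::vector<int> byteShiftMask(unsigned NumSizeInBytes, unsigned NumBytesPerElt,
                               unsigned ByteShift, bool IsLeftShift) {
  std::vector<int> Mask(NumSizeInBytes, -2); // -2 ~ SM_SentinelZero (assumed)
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) {
      if (IsLeftShift)
        Mask[i + j] = i + j - ByteShift;   // bytes move up within the lane
      else
        Mask[i + j - ByteShift] = i + j;   // bytes move down within the lane
    }
  return Mask;
}

// e.g. byteShiftMask(16, 4, 1, true) leaves one zeroed byte at the bottom of
// each 4-byte lane: { -2,0,1,2, -2,4,5,6, -2,8,9,10, -2,12,13,14 }.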
8384
8385/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8386static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8387 SmallVectorImpl<int> &Mask) {
8388 int MaskWidth = Mask.size();
8389 SmallVector<SDValue, 16> UsedInputs;
8390 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8391 int lo = UsedInputs.size() * MaskWidth;
8392 int hi = lo + MaskWidth;
8393
8394 // Strip UNDEF input usage.
8395 if (Inputs[i].isUndef())
8396 for (int &M : Mask)
8397 if ((lo <= M) && (M < hi))
8398 M = SM_SentinelUndef;
8399
8400 // Check for unused inputs.
8401 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8402 for (int &M : Mask)
8403 if (lo <= M)
8404 M -= MaskWidth;
8405 continue;
8406 }
8407
8408 // Check for repeated inputs.
8409 bool IsRepeat = false;
8410 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8411 if (UsedInputs[j] != Inputs[i])
8412 continue;
8413 for (int &M : Mask)
8414 if (lo <= M)
8415 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8416 IsRepeat = true;
8417 break;
8418 }
8419 if (IsRepeat)
8420 continue;
8421
8422 UsedInputs.push_back(Inputs[i]);
8423 }
8424 Inputs = UsedInputs;
8425}
8426
8427/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8428/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8429/// Returns true if the target shuffle mask was decoded.
8430static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8431 SmallVectorImpl<SDValue> &Inputs,
8432 SmallVectorImpl<int> &Mask,
8433 APInt &KnownUndef, APInt &KnownZero,
8434 const SelectionDAG &DAG, unsigned Depth,
8435 bool ResolveKnownElts) {
8436 if (Depth >= SelectionDAG::MaxRecursionDepth)
8437 return false; // Limit search depth.
8438
8439 EVT VT = Op.getValueType();
8440 if (!VT.isSimple() || !VT.isVector())
8441 return false;
8442
8443 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8444 if (ResolveKnownElts)
8445 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8446 return true;
8447 }
8448 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8449 ResolveKnownElts)) {
8450 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8451 return true;
8452 }
8453 return false;
8454}
8455
8456static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8457 SmallVectorImpl<int> &Mask,
8458 const SelectionDAG &DAG, unsigned Depth = 0,
8459 bool ResolveKnownElts = true) {
8460 EVT VT = Op.getValueType();
8461 if (!VT.isSimple() || !VT.isVector())
8462 return false;
8463
8464 APInt KnownUndef, KnownZero;
8465 unsigned NumElts = Op.getValueType().getVectorNumElements();
8466 APInt DemandedElts = APInt::getAllOnes(NumElts);
8467 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8468 KnownZero, DAG, Depth, ResolveKnownElts);
8469}
8470
8471// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8472static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8473 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8474 SelectionDAG &DAG) {
8475 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8476 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8477 "Unknown broadcast load type");
8478
8479 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8480 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8481 return SDValue();
8482
8483 SDValue Ptr =
8484 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8485 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8486 SDValue Ops[] = {Mem->getChain(), Ptr};
8487 SDValue BcstLd = DAG.getMemIntrinsicNode(
8488 Opcode, DL, Tys, Ops, MemVT,
8489 DAG.getMachineFunction().getMachineMemOperand(
8490 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8491 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8492 return BcstLd;
8493}
8494
8495/// Returns the scalar element that will make up the i'th
8496/// element of the result of the vector shuffle.
8497static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8498 SelectionDAG &DAG, unsigned Depth) {
8499 if (Depth >= SelectionDAG::MaxRecursionDepth)
8500 return SDValue(); // Limit search depth.
8501
8502 EVT VT = Op.getValueType();
8503 unsigned Opcode = Op.getOpcode();
8504 unsigned NumElems = VT.getVectorNumElements();
8505
8506 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8507 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8508 int Elt = SV->getMaskElt(Index);
8509
8510 if (Elt < 0)
8511 return DAG.getUNDEF(VT.getVectorElementType());
8512
8513 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8514 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8515 }
8516
8517 // Recurse into target specific vector shuffles to find scalars.
8518 if (isTargetShuffle(Opcode)) {
8519 MVT ShufVT = VT.getSimpleVT();
8520 MVT ShufSVT = ShufVT.getVectorElementType();
8521 int NumElems = (int)ShufVT.getVectorNumElements();
8522 SmallVector<int, 16> ShuffleMask;
8523 SmallVector<SDValue, 16> ShuffleOps;
8524 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8525 ShuffleMask))
8526 return SDValue();
8527
8528 int Elt = ShuffleMask[Index];
8529 if (Elt == SM_SentinelZero)
8530 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8531 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8532 if (Elt == SM_SentinelUndef)
8533 return DAG.getUNDEF(ShufSVT);
8534
8535 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8536 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8537 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8538 }
8539
8540 // Recurse into insert_subvector base/sub vector to find scalars.
8541 if (Opcode == ISD::INSERT_SUBVECTOR) {
8542 SDValue Vec = Op.getOperand(0);
8543 SDValue Sub = Op.getOperand(1);
8544 uint64_t SubIdx = Op.getConstantOperandVal(2);
8545 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8546
8547 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8548 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8549 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8550 }
8551
8552 // Recurse into concat_vectors sub vector to find scalars.
8553 if (Opcode == ISD::CONCAT_VECTORS) {
8554 EVT SubVT = Op.getOperand(0).getValueType();
8555 unsigned NumSubElts = SubVT.getVectorNumElements();
8556 uint64_t SubIdx = Index / NumSubElts;
8557 uint64_t SubElt = Index % NumSubElts;
8558 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8559 }
8560
8561 // Recurse into extract_subvector src vector to find scalars.
8562 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8563 SDValue Src = Op.getOperand(0);
8564 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8565 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8566 }
8567
8568 // We only peek through bitcasts of the same vector width.
8569 if (Opcode == ISD::BITCAST) {
8570 SDValue Src = Op.getOperand(0);
8571 EVT SrcVT = Src.getValueType();
8572 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8573 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8574 return SDValue();
8575 }
8576
8577 // Actual nodes that may contain scalar elements
8578
8579 // For insert_vector_elt - either return the index matching scalar or recurse
8580 // into the base vector.
8581 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8582 isa<ConstantSDNode>(Op.getOperand(2))) {
8583 if (Op.getConstantOperandAPInt(2) == Index)
8584 return Op.getOperand(1);
8585 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8586 }
8587
8588 if (Opcode == ISD::SCALAR_TO_VECTOR)
8589 return (Index == 0) ? Op.getOperand(0)
8590 : DAG.getUNDEF(VT.getVectorElementType());
8591
8592 if (Opcode == ISD::BUILD_VECTOR)
8593 return Op.getOperand(Index);
8594
8595 return SDValue();
8596}
8597
8598// Use PINSRB/PINSRW/PINSRD to create a build vector.
8599static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8600 unsigned NumNonZero, unsigned NumZero,
8601 SelectionDAG &DAG,
8602 const X86Subtarget &Subtarget) {
8603 MVT VT = Op.getSimpleValueType();
8604 unsigned NumElts = VT.getVectorNumElements();
8605 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8606 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8607 "Illegal vector insertion");
8608
8609 SDLoc dl(Op);
8610 SDValue V;
8611 bool First = true;
8612
8613 for (unsigned i = 0; i < NumElts; ++i) {
8614 bool IsNonZero = NonZeroMask[i];
8615 if (!IsNonZero)
8616 continue;
8617
8618 // If the build vector contains zeros or our first insertion is not the
8619 // first index then insert into zero vector to break any register
8620 // dependency else use SCALAR_TO_VECTOR.
8621 if (First) {
8622 First = false;
8623 if (NumZero || 0 != i)
8624 V = getZeroVector(VT, Subtarget, DAG, dl);
8625 else {
8626 assert(0 == i && "Expected insertion into zero-index");
8627 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8628 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8629 V = DAG.getBitcast(VT, V);
8630 continue;
8631 }
8632 }
8633 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8634 DAG.getIntPtrConstant(i, dl));
8635 }
8636
8637 return V;
8638}
8639
8640/// Custom lower build_vector of v16i8.
8641static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8642 unsigned NumNonZero, unsigned NumZero,
8643 SelectionDAG &DAG,
8644 const X86Subtarget &Subtarget) {
8645 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8646 return SDValue();
8647
8648 // SSE4.1 - use PINSRB to insert each byte directly.
8649 if (Subtarget.hasSSE41())
8650 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8651 Subtarget);
8652
8653 SDLoc dl(Op);
8654 SDValue V;
8655
8656 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8657 for (unsigned i = 0; i < 16; i += 2) {
8658 bool ThisIsNonZero = NonZeroMask[i];
8659 bool NextIsNonZero = NonZeroMask[i + 1];
8660 if (!ThisIsNonZero && !NextIsNonZero)
8661 continue;
8662
8663 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
8664 SDValue Elt;
8665 if (ThisIsNonZero) {
8666 if (NumZero || NextIsNonZero)
8667 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8668 else
8669 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8670 }
8671
8672 if (NextIsNonZero) {
8673 SDValue NextElt = Op.getOperand(i + 1);
8674 if (i == 0 && NumZero)
8675 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8676 else
8677 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8678 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8679 DAG.getConstant(8, dl, MVT::i8));
8680 if (ThisIsNonZero)
8681 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8682 else
8683 Elt = NextElt;
8684 }
8685
8686 // If our first insertion is not the first index or zeros are needed, then
8687 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8688 // elements undefined).
8689 if (!V) {
8690 if (i != 0 || NumZero)
8691 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8692 else {
8693 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8694 V = DAG.getBitcast(MVT::v8i16, V);
8695 continue;
8696 }
8697 }
8698 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8699 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8700 DAG.getIntPtrConstant(i / 2, dl));
8701 }
8702
8703 return DAG.getBitcast(MVT::v16i8, V);
8704}
8705
8706/// Custom lower build_vector of v8i16.
8707static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8708 unsigned NumNonZero, unsigned NumZero,
8709 SelectionDAG &DAG,
8710 const X86Subtarget &Subtarget) {
8711 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8712 return SDValue();
8713
8714 // Use PINSRW to insert each byte directly.
8715 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8716 Subtarget);
8717}
8718
8719/// Custom lower build_vector of v4i32 or v4f32.
8720static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8721 const X86Subtarget &Subtarget) {
8722 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8723 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8724 // Because we're creating a less complicated build vector here, we may enable
8725 // further folding of the MOVDDUP via shuffle transforms.
8726 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8727 Op.getOperand(0) == Op.getOperand(2) &&
8728 Op.getOperand(1) == Op.getOperand(3) &&
8729 Op.getOperand(0) != Op.getOperand(1)) {
8730 SDLoc DL(Op);
8731 MVT VT = Op.getSimpleValueType();
8732 MVT EltVT = VT.getVectorElementType();
8733 // Create a new build vector with the first 2 elements followed by undef
8734 // padding, bitcast to v2f64, duplicate, and bitcast back.
8735 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8736 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8737 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8738 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8739 return DAG.getBitcast(VT, Dup);
8740 }
8741
8742 // Find all zeroable elements.
8743 std::bitset<4> Zeroable, Undefs;
8744 for (int i = 0; i < 4; ++i) {
8745 SDValue Elt = Op.getOperand(i);
8746 Undefs[i] = Elt.isUndef();
8747 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8748 }
8749 assert(Zeroable.size() - Zeroable.count() > 1 &&
8750 "We expect at least two non-zero elements!");
8751
8752 // We only know how to deal with build_vector nodes where elements are either
8753 // zeroable or extract_vector_elt with constant index.
8754 SDValue FirstNonZero;
8755 unsigned FirstNonZeroIdx;
8756 for (unsigned i = 0; i < 4; ++i) {
8757 if (Zeroable[i])
8758 continue;
8759 SDValue Elt = Op.getOperand(i);
8760 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8761 !isa<ConstantSDNode>(Elt.getOperand(1)))
8762 return SDValue();
8763 // Make sure that this node is extracting from a 128-bit vector.
8764 MVT VT = Elt.getOperand(0).getSimpleValueType();
8765 if (!VT.is128BitVector())
8766 return SDValue();
8767 if (!FirstNonZero.getNode()) {
8768 FirstNonZero = Elt;
8769 FirstNonZeroIdx = i;
8770 }
8771 }
8772
8773 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8774 SDValue V1 = FirstNonZero.getOperand(0);
8775 MVT VT = V1.getSimpleValueType();
8776
8777 // See if this build_vector can be lowered as a blend with zero.
8778 SDValue Elt;
8779 unsigned EltMaskIdx, EltIdx;
8780 int Mask[4];
8781 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8782 if (Zeroable[EltIdx]) {
8783 // The zero vector will be on the right hand side.
8784 Mask[EltIdx] = EltIdx+4;
8785 continue;
8786 }
8787
8788 Elt = Op->getOperand(EltIdx);
8789 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
8790 EltMaskIdx = Elt.getConstantOperandVal(1);
8791 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8792 break;
8793 Mask[EltIdx] = EltIdx;
8794 }
8795
8796 if (EltIdx == 4) {
8797 // Let the shuffle legalizer deal with blend operations.
8798 SDValue VZeroOrUndef = (Zeroable == Undefs)
8799 ? DAG.getUNDEF(VT)
8800 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8801 if (V1.getSimpleValueType() != VT)
8802 V1 = DAG.getBitcast(VT, V1);
8803 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8804 }
8805
8806 // See if we can lower this build_vector to an INSERTPS.
8807 if (!Subtarget.hasSSE41())
8808 return SDValue();
8809
8810 SDValue V2 = Elt.getOperand(0);
8811 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8812 V1 = SDValue();
8813
8814 bool CanFold = true;
8815 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8816 if (Zeroable[i])
8817 continue;
8818
8819 SDValue Current = Op->getOperand(i);
8820 SDValue SrcVector = Current->getOperand(0);
8821 if (!V1.getNode())
8822 V1 = SrcVector;
8823 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8824 }
8825
8826 if (!CanFold)
8827 return SDValue();
8828
8829 assert(V1.getNode() && "Expected at least two non-zero elements!");
8830 if (V1.getSimpleValueType() != MVT::v4f32)
8831 V1 = DAG.getBitcast(MVT::v4f32, V1);
8832 if (V2.getSimpleValueType() != MVT::v4f32)
8833 V2 = DAG.getBitcast(MVT::v4f32, V2);
8834
8835 // Ok, we can emit an INSERTPS instruction.
8836 unsigned ZMask = Zeroable.to_ulong();
8837
8838 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8839 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8840 SDLoc DL(Op);
8841 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8842 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8843 return DAG.getBitcast(VT, Result);
8844}
8845
8846/// Return a vector logical shift node.
8847static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8848 SelectionDAG &DAG, const TargetLowering &TLI,
8849 const SDLoc &dl) {
8850 assert(VT.is128BitVector() && "Unknown type for VShift");
8851 MVT ShVT = MVT::v16i8;
8852 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8853 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8854 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8855 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8856 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8857}
8858
8859static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8860 SelectionDAG &DAG) {
8861
8862 // Check if the scalar load can be widened into a vector load. And if
8863 // the address is "base + cst" see if the cst can be "absorbed" into
8864 // the shuffle mask.
8865 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8866 SDValue Ptr = LD->getBasePtr();
8867 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8868 return SDValue();
8869 EVT PVT = LD->getValueType(0);
8870 if (PVT != MVT::i32 && PVT != MVT::f32)
8871 return SDValue();
8872
8873 int FI = -1;
8874 int64_t Offset = 0;
8875 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8876 FI = FINode->getIndex();
8877 Offset = 0;
8878 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8879 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8880 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8881 Offset = Ptr.getConstantOperandVal(1);
8882 Ptr = Ptr.getOperand(0);
8883 } else {
8884 return SDValue();
8885 }
8886
8887 // FIXME: 256-bit vector instructions don't require a strict alignment,
8888 // improve this code to support it better.
8889 Align RequiredAlign(VT.getSizeInBits() / 8);
8890 SDValue Chain = LD->getChain();
8891 // Make sure the stack object alignment is at least 16 or 32.
8892 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8893 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8894 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8895 if (MFI.isFixedObjectIndex(FI)) {
8896 // Can't change the alignment. FIXME: It's possible to compute
8897 // the exact stack offset and reference FI + adjust offset instead.
8898 // If someone *really* cares about this, that's the way to implement it.
8899 return SDValue();
8900 } else {
8901 MFI.setObjectAlignment(FI, RequiredAlign);
8902 }
8903 }
8904
8905 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8906 // Ptr + (Offset & ~15).
8907 if (Offset < 0)
8908 return SDValue();
8909 if ((Offset % RequiredAlign.value()) & 3)
8910 return SDValue();
8911 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8912 if (StartOffset) {
8913 SDLoc DL(Ptr);
8914 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8915 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8916 }
8917
8918 int EltNo = (Offset - StartOffset) >> 2;
8919 unsigned NumElems = VT.getVectorNumElements();
8920
8921 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8922 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8923 LD->getPointerInfo().getWithOffset(StartOffset));
8924
8925 SmallVector<int, 8> Mask(NumElems, EltNo);
8926
8927 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8928 }
8929
8930 return SDValue();
8931}
8932
8933// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8934static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8935 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8936 auto *BaseLd = cast<LoadSDNode>(Elt);
8937 if (!BaseLd->isSimple())
8938 return false;
8939 Ld = BaseLd;
8940 ByteOffset = 0;
8941 return true;
8942 }
8943
8944 switch (Elt.getOpcode()) {
8945 case ISD::BITCAST:
8946 case ISD::TRUNCATE:
8947 case ISD::SCALAR_TO_VECTOR:
8948 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8949 case ISD::SRL:
8950 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8951 uint64_t Amt = AmtC->getZExtValue();
8952 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8953 ByteOffset += Amt / 8;
8954 return true;
8955 }
8956 }
8957 break;
8958 case ISD::EXTRACT_VECTOR_ELT:
8959 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8960 SDValue Src = Elt.getOperand(0);
8961 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8962 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8963 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8964 findEltLoadSrc(Src, Ld, ByteOffset)) {
8965 uint64_t Idx = IdxC->getZExtValue();
8966 ByteOffset += Idx * (SrcSizeInBits / 8);
8967 return true;
8968 }
8969 }
8970 break;
8971 }
8972
8973 return false;
8974}
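
findEltLoadSrc peeks through wrapper nodes (bitcast, truncate, scalar_to_vector, constant SRL, extract_vector_elt) and accumulates the byte offset of the element within its source load. As a rough arithmetic sketch only (not part of the analyzed file, names hypothetical), the EXTRACT_VECTOR_ELT + SRL path accumulates as follows:

#include <cassert>

// Byte offset after peeking through extract_vector_elt(ExtractIdx) of a load
// of SrcEltBits-wide elements, followed by an srl by ShiftAmt bits.
unsigned accumulatedByteOffset(unsigned ExtractIdx, unsigned SrcEltBits,
                               unsigned ShiftAmt) {
  assert((SrcEltBits % 8) == 0 && (ShiftAmt % 8) == 0);
  return ExtractIdx * (SrcEltBits / 8) + ShiftAmt / 8;
}

// e.g. accumulatedByteOffset(2, 32, 16) == 10: element 2 of a loaded v4i32
// shifted right by 16 bits refers to byte 10 of the original load.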
8975
8976/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8977/// elements can be replaced by a single large load which has the same value as
8978/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8979///
8980/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8981static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8982 const SDLoc &DL, SelectionDAG &DAG,
8983 const X86Subtarget &Subtarget,
8984 bool IsAfterLegalize) {
8985 if ((VT.getScalarSizeInBits() % 8) != 0)
8986 return SDValue();
8987
8988 unsigned NumElems = Elts.size();
8989
8990 int LastLoadedElt = -1;
8991 APInt LoadMask = APInt::getZero(NumElems);
8992 APInt ZeroMask = APInt::getZero(NumElems);
8993 APInt UndefMask = APInt::getZero(NumElems);
8994
8995 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8996 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8997
8998 // For each element in the initializer, see if we've found a load, zero or an
8999 // undef.
9000 for (unsigned i = 0; i < NumElems; ++i) {
9001 SDValue Elt = peekThroughBitcasts(Elts[i]);
9002 if (!Elt.getNode())
9003 return SDValue();
9004 if (Elt.isUndef()) {
9005 UndefMask.setBit(i);
9006 continue;
9007 }
9008 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9009 ZeroMask.setBit(i);
9010 continue;
9011 }
9012
9013 // Each loaded element must be the correct fractional portion of the
9014 // requested vector load.
9015 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9016 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9017 return SDValue();
9018
9019 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9020 return SDValue();
9021 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9022 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9023 return SDValue();
9024
9025 LoadMask.setBit(i);
9026 LastLoadedElt = i;
9027 }
9028 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
9029 LoadMask.countPopulation()) == NumElems &&
9030 "Incomplete element masks");
9031
9032 // Handle Special Cases - all undef or undef/zero.
9033 if (UndefMask.countPopulation() == NumElems)
9034 return DAG.getUNDEF(VT);
9035 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
9036 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9037 : DAG.getConstantFP(0.0, DL, VT);
9038
9039 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9040 int FirstLoadedElt = LoadMask.countTrailingZeros();
9041 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9042 EVT EltBaseVT = EltBase.getValueType();
9043 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9044 "Register/Memory size mismatch");
9045 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9046 assert(LDBase && "Did not find base load for merging consecutive loads");
9047 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9048 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9049 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9050 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9051 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9052
9053 // TODO: Support offsetting the base load.
9054 if (ByteOffsets[FirstLoadedElt] != 0)
9055 return SDValue();
9056
9057 // Check to see if the element's load is consecutive to the base load
9058 // or offset from a previous (already checked) load.
9059 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9060 LoadSDNode *Ld = Loads[EltIdx];
9061 int64_t ByteOffset = ByteOffsets[EltIdx];
9062 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9063 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9064 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9065 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9066 }
9067 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9068 EltIdx - FirstLoadedElt);
9069 };
9070
9071 // Consecutive loads can contain UNDEFS but not ZERO elements.
9072 // Consecutive loads with both UNDEF and ZERO elements require an
9073 // additional shuffle stage to clear the ZERO elements.
9074 bool IsConsecutiveLoad = true;
9075 bool IsConsecutiveLoadWithZeros = true;
9076 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9077 if (LoadMask[i]) {
9078 if (!CheckConsecutiveLoad(LDBase, i)) {
9079 IsConsecutiveLoad = false;
9080 IsConsecutiveLoadWithZeros = false;
9081 break;
9082 }
9083 } else if (ZeroMask[i]) {
9084 IsConsecutiveLoad = false;
9085 }
9086 }
9087
9088 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9089 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9090 assert(LDBase->isSimple() &&
9091 "Cannot merge volatile or atomic loads.");
9092 SDValue NewLd =
9093 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9094 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9095 MMOFlags);
9096 for (auto *LD : Loads)
9097 if (LD)
9098 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9099 return NewLd;
9100 };
9101
9102 // Check if the base load is entirely dereferenceable.
9103 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9104 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9105
9106 // LOAD - all consecutive load/undefs (must start/end with a load or be
9107 // entirely dereferenceable). If we have found an entire vector of loads and
9108 // undefs, then return a large load of the entire vector width starting at the
9109 // base pointer. If the vector contains zeros, then attempt to shuffle those
9110 // elements.
9111 if (FirstLoadedElt == 0 &&
9112 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9113 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9114 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9115 return SDValue();
9116
9117 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9118 // will lower to regular temporal loads and use the cache.
9119 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
9120 VT.is256BitVector() && !Subtarget.hasInt256())
9121 return SDValue();
9122
9123 if (NumElems == 1)
9124 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9125
9126 if (!ZeroMask)
9127 return CreateLoad(VT, LDBase);
9128
9129 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9130 // vector and a zero vector to clear out the zero elements.
9131 if (!IsAfterLegalize && VT.isVector()) {
9132 unsigned NumMaskElts = VT.getVectorNumElements();
9133 if ((NumMaskElts % NumElems) == 0) {
9134 unsigned Scale = NumMaskElts / NumElems;
9135 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9136 for (unsigned i = 0; i < NumElems; ++i) {
9137 if (UndefMask[i])
9138 continue;
9139 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9140 for (unsigned j = 0; j != Scale; ++j)
9141 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9142 }
9143 SDValue V = CreateLoad(VT, LDBase);
9144 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9145 : DAG.getConstantFP(0.0, DL, VT);
9146 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9147 }
9148 }
9149 }
9150
9151 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9152 if (VT.is256BitVector() || VT.is512BitVector()) {
9153 unsigned HalfNumElems = NumElems / 2;
9154 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9155 EVT HalfVT =
9156 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9157 SDValue HalfLD =
9158 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9159 DAG, Subtarget, IsAfterLegalize);
9160 if (HalfLD)
9161 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9162 HalfLD, DAG.getIntPtrConstant(0, DL));
9163 }
9164 }
9165
9166 // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
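// Illustrative example (not in the original source): for a v4i32 build_vector
// <(load i32 p), 0, 0, 0> the code below emits an X86ISD::VZEXT_LOAD that reads
// 32 bits from p and zero-fills the remaining lanes of the vector.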
9167 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9168 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9169 LoadSizeInBits == 64) &&
9170 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9171 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9172 : MVT::getIntegerVT(LoadSizeInBits);
9173 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9174 // Allow v4f32 on SSE1 only targets.
9175 // FIXME: Add more isel patterns so we can just use VT directly.
9176 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9177 VecVT = MVT::v4f32;
9178 if (TLI.isTypeLegal(VecVT)) {
9179 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9180 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9181 SDValue ResNode = DAG.getMemIntrinsicNode(
9182 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9183 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9184 for (auto *LD : Loads)
9185 if (LD)
9186 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9187 return DAG.getBitcast(VT, ResNode);
9188 }
9189 }
9190
9191 // BROADCAST - match the smallest possible repetition pattern, load that
9192 // scalar/subvector element and then broadcast to the entire vector.
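// Illustrative example (not in the original source): for
// <(load p), (load p+4), (load p), (load p+4)> the smallest repetition is two
// elements (64 bits), so a single 64-bit scalar is loaded and broadcast across the
// vector; wider repetition patterns are instead loaded once and concatenated.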
9193 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9194 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9195 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9196 unsigned RepeatSize = SubElems * BaseSizeInBits;
9197 unsigned ScalarSize = std::min(RepeatSize, 64u);
9198 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9199 continue;
9200
9201 // Don't attempt a 1:N subvector broadcast - it should be caught by
9202 // combineConcatVectorOps, otherwise it will cause infinite loops.
9203 if (RepeatSize > ScalarSize && SubElems == 1)
9204 continue;
9205
9206 bool Match = true;
9207 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9208 for (unsigned i = 0; i != NumElems && Match; ++i) {
9209 if (!LoadMask[i])
9210 continue;
9211 SDValue Elt = peekThroughBitcasts(Elts[i]);
9212 if (RepeatedLoads[i % SubElems].isUndef())
9213 RepeatedLoads[i % SubElems] = Elt;
9214 else
9215 Match &= (RepeatedLoads[i % SubElems] == Elt);
9216 }
9217
9218 // We must have loads at both ends of the repetition.
9219 Match &= !RepeatedLoads.front().isUndef();
9220 Match &= !RepeatedLoads.back().isUndef();
9221 if (!Match)
9222 continue;
9223
9224 EVT RepeatVT =
9225 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9226 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9227 : EVT::getFloatingPointVT(ScalarSize);
9228 if (RepeatSize > ScalarSize)
9229 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9230 RepeatSize / ScalarSize);
9231 EVT BroadcastVT =
9232 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9233 VT.getSizeInBits() / ScalarSize);
9234 if (TLI.isTypeLegal(BroadcastVT)) {
9235 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9236 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9237 SDValue Broadcast = RepeatLoad;
9238 if (RepeatSize > ScalarSize) {
9239 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9240 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9241 } else {
9242 if (!Subtarget.hasAVX2() &&
9243 !X86::mayFoldLoadIntoBroadcastFromMem(
9244 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9245 Subtarget,
9246 /*AssumeSingleUse=*/true))
9247 return SDValue();
9248 Broadcast =
9249 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9250 }
9251 return DAG.getBitcast(VT, Broadcast);
9252 }
9253 }
9254 }
9255 }
9256
9257 return SDValue();
9258}
9259
9260// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
9261// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9262// are consecutive, non-overlapping, and in the right order.
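// Illustrative example (not in the original source): a shuffle whose lanes resolve
// (via getShuffleScalarElt) to scalar loads from p, p+4, p+8, p+12 in order is
// handed to EltsFromConsecutiveLoads and can be replaced by one wide load from p.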
9263static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9264 SelectionDAG &DAG,
9265 const X86Subtarget &Subtarget,
9266 bool IsAfterLegalize) {
9267 SmallVector<SDValue, 64> Elts;
9268 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9269 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9270 Elts.push_back(Elt);
9271 continue;
9272 }
9273 return SDValue();
9274 }
9275 assert(Elts.size() == VT.getVectorNumElements());
9276 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9277 IsAfterLegalize);
9278}
9279
9280static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9281 unsigned SplatBitSize, LLVMContext &C) {
9282 unsigned ScalarSize = VT.getScalarSizeInBits();
9283 unsigned NumElm = SplatBitSize / ScalarSize;
9284
9285 SmallVector<Constant *, 32> ConstantVec;
9286 for (unsigned i = 0; i < NumElm; i++) {
9287 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9288 Constant *Const;
9289 if (VT.isFloatingPoint()) {
9290 if (ScalarSize == 16) {
9291 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9292 } else if (ScalarSize == 32) {
9293 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9294 } else {
9295 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9296 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9297 }
9298 } else
9299 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9300 ConstantVec.push_back(Const);
9301 }
9302 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9303}
9304
9305static bool isFoldableUseOfShuffle(SDNode *N) {
9306 for (auto *U : N->uses()) {
9307 unsigned Opc = U->getOpcode();
9308 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9309 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9310 return false;
9311 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9312 return false;
9313 if (isTargetShuffle(Opc))
9314 return true;
9315 if (Opc == ISD::BITCAST) // Ignore bitcasts
9316 return isFoldableUseOfShuffle(U);
9317 if (N->hasOneUse()) {
9318 // TODO: there may be some general way to know if an SDNode can
9319 // be folded. We currently only know whether an MI is foldable.
9320 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9321 return false;
9322 return true;
9323 }
9324 }
9325 return false;
9326}
9327
9328/// Attempt to use the vbroadcast instruction to generate a splat value
9329/// from a splat BUILD_VECTOR which uses:
9330/// a. A single scalar load, or a constant.
9331/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9332///
9333/// The VBROADCAST node is returned when a pattern is found,
9334/// or SDValue() otherwise.
9335static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9336 const X86Subtarget &Subtarget,
9337 SelectionDAG &DAG) {
9338 // VBROADCAST requires AVX.
9339 // TODO: Splats could be generated for non-AVX CPUs using SSE
9340 // instructions, but there's less potential gain for only 128-bit vectors.
9341 if (!Subtarget.hasAVX())
9342 return SDValue();
9343
9344 MVT VT = BVOp->getSimpleValueType(0);
9345 unsigned NumElts = VT.getVectorNumElements();
9346 SDLoc dl(BVOp);
9347
9348 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9349        "Unsupported vector type for broadcast.");
9350
9351 // See if the build vector is a repeating sequence of scalars (inc. splat).
9352 SDValue Ld;
9353 BitVector UndefElements;
9354 SmallVector<SDValue, 16> Sequence;
9355 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9356 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9357 if (Sequence.size() == 1)
9358 Ld = Sequence[0];
9359 }
9360
9361 // Attempt to use VBROADCASTM
9362 // From this pattern:
9363 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9364 // b. t1 = (build_vector t0 t0)
9365 //
9366 // Create (VBROADCASTM v2i1 X)
9367 if (!Sequence.empty() && Subtarget.hasCDI()) {
9368 // If not a splat, are the upper sequence values zeroable?
9369 unsigned SeqLen = Sequence.size();
9370 bool UpperZeroOrUndef =
9371 SeqLen == 1 ||
9372 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
9373 return !V || V.isUndef() || isNullConstant(V);
9374 });
9375 SDValue Op0 = Sequence[0];
9376 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9377 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9378 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9379 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9380 ? Op0.getOperand(0)
9381 : Op0.getOperand(0).getOperand(0);
9382 MVT MaskVT = BOperand.getSimpleValueType();
9383 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9384 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9385 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9386 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9387 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9388 unsigned Scale = 512 / VT.getSizeInBits();
9389 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9390 }
9391 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9392 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9393 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9394 return DAG.getBitcast(VT, Bcst);
9395 }
9396 }
9397 }
9398
9399 unsigned NumUndefElts = UndefElements.count();
9400 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9401 APInt SplatValue, Undef;
9402 unsigned SplatBitSize;
9403 bool HasUndef;
9404 // Check if this is a repeated constant pattern suitable for broadcasting.
9405 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9406 SplatBitSize > VT.getScalarSizeInBits() &&
9407 SplatBitSize < VT.getSizeInBits()) {
9408 // Avoid replacing with broadcast when it's a use of a shuffle
9409 // instruction to preserve the present custom lowering of shuffles.
9410 if (isFoldableUseOfShuffle(BVOp))
9411 return SDValue();
9412 // replace BUILD_VECTOR with broadcast of the repeated constants.
9413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9414 LLVMContext *Ctx = DAG.getContext();
9415 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9416 if (Subtarget.hasAVX()) {
9417 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9418 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9419 // Splatted value can fit in one INTEGER constant in constant pool.
9420 // Load the constant and broadcast it.
9421 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9422 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9423 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9424 SDValue CP = DAG.getConstantPool(C, PVT);
9425 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9426
9427 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9428 SDVTList Tys =
9429 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9430 SDValue Ops[] = {DAG.getEntryNode(), CP};
9431 MachinePointerInfo MPI =
9432 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9433 SDValue Brdcst = DAG.getMemIntrinsicNode(
9434 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9435 MachineMemOperand::MOLoad);
9436 return DAG.getBitcast(VT, Brdcst);
9437 }
9438 if (SplatBitSize > 64) {
9439 // Load the vector of constants and broadcast it.
9440 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9441 *Ctx);
9442 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9443 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9444 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9445 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9446 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9447 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9448 MachinePointerInfo MPI =
9449 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9450 return DAG.getMemIntrinsicNode(
9451 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9452 MachineMemOperand::MOLoad);
9453 }
9454 }
9455 }
9456
9457 // If we are moving a scalar into a vector (Ld must be set and all elements
9458 // but 1 are undef) and that operation is not obviously supported by
9459 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9460 // That's better than general shuffling and may eliminate a load to GPR and
9461 // move from scalar to vector register.
9462 if (!Ld || NumElts - NumUndefElts != 1)
9463 return SDValue();
9464 unsigned ScalarSize = Ld.getValueSizeInBits();
9465 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9466 return SDValue();
9467 }
9468
9469 bool ConstSplatVal =
9470 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9471 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9472
9473 // TODO: Handle broadcasts of non-constant sequences.
9474
9475 // Make sure that all of the users of a non-constant load are from the
9476 // BUILD_VECTOR node.
9477 // FIXME: Is the use count needed for non-constant, non-load case?
9478 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9479 return SDValue();
9480
9481 unsigned ScalarSize = Ld.getValueSizeInBits();
9482 bool IsGE256 = (VT.getSizeInBits() >= 256);
9483
9484 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9485 // instruction to save 8 or more bytes of constant pool data.
9486 // TODO: If multiple splats are generated to load the same constant,
9487 // it may be detrimental to overall size. There needs to be a way to detect
9488 // that condition to know if this is truly a size win.
9489 bool OptForSize = DAG.shouldOptForSize();
9490
9491 // Handle broadcasting a single constant scalar from the constant pool
9492 // into a vector.
9493 // On Sandybridge (no AVX2), it is still better to load a constant vector
9494 // from the constant pool and not to broadcast it from a scalar.
9495 // But override that restriction when optimizing for size.
9496 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
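// Illustrative example (not in the original source): with AVX2 (or when optimizing
// for size), a v8f32 splat of the constant 1.0 becomes an X86ISD::VBROADCAST_LOAD
// of a single 4-byte constant-pool entry instead of a full 32-byte constant vector.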
9497 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9498 EVT CVT = Ld.getValueType();
9499 assert(!CVT.isVector() && "Must not broadcast a vector type");
9500
9501 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9502 // For size optimization, also splat v2f64 and v2i64, and for size opt
9503 // with AVX2, also splat i8 and i16.
9504 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9505 if (ScalarSize == 32 ||
9506 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9507 (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
9508 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9509 const Constant *C = nullptr;
9510 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9511 C = CI->getConstantIntValue();
9512 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9513 C = CF->getConstantFPValue();
9514
9515 assert(C && "Invalid constant type");
9516
9517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9518 SDValue CP =
9519 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9520 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9521
9522 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9523 SDValue Ops[] = {DAG.getEntryNode(), CP};
9524 MachinePointerInfo MPI =
9525 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9526 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9527 MPI, Alignment, MachineMemOperand::MOLoad);
9528 }
9529 }
9530
9531 // Handle AVX2 in-register broadcasts.
9532 if (!IsLoad && Subtarget.hasInt256() &&
9533 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9534 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9535
9536 // The scalar source must be a normal load.
9537 if (!IsLoad)
9538 return SDValue();
9539
9540 // Make sure the non-chain result is only used by this build vector.
9541 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9542 return SDValue();
9543
9544 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9545 (Subtarget.hasVLX() && ScalarSize == 64)) {
9546 auto *LN = cast<LoadSDNode>(Ld);
9547 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9548 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9549 SDValue BCast =
9550 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9551 LN->getMemoryVT(), LN->getMemOperand());
9552 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9553 return BCast;
9554 }
9555
9556 // The integer check is needed for broadcasting a 64-bit element into a 128-bit
9557 // vector, so it doesn't match double, since there is no vbroadcastsd xmm.
9558 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9559 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9560 auto *LN = cast<LoadSDNode>(Ld);
9561 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9562 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9563 SDValue BCast =
9564 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9565 LN->getMemoryVT(), LN->getMemOperand());
9566 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9567 return BCast;
9568 }
9569
9570 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9571 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9572
9573 // Unsupported broadcast.
9574 return SDValue();
9575}
9576
9577/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9578/// underlying vector and index.
9579///
9580/// Modifies \p ExtractedFromVec to the real vector and returns the real
9581/// index.
9582static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9583 SDValue ExtIdx) {
9584 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9585 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9586 return Idx;
9587
9588 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9589 // lowered this:
9590 // (extract_vector_elt (v8f32 %1), Constant<6>)
9591 // to:
9592 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9593 // (extract_subvector (v8f32 %0), Constant<4>),
9594 // undef)
9595 // Constant<0>)
9596 // In this case the vector is the extract_subvector expression and the index
9597 // is 2, as specified by the shuffle.
9598 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9599 SDValue ShuffleVec = SVOp->getOperand(0);
9600 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9601 assert(ShuffleVecVT.getVectorElementType() ==
9602        ExtractedFromVec.getSimpleValueType().getVectorElementType());
9603
9604 int ShuffleIdx = SVOp->getMaskElt(Idx);
9605 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9606 ExtractedFromVec = ShuffleVec;
9607 return ShuffleIdx;
9608 }
9609 return Idx;
9610}
9611
9612static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9613 MVT VT = Op.getSimpleValueType();
9614
9615 // Skip if insert_vec_elt is not supported.
9616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9617 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9618 return SDValue();
9619
9620 SDLoc DL(Op);
9621 unsigned NumElems = Op.getNumOperands();
9622
9623 SDValue VecIn1;
9624 SDValue VecIn2;
9625 SmallVector<unsigned, 4> InsertIndices;
9626 SmallVector<int, 8> Mask(NumElems, -1);
9627
9628 for (unsigned i = 0; i != NumElems; ++i) {
9629 unsigned Opc = Op.getOperand(i).getOpcode();
9630
9631 if (Opc == ISD::UNDEF)
9632 continue;
9633
9634 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9635 // Quit if more than 1 element needs inserting.
9636 if (InsertIndices.size() > 1)
9637 return SDValue();
9638
9639 InsertIndices.push_back(i);
9640 continue;
9641 }
9642
9643 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9644 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9645
9646 // Quit if non-constant index.
9647 if (!isa<ConstantSDNode>(ExtIdx))
9648 return SDValue();
9649 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9650
9651 // Quit if extracted from vector of different type.
9652 if (ExtractedFromVec.getValueType() != VT)
9653 return SDValue();
9654
9655 if (!VecIn1.getNode())
9656 VecIn1 = ExtractedFromVec;
9657 else if (VecIn1 != ExtractedFromVec) {
9658 if (!VecIn2.getNode())
9659 VecIn2 = ExtractedFromVec;
9660 else if (VecIn2 != ExtractedFromVec)
9661 // Quit if more than 2 vectors to shuffle
9662 return SDValue();
9663 }
9664
9665 if (ExtractedFromVec == VecIn1)
9666 Mask[i] = Idx;
9667 else if (ExtractedFromVec == VecIn2)
9668 Mask[i] = Idx + NumElems;
9669 }
9670
9671 if (!VecIn1.getNode())
9672 return SDValue();
9673
9674 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9675 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9676
9677 for (unsigned Idx : InsertIndices)
9678 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9679 DAG.getIntPtrConstant(Idx, DL));
9680
9681 return NV;
9682}
9683
9684// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9685static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9686 const X86Subtarget &Subtarget) {
9687
9688 MVT VT = Op.getSimpleValueType();
9689 assert((VT.getVectorElementType() == MVT::i1) &&
9690        "Unexpected type in LowerBUILD_VECTORvXi1!");
9691
9692 SDLoc dl(Op);
9693 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9694 ISD::isBuildVectorAllOnes(Op.getNode()))
9695 return Op;
9696
9697 uint64_t Immediate = 0;
9698 SmallVector<unsigned, 16> NonConstIdx;
9699 bool IsSplat = true;
9700 bool HasConstElts = false;
9701 int SplatIdx = -1;
9702 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9703 SDValue In = Op.getOperand(idx);
9704 if (In.isUndef())
9705 continue;
9706 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9707 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9708 HasConstElts = true;
9709 } else {
9710 NonConstIdx.push_back(idx);
9711 }
9712 if (SplatIdx < 0)
9713 SplatIdx = idx;
9714 else if (In != Op.getOperand(SplatIdx))
9715 IsSplat = false;
9716 }
9717
9718 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
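// Illustrative example (not in the original source): a v16i1 splat of %c becomes
// roughly (bitcast v16i1 (select %c, i16 -1, i16 0)); the select is performed in
// the scalar domain (so it can use cmov) and the result is bitcast back to a mask.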
9719 if (IsSplat) {
9720 // The build_vector allows the scalar element to be larger than the vector
9721 // element type. We need to mask it to use as a condition unless we know
9722 // the upper bits are zero.
9723 // FIXME: Use computeKnownBits instead of checking specific opcode?
9724 SDValue Cond = Op.getOperand(SplatIdx);
9725 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9726 if (Cond.getOpcode() != ISD::SETCC)
9727 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9728 DAG.getConstant(1, dl, MVT::i8));
9729
9730 // Perform the select in the scalar domain so we can use cmov.
9731 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9732 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9733 DAG.getAllOnesConstant(dl, MVT::i32),
9734 DAG.getConstant(0, dl, MVT::i32));
9735 Select = DAG.getBitcast(MVT::v32i1, Select);
9736 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9737 } else {
9738 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9739 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9740 DAG.getAllOnesConstant(dl, ImmVT),
9741 DAG.getConstant(0, dl, ImmVT));
9742 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9743 Select = DAG.getBitcast(VecVT, Select);
9744 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9745 DAG.getIntPtrConstant(0, dl));
9746 }
9747 }
9748
9749 // insert elements one by one
9750 SDValue DstVec;
9751 if (HasConstElts) {
9752 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9753 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9754 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9755 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9756 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9757 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9758 } else {
9759 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9760 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9761 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9762 DstVec = DAG.getBitcast(VecVT, Imm);
9763 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9764 DAG.getIntPtrConstant(0, dl));
9765 }
9766 } else
9767 DstVec = DAG.getUNDEF(VT);
9768
9769 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9770 unsigned InsertIdx = NonConstIdx[i];
9771 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9772 Op.getOperand(InsertIdx),
9773 DAG.getIntPtrConstant(InsertIdx, dl));
9774 }
9775 return DstVec;
9776}
9777
9778 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9779 switch (Opcode) {
9780 case X86ISD::PACKSS:
9781 case X86ISD::PACKUS:
9782 case X86ISD::FHADD:
9783 case X86ISD::FHSUB:
9784 case X86ISD::HADD:
9785 case X86ISD::HSUB:
9786 return true;
9787 }
9788 return false;
9789}
9790
9791/// This is a helper function of LowerToHorizontalOp().
9792/// This function checks that the build_vector \p N in input implements a
9793/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9794/// may not match the layout of an x86 256-bit horizontal instruction.
9795/// In other words, if this returns true, then some extraction/insertion will
9796/// be required to produce a valid horizontal instruction.
9797///
9798/// Parameter \p Opcode defines the kind of horizontal operation to match.
9799/// For example, if \p Opcode is equal to ISD::ADD, then this function
9800/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9801/// is equal to ISD::SUB, then this function checks if this is a horizontal
9802/// arithmetic sub.
9803///
9804/// This function only analyzes elements of \p N whose indices are
9805/// in range [BaseIdx, LastIdx).
9806///
9807/// TODO: This function was originally used to match both real and fake partial
9808/// horizontal operations, but the index-matching logic is incorrect for that.
9809/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9810/// code because it is only used for partial h-op matching now?
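// Illustrative example (not in the original source): with Opcode == ISD::ADD and
// the range [0, 4), the operands
//   (add (extractelt A,0), (extractelt A,1)), (add (extractelt A,2), (extractelt A,3)),
//   (add (extractelt B,0), (extractelt B,1)), (add (extractelt B,2), (extractelt B,3))
// match, returning V0 = A and V1 = B (both of the same 256-bit type as the result).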
9811static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9812 SelectionDAG &DAG,
9813 unsigned BaseIdx, unsigned LastIdx,
9814 SDValue &V0, SDValue &V1) {
9815 EVT VT = N->getValueType(0);
9816 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9817 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9818 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9819        "Invalid Vector in input!");
9820
9821 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9822 bool CanFold = true;
9823 unsigned ExpectedVExtractIdx = BaseIdx;
9824 unsigned NumElts = LastIdx - BaseIdx;
9825 V0 = DAG.getUNDEF(VT);
9826 V1 = DAG.getUNDEF(VT);
9827
9828 // Check if N implements a horizontal binop.
9829 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9830 SDValue Op = N->getOperand(i + BaseIdx);
9831
9832 // Skip UNDEFs.
9833 if (Op->isUndef()) {
9834 // Update the expected vector extract index.
9835 if (i * 2 == NumElts)
9836 ExpectedVExtractIdx = BaseIdx;
9837 ExpectedVExtractIdx += 2;
9838 continue;
9839 }
9840
9841 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9842
9843 if (!CanFold)
9844 break;
9845
9846 SDValue Op0 = Op.getOperand(0);
9847 SDValue Op1 = Op.getOperand(1);
9848
9849 // Try to match the following pattern:
9850 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9851 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9852 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9853 Op0.getOperand(0) == Op1.getOperand(0) &&
9854 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9855 isa<ConstantSDNode>(Op1.getOperand(1)));
9856 if (!CanFold)
9857 break;
9858
9859 unsigned I0 = Op0.getConstantOperandVal(1);
9860 unsigned I1 = Op1.getConstantOperandVal(1);
9861
9862 if (i * 2 < NumElts) {
9863 if (V0.isUndef()) {
9864 V0 = Op0.getOperand(0);
9865 if (V0.getValueType() != VT)
9866 return false;
9867 }
9868 } else {
9869 if (V1.isUndef()) {
9870 V1 = Op0.getOperand(0);
9871 if (V1.getValueType() != VT)
9872 return false;
9873 }
9874 if (i * 2 == NumElts)
9875 ExpectedVExtractIdx = BaseIdx;
9876 }
9877
9878 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9879 if (I0 == ExpectedVExtractIdx)
9880 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9881 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9882 // Try to match the following dag sequence:
9883 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9884 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9885 } else
9886 CanFold = false;
9887
9888 ExpectedVExtractIdx += 2;
9889 }
9890
9891 return CanFold;
9892}
9893
9894/// Emit a sequence of two 128-bit horizontal add/sub followed by
9895/// a concat_vector.
9896///
9897/// This is a helper function of LowerToHorizontalOp().
9898/// This function expects two 256-bit vectors called V0 and V1.
9899/// At first, each vector is split into two separate 128-bit vectors.
9900/// Then, the resulting 128-bit vectors are used to implement two
9901/// horizontal binary operations.
9902///
9903/// The kind of horizontal binary operation is defined by \p X86Opcode.
9904///
9905/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
9906 /// the two new horizontal binops.
9907/// When Mode is set, the first horizontal binop dag node would take as input
9908/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9909/// horizontal binop dag node would take as input the lower 128-bit of V1
9910/// and the upper 128-bit of V1.
9911/// Example:
9912/// HADD V0_LO, V0_HI
9913/// HADD V1_LO, V1_HI
9914///
9915/// Otherwise, the first horizontal binop dag node takes as input the lower
9916/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9917/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9918/// Example:
9919/// HADD V0_LO, V1_LO
9920/// HADD V0_HI, V1_HI
9921///
9922/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9923/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9924/// the upper 128-bits of the result.
9925static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9926 const SDLoc &DL, SelectionDAG &DAG,
9927 unsigned X86Opcode, bool Mode,
9928 bool isUndefLO, bool isUndefHI) {
9929 MVT VT = V0.getSimpleValueType();
9930 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9931        "Invalid nodes in input!");
9932
9933 unsigned NumElts = VT.getVectorNumElements();
9934 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9935 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9936 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9937 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9938 MVT NewVT = V0_LO.getSimpleValueType();
9939
9940 SDValue LO = DAG.getUNDEF(NewVT);
9941 SDValue HI = DAG.getUNDEF(NewVT);
9942
9943 if (Mode) {
9944 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9945 if (!isUndefLO && !V0->isUndef())
9946 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9947 if (!isUndefHI && !V1->isUndef())
9948 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9949 } else {
9950 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9951 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9952 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9953
9954 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9955 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9956 }
9957
9958 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9959}
9960
9961/// Returns true iff \p BV builds a vector with the result equivalent to
9962/// the result of ADDSUB/SUBADD operation.
9963/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9964/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9965/// \p Opnd0 and \p Opnd1.
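// Illustrative example (not in the original source): the build_vector
//   (fsub (extractelt A,0), (extractelt B,0)), (fadd (extractelt A,1), (extractelt B,1))
// is recognized as ADDSUB(A, B) (even lanes subtract, odd lanes add); the opposite
// parity is recognized as SUBADD.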
9966static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9967 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9968 SDValue &Opnd0, SDValue &Opnd1,
9969 unsigned &NumExtracts,
9970 bool &IsSubAdd) {
9971
9972 MVT VT = BV->getSimpleValueType(0);
9973 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9974 return false;
9975
9976 unsigned NumElts = VT.getVectorNumElements();
9977 SDValue InVec0 = DAG.getUNDEF(VT);
9978 SDValue InVec1 = DAG.getUNDEF(VT);
9979
9980 NumExtracts = 0;
9981
9982 // Odd-numbered elements in the input build vector are obtained from
9983 // adding/subtracting two integer/float elements.
9984 // Even-numbered elements in the input build vector are obtained from
9985 // subtracting/adding two integer/float elements.
9986 unsigned Opc[2] = {0, 0};
9987 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9988 SDValue Op = BV->getOperand(i);
9989
9990 // Skip 'undef' values.
9991 unsigned Opcode = Op.getOpcode();
9992 if (Opcode == ISD::UNDEF)
9993 continue;
9994
9995 // Early exit if we found an unexpected opcode.
9996 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9997 return false;
9998
9999 SDValue Op0 = Op.getOperand(0);
10000 SDValue Op1 = Op.getOperand(1);
10001
10002 // Try to match the following pattern:
10003 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10004 // Early exit if we cannot match that sequence.
10005 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10006 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10007 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10008 Op0.getOperand(1) != Op1.getOperand(1))
10009 return false;
10010
10011 unsigned I0 = Op0.getConstantOperandVal(1);
10012 if (I0 != i)
10013 return false;
10014
10015 // We found a valid add/sub node; make sure it's the same opcode as previous
10016 // elements for this parity.
10017 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10018 return false;
10019 Opc[i % 2] = Opcode;
10020
10021 // Update InVec0 and InVec1.
10022 if (InVec0.isUndef()) {
10023 InVec0 = Op0.getOperand(0);
10024 if (InVec0.getSimpleValueType() != VT)
10025 return false;
10026 }
10027 if (InVec1.isUndef()) {
10028 InVec1 = Op1.getOperand(0);
10029 if (InVec1.getSimpleValueType() != VT)
10030 return false;
10031 }
10032
10033 // Make sure that the operands of each add/sub node always
10034 // come from the same pair of vectors.
10035 if (InVec0 != Op0.getOperand(0)) {
10036 if (Opcode == ISD::FSUB)
10037 return false;
10038
10039 // FADD is commutable. Try to commute the operands
10040 // and then test again.
10041 std::swap(Op0, Op1);
10042 if (InVec0 != Op0.getOperand(0))
10043 return false;
10044 }
10045
10046 if (InVec1 != Op1.getOperand(0))
10047 return false;
10048
10049 // Increment the number of extractions done.
10050 ++NumExtracts;
10051 }
10052
10053 // Ensure we have found an opcode for both parities and that they are
10054 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10055 // inputs are undef.
10056 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10057 InVec0.isUndef() || InVec1.isUndef())
10058 return false;
10059
10060 IsSubAdd = Opc[0] == ISD::FADD;
10061
10062 Opnd0 = InVec0;
10063 Opnd1 = InVec1;
10064 return true;
10065}
10066
10067 /// Returns true if it is possible to fold MUL and an idiom that has already been
10068/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10069/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10070/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10071///
10072/// Prior to calling this function it should be known that there is some
10073/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10074/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10075/// before replacement of such SDNode with ADDSUB operation. Thus the number
10076/// of \p Opnd0 uses is expected to be equal to 2.
10077/// For example, this function may be called for the following IR:
10078/// %AB = fmul fast <2 x double> %A, %B
10079/// %Sub = fsub fast <2 x double> %AB, %C
10080/// %Add = fadd fast <2 x double> %AB, %C
10081/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10082/// <2 x i32> <i32 0, i32 3>
10083/// There is a def for %Addsub here, which potentially can be replaced by
10084/// X86ISD::ADDSUB operation:
10085/// %Addsub = X86ISD::ADDSUB %AB, %C
10086/// and such ADDSUB can further be replaced with FMADDSUB:
10087/// %Addsub = FMADDSUB %A, %B, %C.
10088///
10089/// The main reason why this method is called before the replacement of the
10090/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10091/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10092/// FMADDSUB is.
10093static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10094 SelectionDAG &DAG,
10095 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10096 unsigned ExpectedUses) {
10097 if (Opnd0.getOpcode() != ISD::FMUL ||
10098 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10099 return false;
10100
10101 // FIXME: These checks must match the similar ones in
10102 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10103 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10104 // or MUL + ADDSUB to FMADDSUB.
10105 const TargetOptions &Options = DAG.getTarget().Options;
10106 bool AllowFusion =
10107 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10108 if (!AllowFusion)
10109 return false;
10110
10111 Opnd2 = Opnd1;
10112 Opnd1 = Opnd0.getOperand(1);
10113 Opnd0 = Opnd0.getOperand(0);
10114
10115 return true;
10116}
10117
10118/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10119/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10120/// X86ISD::FMSUBADD node.
10121static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10122 const X86Subtarget &Subtarget,
10123 SelectionDAG &DAG) {
10124 SDValue Opnd0, Opnd1;
10125 unsigned NumExtracts;
10126 bool IsSubAdd;
10127 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10128 IsSubAdd))
10129 return SDValue();
10130
10131 MVT VT = BV->getSimpleValueType(0);
10132 SDLoc DL(BV);
10133
10134 // Try to generate X86ISD::FMADDSUB node here.
10135 SDValue Opnd2;
10136 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10137 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10138 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10139 }
10140
10141 // We only support ADDSUB.
10142 if (IsSubAdd)
10143 return SDValue();
10144
10145 // There are no known X86 targets with 512-bit ADDSUB instructions!
10146 // Convert to blend(fsub,fadd).
10147 if (VT.is512BitVector()) {
10148 SmallVector<int> Mask;
10149 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10150 Mask.push_back(I);
10151 Mask.push_back(I + E + 1);
10152 }
10153 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10154 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10155 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10156 }
10157
10158 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10159}
10160
10161static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10162 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10163 // Initialize outputs to known values.
10164 MVT VT = BV->getSimpleValueType(0);
10165 HOpcode = ISD::DELETED_NODE;
10166 V0 = DAG.getUNDEF(VT);
10167 V1 = DAG.getUNDEF(VT);
10168
10169 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10170 // half of the result is calculated independently from the 128-bit halves of
10171 // the inputs, so that makes the index-checking logic below more complicated.
10172 unsigned NumElts = VT.getVectorNumElements();
10173 unsigned GenericOpcode = ISD::DELETED_NODE;
10174 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
1) '?' condition is false
10175 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10176 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
2) 'NumEltsIn64Bits' initialized here
10177 for (unsigned i = 0; i != Num128BitChunks; ++i) {
3) Loop condition is true. Entering loop body
10178 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
4) Assuming 'j' is not equal to 'NumEltsIn128Bits'
5) Loop condition is true. Entering loop body
10179 // Ignore undef elements.
10180 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10181 if (Op.isUndef())
10182 continue;
10183
10184 // If there's an opcode mismatch, we're done.
10185 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
5.1) 'HOpcode' is equal to DELETED_NODE
10186 return false;
10187
10188 // Initialize horizontal opcode.
10189 if (HOpcode == ISD::DELETED_NODE) {
5.2) 'HOpcode' is equal to DELETED_NODE
6) Taking true branch
10190 GenericOpcode = Op.getOpcode();
10191 switch (GenericOpcode) {
7) Control jumps to 'case FADD:' at line 10194
10192 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10193 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10194 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8) Execution continues on line 10200
10195 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10196 default: return false;
10197 }
10198 }
10199
10200 SDValue Op0 = Op.getOperand(0);
10201 SDValue Op1 = Op.getOperand(1);
10202 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9) Assuming the condition is false
14) Taking false branch
10203 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10) Assuming the condition is false
10204 Op0.getOperand(0) != Op1.getOperand(0) ||
10205 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
11) Assuming the object is a 'ConstantSDNode'
10206 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
12) Assuming the object is a 'ConstantSDNode'
13) Assuming the condition is false
10207 return false;
10208
10209 // The source vector is chosen based on which 64-bit half of the
10210 // destination vector is being calculated.
10211 if (j < NumEltsIn64Bits) {
15) Assuming 'j' is >= 'NumEltsIn64Bits'
16) Taking false branch
10212 if (V0.isUndef())
10213 V0 = Op0.getOperand(0);
10214 } else {
10215 if (V1.isUndef())
10216 V1 = Op0.getOperand(0);
10217 }
10218
10219 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
16.1) 'j' is >= 'NumEltsIn64Bits'
17) '?' condition is false
10220 if (SourceVec != Op0.getOperand(0))
18) Taking false branch
10221 return false;
10222
10223 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10224 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10225 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10226 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10227 (j % NumEltsIn64Bits) * 2;
19) Division by zero
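// Note on the report above (not in the original source): 'NumEltsIn64Bits' is
// NumEltsIn128Bits / 2, so it is zero whenever a 128-bit chunk holds fewer than two
// elements, and the modulo on line 10227 then divides by zero on the path the
// analyzer assumes. The only caller shown in this listing (LowerToHorizontalOp)
// restricts VT to v4f32/v2f64/v8i16/v4i32/v8f32/v4f64/v16i16/v8i32, all of which
// have at least two elements per 128-bit chunk, so the report may be a false
// positive. A purely illustrative defensive guard (not the upstream fix) placed
// before the loops would silence it:
//   if (NumEltsIn64Bits == 0)
//     return false;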
10228 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10229 continue;
10230
10231 // If this is not a commutative op, this does not match.
10232 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10233 return false;
10234
10235 // Addition is commutative, so try swapping the extract indexes.
10236 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10237 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10238 continue;
10239
10240 // Extract indexes do not match horizontal requirement.
10241 return false;
10242 }
10243 }
10244 // We matched. Opcode and operands are returned by reference as arguments.
10245 return true;
10246}
10247
10248static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10249 SelectionDAG &DAG, unsigned HOpcode,
10250 SDValue V0, SDValue V1) {
10251 // If either input vector is not the same size as the build vector,
10252 // extract/insert the low bits to the correct size.
10253 // This is free (examples: zmm --> xmm, xmm --> ymm).
10254 MVT VT = BV->getSimpleValueType(0);
10255 unsigned Width = VT.getSizeInBits();
10256 if (V0.getValueSizeInBits() > Width)
10257 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10258 else if (V0.getValueSizeInBits() < Width)
10259 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10260
10261 if (V1.getValueSizeInBits() > Width)
10262 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10263 else if (V1.getValueSizeInBits() < Width)
10264 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10265
10266 unsigned NumElts = VT.getVectorNumElements();
10267 APInt DemandedElts = APInt::getAllOnes(NumElts);
10268 for (unsigned i = 0; i != NumElts; ++i)
10269 if (BV->getOperand(i).isUndef())
10270 DemandedElts.clearBit(i);
10271
10272 // If we don't need the upper xmm, then perform as a xmm hop.
10273 unsigned HalfNumElts = NumElts / 2;
10274 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10275 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10276 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10277 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10278 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10279 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10280 }
10281
10282 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10283}
10284
10285/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10286static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10287 const X86Subtarget &Subtarget,
10288 SelectionDAG &DAG) {
10289 // We need at least 2 non-undef elements to make this worthwhile by default.
10290 unsigned NumNonUndefs =
10291 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10292 if (NumNonUndefs < 2)
10293 return SDValue();
10294
10295 // There are 4 sets of horizontal math operations distinguished by type:
10296 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10297 // subtarget feature. Try to match those "native" patterns first.
10298 MVT VT = BV->getSimpleValueType(0);
10299 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10300 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10301 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10302 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10303 unsigned HOpcode;
10304 SDValue V0, V1;
10305 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10306 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10307 }
10308
10309 // Try harder to match 256-bit ops by using extract/concat.
10310 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10311 return SDValue();
10312
10313 // Count the number of UNDEF operands in the build_vector in input.
10314 unsigned NumElts = VT.getVectorNumElements();
10315 unsigned Half = NumElts / 2;
10316 unsigned NumUndefsLO = 0;
10317 unsigned NumUndefsHI = 0;
10318 for (unsigned i = 0, e = Half; i != e; ++i)
10319 if (BV->getOperand(i)->isUndef())
10320 NumUndefsLO++;
10321
10322 for (unsigned i = Half, e = NumElts; i != e; ++i)
10323 if (BV->getOperand(i)->isUndef())
10324 NumUndefsHI++;
10325
10326 SDLoc DL(BV);
10327 SDValue InVec0, InVec1;
10328 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10329 SDValue InVec2, InVec3;
10330 unsigned X86Opcode;
10331 bool CanFold = true;
10332
10333 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10334 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10335 InVec3) &&
10336 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10337 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10338 X86Opcode = X86ISD::HADD;
10339 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10340 InVec1) &&
10341 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10342 InVec3) &&
10343 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10344 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10345 X86Opcode = X86ISD::HSUB;
10346 else
10347 CanFold = false;
10348
10349 if (CanFold) {
10350 // Do not try to expand this build_vector into a pair of horizontal
10351 // add/sub if we can emit a pair of scalar add/sub.
10352 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10353 return SDValue();
10354
10355 // Convert this build_vector into a pair of horizontal binops followed by
10356 // a concat vector. We must adjust the outputs from the partial horizontal
10357 // matching calls above to account for undefined vector halves.
10358 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10359 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10360 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10361 bool isUndefLO = NumUndefsLO == Half;
10362 bool isUndefHI = NumUndefsHI == Half;
10363 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10364 isUndefHI);
10365 }
10366 }
10367
10368 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10369 VT == MVT::v16i16) {
10370 unsigned X86Opcode;
10371 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10372 X86Opcode = X86ISD::HADD;
10373 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10374 InVec1))
10375 X86Opcode = X86ISD::HSUB;
10376 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10377 InVec1))
10378 X86Opcode = X86ISD::FHADD;
10379 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10380 InVec1))
10381 X86Opcode = X86ISD::FHSUB;
10382 else
10383 return SDValue();
10384
10385 // Don't try to expand this build_vector into a pair of horizontal add/sub
10386 // if we can simply emit a pair of scalar add/sub.
10387 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10388 return SDValue();
10389
10390 // Convert this build_vector into two horizontal add/sub followed by
10391 // a concat vector.
10392 bool isUndefLO = NumUndefsLO == Half;
10393 bool isUndefHI = NumUndefsHI == Half;
10394 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10395 isUndefLO, isUndefHI);
10396 }
10397
10398 return SDValue();
10399}
10400
10401static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10402 SelectionDAG &DAG);
10403
10404/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10405/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10406/// just apply the bit to the vectors.
10407 /// NOTE: It's not in our interest to start making a general purpose vectorizer
10408 /// from this, but enough scalar bit operations are created from the later
10409 /// legalization + scalarization stages to need basic support.
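// Illustrative example (not in the original source): the build_vector
//   (and x0, 15), (and x1, 15), (and x2, 15), (and x3, 15)
// is lowered to (and (build_vector x0, x1, x2, x3), (build_vector 15, 15, 15, 15)),
// provided the build_vector is not itself a splat.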
10410static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10411 const X86Subtarget &Subtarget,
10412 SelectionDAG &DAG) {
10413 SDLoc DL(Op);
10414 MVT VT = Op->getSimpleValueType(0);
10415 unsigned NumElems = VT.getVectorNumElements();
10416 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10417
10418 // Check that all elements have the same opcode.
10419 // TODO: Should we allow UNDEFS and if so how many?
10420 unsigned Opcode = Op->getOperand(0).getOpcode();
10421 for (unsigned i = 1; i < NumElems; ++i)
10422 if (Opcode != Op->getOperand(i).getOpcode())
10423 return SDValue();
10424
10425 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10426 bool IsShift = false;
10427 switch (Opcode) {
10428 default:
10429 return SDValue();
10430 case ISD::SHL:
10431 case ISD::SRL:
10432 case ISD::SRA:
10433 IsShift = true;
10434 break;
10435 case ISD::AND:
10436 case ISD::XOR:
10437 case ISD::OR:
10438 // Don't do this if the buildvector is a splat - we'd replace one
10439 // constant with an entire vector.
10440 if (Op->getSplatValue())
10441 return SDValue();
10442 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10443 return SDValue();
10444 break;
10445 }
10446
10447 SmallVector<SDValue, 4> LHSElts, RHSElts;
10448 for (SDValue Elt : Op->ops()) {
10449 SDValue LHS = Elt.getOperand(0);
10450 SDValue RHS = Elt.getOperand(1);
10451
10452 // We expect the canonicalized RHS operand to be the constant.
10453 if (!isa<ConstantSDNode>(RHS))
10454 return SDValue();
10455
10456 // Extend shift amounts.
10457 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10458 if (!IsShift)
10459 return SDValue();
10460 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10461 }
10462
10463 LHSElts.push_back(LHS);
10464 RHSElts.push_back(RHS);
10465 }
10466
10467 // Limit to shifts by uniform immediates.
10468 // TODO: Only accept vXi8/vXi64 special cases?
10469 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10470 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10471 return SDValue();
10472
10473 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10474 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10475 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10476
10477 if (!IsShift)
10478 return Res;
10479
10480 // Immediately lower the shift to ensure the constant build vector doesn't
10481 // get converted to a constant pool before the shift is lowered.
10482 return LowerShift(Res, Subtarget, DAG);
10483}
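// A minimal sketch of the transform above: on v4i32,
//   (build_vector (shl a, 3), (shl b, 3), (shl c, 3), (shl d, 3))
// is rebuilt as
//   LowerShift((shl (build_vector a, b, c, d), (build_vector 3, 3, 3, 3)))
// so the uniform-immediate shift can be selected as a single vector shift
// (e.g. PSLLD) rather than four scalar shifts feeding an element build.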
10484
10485/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10486/// functionality to do this, so it's all zeros, all ones, or some derivation
10487/// that is cheap to calculate.
10488static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10489 const X86Subtarget &Subtarget) {
10490 SDLoc DL(Op);
10491 MVT VT = Op.getSimpleValueType();
10492
10493 // Vectors containing all zeros can be matched by pxor and xorps.
10494 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10495 return Op;
10496
10497 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10498 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10499 // vpcmpeqd on 256-bit vectors.
10500 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10501 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10502 return Op;
10503
10504 return getOnesVector(VT, DAG, DL);
10505 }
10506
10507 return SDValue();
10508}
10509
10510/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10511/// from a vector of source values and a vector of extraction indices.
10512/// The vectors might be manipulated to match the type of the permute op.
10513static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10514 SDLoc &DL, SelectionDAG &DAG,
10515 const X86Subtarget &Subtarget) {
10516 MVT ShuffleVT = VT;
10517 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10518 unsigned NumElts = VT.getVectorNumElements();
10519 unsigned SizeInBits = VT.getSizeInBits();
10520
10521 // Adjust IndicesVec to match VT size.
10522   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10523          "Illegal variable permute mask size");
10524 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10525 // Narrow/widen the indices vector to the correct size.
10526 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10527 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10528 NumElts * VT.getScalarSizeInBits());
10529 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10530 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10531 SDLoc(IndicesVec), SizeInBits);
10532 // Zero-extend the index elements within the vector.
10533 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10534 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10535 IndicesVT, IndicesVec);
10536 }
10537 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10538
10539   // Handle a SrcVec whose size doesn't match VT.
10540 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10541 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10542 // Handle larger SrcVec by treating it as a larger permute.
10543 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10544 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10545 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10546 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10547 Subtarget, DAG, SDLoc(IndicesVec));
10548 SDValue NewSrcVec =
10549 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10550 if (NewSrcVec)
10551 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10552 return SDValue();
10553 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10554 // Widen smaller SrcVec to match VT.
10555 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10556 } else
10557 return SDValue();
10558 }
10559
10560 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10561     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10562 EVT SrcVT = Idx.getValueType();
10563 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10564 uint64_t IndexScale = 0;
10565 uint64_t IndexOffset = 0;
10566
10567 // If we're scaling a smaller permute op, then we need to repeat the
10568 // indices, scaling and offsetting them as well.
10569 // e.g. v4i32 -> v16i8 (Scale = 4)
10570 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10571 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10572 for (uint64_t i = 0; i != Scale; ++i) {
10573 IndexScale |= Scale << (i * NumDstBits);
10574 IndexOffset |= i << (i * NumDstBits);
10575 }
10576
10577 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10578 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10579 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10580 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10581 return Idx;
10582 };
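  // Worked example for the lambda above (assuming Scale = 4 and NumDstBits = 8,
  // i.e. scaling v4i32 indices down to v16i8 shuffle indices): the splats are
  //   IndexScale  = 0x04040404 and IndexOffset = 0x03020100,
  // so an i32 index element holding k becomes k * 0x04040404 + 0x03020100,
  // whose four bytes (low to high) are { 4k, 4k+1, 4k+2, 4k+3 }, i.e. the byte
  // indices that reassemble the k-th 32-bit element.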
10583
10584 unsigned Opcode = 0;
10585 switch (VT.SimpleTy) {
10586 default:
10587 break;
10588 case MVT::v16i8:
10589 if (Subtarget.hasSSSE3())
10590 Opcode = X86ISD::PSHUFB;
10591 break;
10592 case MVT::v8i16:
10593 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10594 Opcode = X86ISD::VPERMV;
10595 else if (Subtarget.hasSSSE3()) {
10596 Opcode = X86ISD::PSHUFB;
10597 ShuffleVT = MVT::v16i8;
10598 }
10599 break;
10600 case MVT::v4f32:
10601 case MVT::v4i32:
10602 if (Subtarget.hasAVX()) {
10603 Opcode = X86ISD::VPERMILPV;
10604 ShuffleVT = MVT::v4f32;
10605 } else if (Subtarget.hasSSSE3()) {
10606 Opcode = X86ISD::PSHUFB;
10607 ShuffleVT = MVT::v16i8;
10608 }
10609 break;
10610 case MVT::v2f64:
10611 case MVT::v2i64:
10612 if (Subtarget.hasAVX()) {
10613 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10614 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10615 Opcode = X86ISD::VPERMILPV;
10616 ShuffleVT = MVT::v2f64;
10617 } else if (Subtarget.hasSSE41()) {
10618 // SSE41 can compare v2i64 - select between indices 0 and 1.
10619 return DAG.getSelectCC(
10620 DL, IndicesVec,
10621 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10622 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10623 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10624 ISD::CondCode::SETEQ);
10625 }
10626 break;
10627 case MVT::v32i8:
10628 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10629 Opcode = X86ISD::VPERMV;
10630 else if (Subtarget.hasXOP()) {
10631 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10632 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10633 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10634 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10635 return DAG.getNode(
10636 ISD::CONCAT_VECTORS, DL, VT,
10637 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10638 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10639 } else if (Subtarget.hasAVX()) {
10640 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10641 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10642 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10643 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10644 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10645 ArrayRef<SDValue> Ops) {
10646 // Permute Lo and Hi and then select based on index range.
10647             // This works as PSHUFB uses bits[3:0] to permute elements and we don't
10648             // care about bit[7] as it's just an index vector.
10649 SDValue Idx = Ops[2];
10650 EVT VT = Idx.getValueType();
10651 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10652 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10653 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10654 ISD::CondCode::SETGT);
10655 };
10656 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10657 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10658 PSHUFBBuilder);
10659 }
10660 break;
10661 case MVT::v16i16:
10662 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10663 Opcode = X86ISD::VPERMV;
10664 else if (Subtarget.hasAVX()) {
10665 // Scale to v32i8 and perform as v32i8.
10666 IndicesVec = ScaleIndices(IndicesVec, 2);
10667 return DAG.getBitcast(
10668 VT, createVariablePermute(
10669 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10670 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10671 }
10672 break;
10673 case MVT::v8f32:
10674 case MVT::v8i32:
10675 if (Subtarget.hasAVX2())
10676 Opcode = X86ISD::VPERMV;
10677 else if (Subtarget.hasAVX()) {
10678 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10679 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10680 {0, 1, 2, 3, 0, 1, 2, 3});
10681 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10682 {4, 5, 6, 7, 4, 5, 6, 7});
10683 if (Subtarget.hasXOP())
10684 return DAG.getBitcast(
10685 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10686 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10687 // Permute Lo and Hi and then select based on index range.
10688 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10689 SDValue Res = DAG.getSelectCC(
10690 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10691 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10692 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10693 ISD::CondCode::SETGT);
10694 return DAG.getBitcast(VT, Res);
10695 }
10696 break;
10697 case MVT::v4i64:
10698 case MVT::v4f64:
10699 if (Subtarget.hasAVX512()) {
10700 if (!Subtarget.hasVLX()) {
10701 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10702 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10703 SDLoc(SrcVec));
10704 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10705 DAG, SDLoc(IndicesVec));
10706 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10707 DAG, Subtarget);
10708 return extract256BitVector(Res, 0, DAG, DL);
10709 }
10710 Opcode = X86ISD::VPERMV;
10711 } else if (Subtarget.hasAVX()) {
10712 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10713 SDValue LoLo =
10714 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10715 SDValue HiHi =
10716 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10717 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10718 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10719 if (Subtarget.hasXOP())
10720 return DAG.getBitcast(
10721 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10722 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10723 // Permute Lo and Hi and then select based on index range.
10724 // This works as VPERMILPD only uses index bit[1] to permute elements.
10725 SDValue Res = DAG.getSelectCC(
10726 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10727 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10728 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10729 ISD::CondCode::SETGT);
10730 return DAG.getBitcast(VT, Res);
10731 }
10732 break;
10733 case MVT::v64i8:
10734 if (Subtarget.hasVBMI())
10735 Opcode = X86ISD::VPERMV;
10736 break;
10737 case MVT::v32i16:
10738 if (Subtarget.hasBWI())
10739 Opcode = X86ISD::VPERMV;
10740 break;
10741 case MVT::v16f32:
10742 case MVT::v16i32:
10743 case MVT::v8f64:
10744 case MVT::v8i64:
10745 if (Subtarget.hasAVX512())
10746 Opcode = X86ISD::VPERMV;
10747 break;
10748 }
10749 if (!Opcode)
10750 return SDValue();
10751
10752   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10753          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10754          "Illegal variable permute shuffle type");
10755
10756 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10757 if (Scale > 1)
10758 IndicesVec = ScaleIndices(IndicesVec, Scale);
10759
10760 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10761 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10762
10763 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10764 SDValue Res = Opcode == X86ISD::VPERMV
10765 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10766 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10767 return DAG.getBitcast(VT, Res);
10768}
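// A minimal scalar reference model of the v32i8 AVX fallback above (a sketch;
// the function name and C-level framing are illustrative, not from this file).
// LoLo/HiHi duplicate each 16-byte half of the source, PSHUFB consumes only
// bits [3:0] of every index byte, and the compare-select picks the high half
// whenever the index byte is greater than 15. Assumes all indices are in [0, 31].
static void referencePermuteV32i8(const unsigned char Src[32],
                                  const unsigned char Idx[32],
                                  unsigned char Dst[32]) {
  for (int i = 0; i < 32; ++i) {
    // Pick the 16-byte half by index range, then index it with the low nibble,
    // mirroring the getSelectCC + PSHUFB pair built in PSHUFBBuilder.
    const unsigned char *Half = (Idx[i] > 15) ? Src + 16 : Src;
    Dst[i] = Half[Idx[i] & 0xF];
  }
}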
10769
10770// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10771// reasoned to be a permutation of a vector by indices in a non-constant vector.
10772// (build_vector (extract_elt V, (extract_elt I, 0)),
10773// (extract_elt V, (extract_elt I, 1)),
10774// ...
10775// ->
10776// (vpermv I, V)
10777//
10778// TODO: Handle undefs
10779// TODO: Utilize pshufb and zero mask blending to support more efficient
10780// construction of vectors with constant-0 elements.
10781static SDValue
10782LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10783 const X86Subtarget &Subtarget) {
10784 SDValue SrcVec, IndicesVec;
10785 // Check for a match of the permute source vector and permute index elements.
10786 // This is done by checking that the i-th build_vector operand is of the form:
10787 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10788 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10789 SDValue Op = V.getOperand(Idx);
10790 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10791 return SDValue();
10792
10793 // If this is the first extract encountered in V, set the source vector,
10794 // otherwise verify the extract is from the previously defined source
10795 // vector.
10796 if (!SrcVec)
10797 SrcVec = Op.getOperand(0);
10798 else if (SrcVec != Op.getOperand(0))
10799 return SDValue();
10800 SDValue ExtractedIndex = Op->getOperand(1);
10801 // Peek through extends.
10802 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10803 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10804 ExtractedIndex = ExtractedIndex.getOperand(0);
10805 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10806 return SDValue();
10807
10808 // If this is the first extract from the index vector candidate, set the
10809 // indices vector, otherwise verify the extract is from the previously
10810 // defined indices vector.
10811 if (!IndicesVec)
10812 IndicesVec = ExtractedIndex.getOperand(0);
10813 else if (IndicesVec != ExtractedIndex.getOperand(0))
10814 return SDValue();
10815
10816 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10817 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10818 return SDValue();
10819 }
10820
10821 SDLoc DL(V);
10822 MVT VT = V.getSimpleValueType();
10823 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10824}
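// Worked example (a sketch) for v4i32: the loop above accepts
//   (build_vector (extract_elt V, (extract_elt I, 0)),
//                 (extract_elt V, (extract_elt I, 1)),
//                 (extract_elt V, (extract_elt I, 2)),
//                 (extract_elt V, (extract_elt I, 3)))
// only when every lane extracts from the same V and the same I, with the inner
// extract index equal to the lane number; createVariablePermute then selects a
// suitable variable shuffle (VPERMILPV for v4i32 with AVX, or a PSHUFB after
// index scaling with SSSE3).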
10825
10826SDValue
10827X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10828 SDLoc dl(Op);
10829
10830 MVT VT = Op.getSimpleValueType();
10831 MVT EltVT = VT.getVectorElementType();
10832 unsigned NumElems = Op.getNumOperands();
10833
10834 // Generate vectors for predicate vectors.
10835 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10836 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10837
10838 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10839 return VectorConstant;
10840
10841 unsigned EVTBits = EltVT.getSizeInBits();
10842 APInt UndefMask = APInt::getZero(NumElems);
10843 APInt ZeroMask = APInt::getZero(NumElems);
10844 APInt NonZeroMask = APInt::getZero(NumElems);
10845 bool IsAllConstants = true;
10846 SmallSet<SDValue, 8> Values;
10847 unsigned NumConstants = NumElems;
10848 for (unsigned i = 0; i < NumElems; ++i) {
10849 SDValue Elt = Op.getOperand(i);
10850 if (Elt.isUndef()) {
10851 UndefMask.setBit(i);
10852 continue;
10853 }
10854 Values.insert(Elt);
10855 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10856 IsAllConstants = false;
10857 NumConstants--;
10858 }
10859 if (X86::isZeroNode(Elt)) {
10860 ZeroMask.setBit(i);
10861 } else {
10862 NonZeroMask.setBit(i);
10863 }
10864 }
10865
10866 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10867 if (NonZeroMask == 0) {
10868     assert(UndefMask.isAllOnes() && "Fully undef mask expected");
10869 return DAG.getUNDEF(VT);
10870 }
10871
10872 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10873
10874 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10875 // lowering to a smaller build vector and padding with undef/zero.
10876 if ((VT.is256BitVector() || VT.is512BitVector()) &&
10877 !isFoldableUseOfShuffle(BV)) {
10878 unsigned UpperElems = NumElems / 2;
10879 APInt UndefOrZeroMask = UndefMask | ZeroMask;
10880 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10881 if (NumUpperUndefsOrZeros >= UpperElems) {
10882 if (VT.is512BitVector() &&
10883 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10884 UpperElems = NumElems - (NumElems / 4);
10885 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10886 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10887 SDValue NewBV =
10888 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10889 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10890 }
10891 }
10892
10893 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10894 return AddSub;
10895 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10896 return HorizontalOp;
10897 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10898 return Broadcast;
10899 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10900 return BitOp;
10901
10902 unsigned NumZero = ZeroMask.countPopulation();
10903 unsigned NumNonZero = NonZeroMask.countPopulation();
10904
10905 // If we are inserting one variable into a vector of non-zero constants, try
10906 // to avoid loading each constant element as a scalar. Load the constants as a
10907 // vector and then insert the variable scalar element. If insertion is not
10908 // supported, fall back to a shuffle to get the scalar blended with the
10909 // constants. Insertion into a zero vector is handled as a special-case
10910 // somewhere below here.
10911 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10912 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10913 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10914 // Create an all-constant vector. The variable element in the old
10915 // build vector is replaced by undef in the constant vector. Save the
10916 // variable scalar element and its index for use in the insertelement.
10917 LLVMContext &Context = *DAG.getContext();
10918 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10919 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10920 SDValue VarElt;
10921 SDValue InsIndex;
10922 for (unsigned i = 0; i != NumElems; ++i) {
10923 SDValue Elt = Op.getOperand(i);
10924 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10925 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10926 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10927 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10928 else if (!Elt.isUndef()) {
10929         assert(!VarElt.getNode() && !InsIndex.getNode() &&
10930                "Expected one variable element in this vector");
10931 VarElt = Elt;
10932 InsIndex = DAG.getVectorIdxConstant(i, dl);
10933 }
10934 }
10935 Constant *CV = ConstantVector::get(ConstVecOps);
10936 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10937
10938     // The constants we just created may not be legal (e.g., floating point). We
10939     // must lower the vector right here because we cannot guarantee that we'll
10940 // legalize it before loading it. This is also why we could not just create
10941 // a new build vector here. If the build vector contains illegal constants,
10942 // it could get split back up into a series of insert elements.
10943 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10944 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10945 MachineFunction &MF = DAG.getMachineFunction();
10946 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10947 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10948 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10949 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10950 if (InsertC < NumEltsInLow128Bits)
10951 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10952
10953 // There's no good way to insert into the high elements of a >128-bit
10954 // vector, so use shuffles to avoid an extract/insert sequence.
10955     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10956     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10957 SmallVector<int, 8> ShuffleMask;
10958 unsigned NumElts = VT.getVectorNumElements();
10959 for (unsigned i = 0; i != NumElts; ++i)
10960 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10961 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10962 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10963 }
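  // Worked example (a sketch): for v4f32 <1.0, 2.0, x, 4.0> with one variable
  // element x, the code above loads <1.0, 2.0, undef, 4.0> from the constant
  // pool and then inserts x at index 2 (or, for a lane above the low 128 bits
  // of a wider vector, blends it in via the shuffle mask built above).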
10964
10965   // Special case for a single non-zero, non-undef element.
10966 if (NumNonZero == 1) {
10967 unsigned Idx = NonZeroMask.countTrailingZeros();
10968 SDValue Item = Op.getOperand(Idx);
10969
10970 // If we have a constant or non-constant insertion into the low element of
10971 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10972 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10973 // depending on what the source datatype is.
10974 if (Idx == 0) {
10975 if (NumZero == 0)
10976 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10977
10978 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
10979 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
10980 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
10981         assert((VT.is128BitVector() || VT.is256BitVector() ||
10982                 VT.is512BitVector()) &&
10983                "Expected an SSE value type!");
10984 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10985 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
10986 // zero vector.
10987 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10988 }
10989
10990 // We can't directly insert an i8 or i16 into a vector, so zero extend
10991 // it to i32 first.
10992 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10993 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10994 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10995 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10996 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10997 return DAG.getBitcast(VT, Item);
10998 }
10999 }
11000
11001 // Is it a vector logical left shift?
11002 if (NumElems == 2 && Idx == 1 &&
11003 X86::isZeroNode(Op.getOperand(0)) &&
11004 !X86::isZeroNode(Op.getOperand(1))) {
11005 unsigned NumBits = VT.getSizeInBits();
11006 return getVShift(true, VT,
11007 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11008 VT, Op.getOperand(1)),
11009 NumBits/2, DAG, *this, dl);
11010 }
11011
11012 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11013 return SDValue();
11014
11015 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11016 // is a non-constant being inserted into an element other than the low one,
11017 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11018 // movd/movss) to move this into the low element, then shuffle it into
11019 // place.
11020 if (EVTBits == 32) {
11021 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11022 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11023 }
11024 }
11025
11026 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11027 if (Values.size() == 1) {
11028 if (EVTBits == 32) {
11029 // Instead of a shuffle like this:
11030 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11031 // Check if it's possible to issue this instead.
11032       // shuffle (vload ptr), undef, <1, 1, 1, 1>
11033 unsigned Idx = NonZeroMask.countTrailingZeros();
11034 SDValue Item = Op.getOperand(Idx);
11035 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11036 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11037 }
11038 return SDValue();
11039 }
11040
11041 // A vector full of immediates; various special cases are already
11042 // handled, so this is best done with a single constant-pool load.
11043 if (IsAllConstants)
11044 return SDValue();
11045
11046 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11047 return V;
11048
11049 // See if we can use a vector load to get all of the elements.
11050 {
11051 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11052 if (SDValue LD =
11053 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11054 return LD;
11055 }
11056
11057 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11058 // build_vector and broadcast it.
11059 // TODO: We could probably generalize this more.
11060 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11061 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11062 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11063 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11064 // Make sure all the even/odd operands match.
11065 for (unsigned i = 2; i != NumElems; ++i)
11066 if (Ops[i % 2] != Op.getOperand(i))
11067 return false;
11068 return true;
11069 };
11070 if (CanSplat(Op, NumElems, Ops)) {
11071 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11072 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11073 // Create a new build vector and cast to v2i64/v2f64.
11074 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11075 DAG.getBuildVector(NarrowVT, dl, Ops));
11076 // Broadcast from v2i64/v2f64 and cast to final VT.
11077 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11078 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11079 NewBV));
11080 }
11081 }
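  // Worked example (a sketch): for v8f32 (a, b, a, b, a, b, a, b) the code
  // above builds the narrow v4f32 (a, b, undef, undef), bitcasts it to v2f64,
  // broadcasts that to v4f64 with VBROADCAST, and bitcasts the result back to
  // v8f32, replacing an 8-element build_vector with a single 64-bit broadcast.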
11082
11083 // For AVX-length vectors, build the individual 128-bit pieces and use
11084 // shuffles to put them in place.
11085 if (VT.getSizeInBits() > 128) {
11086 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11087
11088 // Build both the lower and upper subvector.
11089 SDValue Lower =
11090 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11091 SDValue Upper = DAG.getBuildVector(
11092 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11093
11094 // Recreate the wider vector with the lower and upper part.
11095 return concatSubVectors(Lower, Upper, DAG, dl);
11096 }
11097
11098 // Let legalizer expand 2-wide build_vectors.
11099 if (EVTBits == 64) {
11100 if (NumNonZero == 1) {
11101 // One half is zero or undef.
11102 unsigned Idx = NonZeroMask.countTrailingZeros();
11103 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11104 Op.getOperand(Idx));
11105 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11106 }
11107 return SDValue();
11108 }
11109
11110 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11111 if (EVTBits == 8 && NumElems == 16)
11112 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11113 DAG, Subtarget))
11114 return V;
11115
11116 if (EltVT == MVT::i16 && NumElems == 8)
11117 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11118 DAG, Subtarget))
11119 return V;
11120
11121 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11122 if (EVTBits == 32 && NumElems == 4)
11123 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11124 return V;
11125
11126 // If element VT is == 32 bits, turn it into a number of shuffles.
11127 if (NumElems == 4 && NumZero > 0) {
11128 SmallVector<SDValue, 8> Ops(NumElems);
11129 for (unsigned i = 0; i < 4; ++i) {
11130 bool isZero = !NonZeroMask[i];
11131 if (isZero)
11132 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11133 else
11134 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11135 }
11136
11137 for (unsigned i = 0; i < 2; ++i) {
11138 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11139       default: llvm_unreachable("Unexpected NonZero count");
11140 case 0:
11141 Ops[i] = Ops[i*2]; // Must be a zero vector.
11142 break;
11143 case 1:
11144 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11145 break;
11146 case 2:
11147 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11148 break;
11149 case 3:
11150 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11151 break;
11152 }
11153 }
11154
11155 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11156 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11157 int MaskVec[] = {
11158 Reverse1 ? 1 : 0,
11159 Reverse1 ? 0 : 1,
11160 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11161 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11162 };
11163 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11164 }
11165
11166   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11167
11168 // Check for a build vector from mostly shuffle plus few inserting.
11169 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11170 return Sh;
11171
11172 // For SSE 4.1, use insertps to put the high elements into the low element.
11173 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11174 SDValue Result;
11175 if (!Op.getOperand(0).isUndef())
11176 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11177 else
11178 Result = DAG.getUNDEF(VT);
11179
11180 for (unsigned i = 1; i < NumElems; ++i) {
11181 if (Op.getOperand(i).isUndef()) continue;
11182 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11183 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11184 }
11185 return Result;
11186 }
11187
11188 // Otherwise, expand into a number of unpckl*, start by extending each of
11189 // our (non-undef) elements to the full vector width with the element in the
11190 // bottom slot of the vector (which generates no code for SSE).
11191 SmallVector<SDValue, 8> Ops(NumElems);
11192 for (unsigned i = 0; i < NumElems; ++i) {
11193 if (!Op.getOperand(i).isUndef())
11194 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11195 else
11196 Ops[i] = DAG.getUNDEF(VT);
11197 }
11198
11199 // Next, we iteratively mix elements, e.g. for v4f32:
11200 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11201 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11202 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11203 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11204 // Generate scaled UNPCKL shuffle mask.
11205 SmallVector<int, 16> Mask;
11206 for(unsigned i = 0; i != Scale; ++i)
11207 Mask.push_back(i);
11208 for (unsigned i = 0; i != Scale; ++i)
11209 Mask.push_back(NumElems+i);
11210 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11211
11212 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11213 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11214 }
11215 return Ops[0];
11216}
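// Worked example (a sketch) of the final unpckl expansion above for v4f32:
//   Scale = 1 builds mask {0, 4, undef, undef}:
//     X = shuffle(Ops[0], Ops[1])   (an unpcklps of elements 0 and 1)
//     Y = shuffle(Ops[2], Ops[3])   (an unpcklps of elements 2 and 3)
//   Scale = 2 builds mask {0, 1, 4, 5}:
//     result = shuffle(X, Y)        (an unpcklpd), matching the <3, 2, 1, 0>
//     result shown in the step comments above.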
11217
11218// 256-bit AVX can use the vinsertf128 instruction
11219// to create 256-bit vectors from two other 128-bit ones.
11220// TODO: Detect subvector broadcast here instead of DAG combine?
11221static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11222 const X86Subtarget &Subtarget) {
11223 SDLoc dl(Op);
11224 MVT ResVT = Op.getSimpleValueType();
11225
11226   assert((ResVT.is256BitVector() ||
11227           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11228
11229 unsigned NumOperands = Op.getNumOperands();
11230 unsigned NumZero = 0;
11231 unsigned NumNonZero = 0;
11232 unsigned NonZeros = 0;
11233 for (unsigned i = 0; i != NumOperands; ++i) {
11234 SDValue SubVec = Op.getOperand(i);
11235 if (SubVec.isUndef())
11236 continue;
11237 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11238 ++NumZero;
11239 else {
11240       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11241 NonZeros |= 1 << i;
11242 ++NumNonZero;
11243 }
11244 }
11245
11246 // If we have more than 2 non-zeros, build each half separately.
11247 if (NumNonZero > 2) {
11248 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11249 ArrayRef<SDUse> Ops = Op->ops();
11250 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11251 Ops.slice(0, NumOperands/2));
11252 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11253 Ops.slice(NumOperands/2));
11254 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11255 }
11256
11257 // Otherwise, build it up through insert_subvectors.
11258 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11259 : DAG.getUNDEF(ResVT);
11260
11261 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11262 unsigned NumSubElems = SubVT.getVectorNumElements();
11263 for (unsigned i = 0; i != NumOperands; ++i) {
11264 if ((NonZeros & (1 << i)) == 0)
11265 continue;
11266
11267 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11268 Op.getOperand(i),
11269 DAG.getIntPtrConstant(i * NumSubElems, dl));
11270 }
11271
11272 return Vec;
11273}
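// Worked example (a sketch): concat_vectors (zeros, X, zeros, Y) of four v4i32
// subvectors into v16i32 has two non-zero operands, so the code above starts
// from getZeroVector(v16i32) and emits two INSERT_SUBVECTORs at element
// offsets 4 (X) and 12 (Y); with more than two non-zero operands it would
// instead recurse on each 256-bit half.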
11274
11275 // Lower a CONCAT_VECTORS of vXi1 vectors: insert the non-zero/non-undef
11276 // subvectors into a zero or undef k-register, preferring a single KSHIFTL
11277 // where the generic insert_subvector lowering would need two kshifts.
11278// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11279static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11280 const X86Subtarget &Subtarget,
11281 SelectionDAG & DAG) {
11282 SDLoc dl(Op);
11283 MVT ResVT = Op.getSimpleValueType();
11284 unsigned NumOperands = Op.getNumOperands();
11285
11286   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11287          "Unexpected number of operands in CONCAT_VECTORS");
11288
11289 uint64_t Zeros = 0;
11290 uint64_t NonZeros = 0;
11291 for (unsigned i = 0; i != NumOperands; ++i) {
11292 SDValue SubVec = Op.getOperand(i);
11293 if (SubVec.isUndef())
11294 continue;
11295     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11296 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11297 Zeros |= (uint64_t)1 << i;
11298 else
11299 NonZeros |= (uint64_t)1 << i;
11300 }
11301
11302 unsigned NumElems = ResVT.getVectorNumElements();
11303
11304   // If we are inserting a non-zero vector and there are zeros in the LSBs and
11305   // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11306   // insert_subvector would give us two kshifts.
11307 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11308 Log2_64(NonZeros) != NumOperands - 1) {
11309 MVT ShiftVT = ResVT;
11310 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11311 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11312 unsigned Idx = Log2_64(NonZeros);
11313 SDValue SubVec = Op.getOperand(Idx);
11314 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11315 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11316 DAG.getUNDEF(ShiftVT), SubVec,
11317 DAG.getIntPtrConstant(0, dl));
11318 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11319 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11320 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11321 DAG.getIntPtrConstant(0, dl));
11322 }
11323
11324 // If there are zero or one non-zeros we can handle this very simply.
11325 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11326 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11327 if (!NonZeros)
11328 return Vec;
11329 unsigned Idx = Log2_64(NonZeros);
11330 SDValue SubVec = Op.getOperand(Idx);
11331 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11332 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11333 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11334 }
11335
11336 if (NumOperands > 2) {
11337 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11338 ArrayRef<SDUse> Ops = Op->ops();
11339 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11340 Ops.slice(0, NumOperands/2));
11341 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11342 Ops.slice(NumOperands/2));
11343 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11344 }
11345
11346   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
11347
11348 if (ResVT.getVectorNumElements() >= 16)
11349 return Op; // The operation is legal with KUNPCK
11350
11351 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11352 DAG.getUNDEF(ResVT), Op.getOperand(0),
11353 DAG.getIntPtrConstant(0, dl));
11354 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11355 DAG.getIntPtrConstant(NumElems/2, dl));
11356}
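// Worked example (a sketch) of the KSHIFTL path above: for
//   v8i1 concat_vectors (zeroinitializer, X, undef, undef)   ; X is v2i1
// NonZeros = 0b0010 and Zeros = 0b0001, so X is inserted at element 0 of a
// wider k-register, shifted left by Idx * SubVecNumElts = 2 with KSHIFTL
// (which shifts zeros in below it), and the v8i1 result is extracted from
// element 0, using one kshift where the generic lowering would emit two.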
11357
11358static SDValue LowerCONCAT_VECTORS(SDValue Op,
11359 const X86Subtarget &Subtarget,
11360 SelectionDAG &DAG) {
11361 MVT VT = Op.getSimpleValueType();
11362 if (VT.getVectorElementType() == MVT::i1)
11363 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11364
11365   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11366          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11367                                   Op.getNumOperands() == 4)));
11368
11369 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11370 // from two other 128-bit ones.
11371
11372 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11373 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11374}
11375
11376//===----------------------------------------------------------------------===//
11377// Vector shuffle lowering
11378//
11379// This is an experimental code path for lowering vector shuffles on x86. It is
11380// designed to handle arbitrary vector shuffles and blends, gracefully
11381// degrading performance as necessary. It works hard to recognize idiomatic
11382// shuffles and lower them to optimal instruction patterns without leaving
11383// a framework that allows reasonably efficient handling of all vector shuffle
11384// patterns.
11385//===----------------------------------------------------------------------===//
11386
11387/// Tiny helper function to identify a no-op mask.
11388///
11389/// This is a somewhat boring predicate function. It checks whether the mask
11390/// array input, which is assumed to be a single-input shuffle mask of the kind
11391/// used by the X86 shuffle instructions (not a fully general
11392 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
11393 /// in-place shuffle are no-ops.
11394static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11395 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11396     assert(Mask[i] >= -1 && "Out of bound mask element!");
11397 if (Mask[i] >= 0 && Mask[i] != i)
11398 return false;
11399 }
11400 return true;
11401}
11402
11403/// Test whether there are elements crossing LaneSizeInBits lanes in this
11404/// shuffle mask.
11405///
11406/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11407/// and we routinely test for these.
11408static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11409 unsigned ScalarSizeInBits,
11410 ArrayRef<int> Mask) {
11411   assert(LaneSizeInBits && ScalarSizeInBits &&
11412          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11413          "Illegal shuffle lane size");
11414 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11415 int Size = Mask.size();
11416 for (int i = 0; i < Size; ++i)
11417 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11418 return true;
11419 return false;
11420}
11421
11422/// Test whether there are elements crossing 128-bit lanes in this
11423/// shuffle mask.
11424static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11425 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11426}
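// Example (a sketch): for v8f32 (two 128-bit lanes of four elements), the mask
// {1, 0, 3, 2, 5, 4, 7, 6} stays within its lanes and is not lane-crossing,
// while {4, 5, 6, 7, 0, 1, 2, 3} moves every element into the other lane and
// therefore is.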
11427
11428/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11429/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11430/// better support 'repeated mask + lane permute' style shuffles.
11431static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11432 unsigned ScalarSizeInBits,
11433 ArrayRef<int> Mask) {
11434   assert(LaneSizeInBits && ScalarSizeInBits &&
11435          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11436          "Illegal shuffle lane size");
11437 int NumElts = Mask.size();
11438 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11439 int NumLanes = NumElts / NumEltsPerLane;
11440 if (NumLanes > 1) {
11441 for (int i = 0; i != NumLanes; ++i) {
11442 int SrcLane = -1;
11443 for (int j = 0; j != NumEltsPerLane; ++j) {
11444 int M = Mask[(i * NumEltsPerLane) + j];
11445 if (M < 0)
11446 continue;
11447 int Lane = (M % NumElts) / NumEltsPerLane;
11448 if (SrcLane >= 0 && SrcLane != Lane)
11449 return true;
11450 SrcLane = Lane;
11451 }
11452 }
11453 }
11454 return false;
11455}
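// Example (a sketch) of the distinction drawn above: for v8f32, the mask
// {4, 5, 6, 7, 0, 1, 2, 3} is lane-crossing but NOT multi-lane (each 128-bit
// result lane reads from a single source lane, which suits a repeated mask
// plus lane permute), whereas {0, 4, 1, 5, 2, 6, 3, 7} mixes both source
// lanes inside each result lane and IS multi-lane.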
11456
11457/// Test whether a shuffle mask is equivalent within each sub-lane.
11458///
11459/// This checks a shuffle mask to see if it is performing the same
11460/// lane-relative shuffle in each sub-lane. This trivially implies
11461/// that it is also not lane-crossing. It may however involve a blend from the
11462/// same lane of a second vector.
11463///
11464/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11465/// non-trivial to compute in the face of undef lanes. The representation is
11466/// suitable for use with existing 128-bit shuffles as entries from the second
11467/// vector have been remapped to [LaneSize, 2*LaneSize).
11468static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11469 ArrayRef<int> Mask,
11470 SmallVectorImpl<int> &RepeatedMask) {
11471 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11472 RepeatedMask.assign(LaneSize, -1);
11473 int Size = Mask.size();
11474 for (int i = 0; i < Size; ++i) {
11475     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11476 if (Mask[i] < 0)
11477 continue;
11478 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11479 // This entry crosses lanes, so there is no way to model this shuffle.
11480 return false;
11481
11482 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11483 // Adjust second vector indices to start at LaneSize instead of Size.
11484 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11485 : Mask[i] % LaneSize + LaneSize;
11486 if (RepeatedMask[i % LaneSize] < 0)
11487 // This is the first non-undef entry in this slot of a 128-bit lane.
11488 RepeatedMask[i % LaneSize] = LocalM;
11489 else if (RepeatedMask[i % LaneSize] != LocalM)
11490 // Found a mismatch with the repeated mask.
11491 return false;
11492 }
11493 return true;
11494}
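// Example (a sketch): for v8i32 with 128-bit lanes, the mask
// {1, 0, 3, 2, 5, 4, 7, 6} repeats as RepeatedMask = {1, 0, 3, 2}, and a
// two-input mask such as {0, 8, 1, 9, 4, 12, 5, 13} repeats as {0, 4, 1, 5}
// with the second vector's entries remapped into [LaneSize, 2*LaneSize).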
11495
11496/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11497static bool
11498is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11499 SmallVectorImpl<int> &RepeatedMask) {
11500 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11501}
11502
11503static bool
11504is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11505 SmallVector<int, 32> RepeatedMask;
11506 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11507}
11508
11509/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11510static bool
11511is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11512 SmallVectorImpl<int> &RepeatedMask) {
11513 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11514}
11515
11516/// Test whether a target shuffle mask is equivalent within each sub-lane.
11517/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11518static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11519 unsigned EltSizeInBits,
11520 ArrayRef<int> Mask,
11521 SmallVectorImpl<int> &RepeatedMask) {
11522 int LaneSize = LaneSizeInBits / EltSizeInBits;
11523 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11524 int Size = Mask.size();
11525 for (int i = 0; i < Size; ++i) {
11526     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11527 if (Mask[i] == SM_SentinelUndef)
11528 continue;
11529 if (Mask[i] == SM_SentinelZero) {
11530 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11531 return false;
11532 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11533 continue;
11534 }
11535 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11536 // This entry crosses lanes, so there is no way to model this shuffle.
11537 return false;
11538
11539 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11540 // later vector indices to start at multiples of LaneSize instead of Size.
11541 int LaneM = Mask[i] / Size;
11542 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11543 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11544 // This is the first non-undef entry in this slot of a 128-bit lane.
11545 RepeatedMask[i % LaneSize] = LocalM;
11546 else if (RepeatedMask[i % LaneSize] != LocalM)
11547 // Found a mismatch with the repeated mask.
11548 return false;
11549 }
11550 return true;
11551}
11552
11553/// Test whether a target shuffle mask is equivalent within each sub-lane.
11554/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11555static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11556 ArrayRef<int> Mask,
11557 SmallVectorImpl<int> &RepeatedMask) {
11558 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11559 Mask, RepeatedMask);
11560}
11561
11562/// Checks whether the vector elements referenced by two shuffle masks are
11563/// equivalent.
11564static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11565 int Idx, int ExpectedIdx) {
11566 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11567 ExpectedIdx < MaskSize && "Out of range element index");
11568 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11569 return false;
11570
11571 switch (Op.getOpcode()) {
11572 case ISD::BUILD_VECTOR:
11573 // If the values are build vectors, we can look through them to find
11574 // equivalent inputs that make the shuffles equivalent.
11575 // TODO: Handle MaskSize != Op.getNumOperands()?
11576 if (MaskSize == (int)Op.getNumOperands() &&
11577 MaskSize == (int)ExpectedOp.getNumOperands())
11578 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11579 break;
11580 case X86ISD::VBROADCAST:
11581 case X86ISD::VBROADCAST_LOAD:
11582 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11583 return (Op == ExpectedOp &&
11584 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11585 case X86ISD::HADD:
11586 case X86ISD::HSUB:
11587 case X86ISD::FHADD:
11588 case X86ISD::FHSUB:
11589 case X86ISD::PACKSS:
11590 case X86ISD::PACKUS:
11591 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11592 // TODO: Handle MaskSize != NumElts?
11593 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11594 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11595 MVT VT = Op.getSimpleValueType();
11596 int NumElts = VT.getVectorNumElements();
11597 if (MaskSize == NumElts) {
11598 int NumLanes = VT.getSizeInBits() / 128;
11599 int NumEltsPerLane = NumElts / NumLanes;
11600 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11601 bool SameLane =
11602 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11603 bool SameElt =
11604 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11605 return SameLane && SameElt;
11606 }
11607 }
11608 break;
11609 }
11610
11611 return false;
11612}
11613
11614/// Checks whether a shuffle mask is equivalent to an explicit list of
11615/// arguments.
11616///
11617/// This is a fast way to test a shuffle mask against a fixed pattern:
11618///
11619/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11620///
11621/// It returns true if the mask is exactly as wide as the argument list, and
11622/// each element of the mask is either -1 (signifying undef) or the value given
11623/// in the argument.
11624static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11625 SDValue V1 = SDValue(),
11626 SDValue V2 = SDValue()) {
11627 int Size = Mask.size();
11628 if (Size != (int)ExpectedMask.size())
11629 return false;
11630
11631 for (int i = 0; i < Size; ++i) {
11632 assert(Mask[i] >= -1 && "Out of bound mask element!");
11633 int MaskIdx = Mask[i];
11634 int ExpectedIdx = ExpectedMask[i];
11635 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11636 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11637 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11638 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11639 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11640 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11641 return false;
11642 }
11643 }
11644 return true;
11645}
11646
11647/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11648///
11649/// The masks must be exactly the same width.
11650///
11651/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11652/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11653///
11654/// SM_SentinelZero is accepted as a valid negative index but must match in
11655/// both.
11656static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11657 ArrayRef<int> ExpectedMask,
11658 SDValue V1 = SDValue(),
11659 SDValue V2 = SDValue()) {
11660 int Size = Mask.size();
11661 if (Size != (int)ExpectedMask.size())
11662 return false;
11663 assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11664 "Illegal target shuffle mask");
11665
11666 // Check for out-of-range target shuffle mask indices.
11667 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11668 return false;
11669
11670 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11671 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11672 V1 = SDValue();
11673 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11674 V2 = SDValue();
11675
11676 for (int i = 0; i < Size; ++i) {
11677 int MaskIdx = Mask[i];
11678 int ExpectedIdx = ExpectedMask[i];
11679 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11680 continue;
11681 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11682 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11683 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11684 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11685 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11686 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11687 continue;
11688 }
11689 // TODO - handle SM_Sentinel equivalences.
11690 return false;
11691 }
11692 return true;
11693}
11694
11695// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
11696static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11697 SDValue Cond, bool IsBLENDV = false) {
11698 EVT CondVT = Cond.getValueType();
11699 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11700 unsigned NumElts = CondVT.getVectorNumElements();
11701
11702 APInt UndefElts;
11703 SmallVector<APInt, 32> EltBits;
11704 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11705 true, false))
11706 return false;
11707
11708 Mask.resize(NumElts, SM_SentinelUndef);
11709
11710 for (int i = 0; i != (int)NumElts; ++i) {
11711 Mask[i] = i;
11712 // Arbitrarily choose from the 2nd operand if the select condition element
11713 // is undef.
11714 // TODO: Can we do better by matching patterns such as even/odd?
11715 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
11716 (IsBLENDV && EltBits[i].isNonNegative()))
11717 Mask[i] += NumElts;
11718 }
11719
11720 return true;
11721}
11722
11723// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11724// instructions.
11725static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11726 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11727 return false;
11728
11729 SmallVector<int, 8> Unpcklwd;
11730 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11731 /* Unary = */ false);
11732 SmallVector<int, 8> Unpckhwd;
11733 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11734 /* Unary = */ false);
11735 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11736 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11737 return IsUnpackwdMask;
11738}
11739
11740static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11741 // Create 128-bit vector type based on mask size.
11742 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11743 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11744
11745 // We can't assume a canonical shuffle mask, so try the commuted version too.
11746 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11747 ShuffleVectorSDNode::commuteMask(CommutedMask);
11748
11749 // Match any of unary/binary or low/high.
11750 for (unsigned i = 0; i != 4; ++i) {
11751 SmallVector<int, 16> UnpackMask;
11752 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11753 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11754 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11755 return true;
11756 }
11757 return false;
11758}
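
For reference, these are the interleaving patterns the two helpers above compare against; a standalone sketch for a single 128-bit lane (createUnpackShuffleMask generalises this per lane, with second-operand indices offset by the element count):

#include <vector>

// Low/high unpack mask for a single 128-bit lane of NumElts elements.
static std::vector<int> unpack128Mask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);                         // element from V1
    Mask.push_back(Base + i + (Unary ? 0 : NumElts)); // from V2 (or V1 if unary)
  }
  return Mask;
}

// unpack128Mask(8, /*Lo=*/true,  /*Unary=*/false) -> {0, 8, 1, 9, 2, 10, 3, 11}  (PUNPCKLWD)
// unpack128Mask(8, /*Lo=*/false, /*Unary=*/false) -> {4, 12, 5, 13, 6, 14, 7, 15} (PUNPCKHWD)
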
11759
11760/// Return true if a shuffle mask chooses elements identically in its top and
11761/// bottom halves. For example, any splat mask has the same top and bottom
11762/// halves. If an element is undefined in only one half of the mask, the halves
11763/// are not considered identical.
11764static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11765 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11766 unsigned HalfSize = Mask.size() / 2;
11767 for (unsigned i = 0; i != HalfSize; ++i) {
11768 if (Mask[i] != Mask[i + HalfSize])
11769 return false;
11770 }
11771 return true;
11772}
11773
11774/// Get a 4-lane 8-bit shuffle immediate for a mask.
11775///
11776/// This helper function produces an 8-bit shuffle immediate corresponding to
11777/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11778/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11779/// example.
11780///
11781/// NB: We rely heavily on "undef" masks preserving the input lane.
11782static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11783 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11784 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11785 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11786 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11787 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11788
11789 // If the mask only uses one non-undef element, then fully 'splat' it to
11790 // improve later broadcast matching.
11791 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11792 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11793
11794 int FirstElt = Mask[FirstIndex];
11795 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11796 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11797
11798 unsigned Imm = 0;
11799 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11800 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11801 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11802 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11803 return Imm;
11804}
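
A compile-time sketch of the immediate encoding computed above (without the single-element splat special case; undef lanes simply keep their identity index):

// Two bits per destination lane, least-significant lane first.
constexpr unsigned v4ShuffleImm(int M0, int M1, int M2, int M3) {
  return ((M0 < 0 ? 0 : M0) << 0) | ((M1 < 0 ? 1 : M1) << 2) |
         ((M2 < 0 ? 2 : M2) << 4) | ((M3 < 0 ? 3 : M3) << 6);
}
static_assert(v4ShuffleImm(3, 2, 1, 0) == 0x1B, "reverse mask -> 0b00011011");
static_assert(v4ShuffleImm(0, -1, 2, -1) == 0xE4, "undef lanes 1 and 3 keep their index");
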
11805
11806static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11807 SelectionDAG &DAG) {
11808 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11809}
11810
11811// The shuffle result has the form:
11812// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
11813// Each element of Zeroable corresponds to a particular element of Mask, as
11814// described in the computeZeroableShuffleElements function.
11815//
11816// The function looks for a sub-mask whose nonzero elements are in increasing
11817// order. If such a sub-mask exists, the function returns true.
11818static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11819 ArrayRef<int> Mask, const EVT &VectorType,
11820 bool &IsZeroSideLeft) {
11821 int NextElement = -1;
11822 // Check if the Mask's nonzero elements are in increasing order.
11823 for (int i = 0, e = Mask.size(); i < e; i++) {
11824 // Checks if the mask's zeros elements are built from only zeros.
11825 assert(Mask[i] >= -1 && "Out of bound mask element!");
11826 if (Mask[i] < 0)
11827 return false;
11828 if (Zeroable[i])
11829 continue;
11830 // Find the lowest non zero element
11831 if (NextElement < 0) {
11832 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11833 IsZeroSideLeft = NextElement != 0;
11834 }
11835 // Exit if the mask's non zero elements are not in increasing order.
11836 if (NextElement != Mask[i])
11837 return false;
11838 NextElement++;
11839 }
11840 return true;
11841}
11842
11843/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11844static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11845 ArrayRef<int> Mask, SDValue V1,
11846 SDValue V2, const APInt &Zeroable,
11847 const X86Subtarget &Subtarget,
11848 SelectionDAG &DAG) {
11849 int Size = Mask.size();
11850 int LaneSize = 128 / VT.getScalarSizeInBits();
11851 const int NumBytes = VT.getSizeInBits() / 8;
11852 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11853
11854 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11855 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11856 (Subtarget.hasBWI() && VT.is512BitVector()));
11857
11858 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11859 // Sign bit set in i8 mask means zero element.
11860 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11861
11862 SDValue V;
11863 for (int i = 0; i < NumBytes; ++i) {
11864 int M = Mask[i / NumEltBytes];
11865 if (M < 0) {
11866 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11867 continue;
11868 }
11869 if (Zeroable[i / NumEltBytes]) {
11870 PSHUFBMask[i] = ZeroMask;
11871 continue;
11872 }
11873
11874 // We can only use a single input of V1 or V2.
11875 SDValue SrcV = (M >= Size ? V2 : V1);
11876 if (V && V != SrcV)
11877 return SDValue();
11878 V = SrcV;
11879 M %= Size;
11880
11881 // PSHUFB can't cross lanes, ensure this doesn't happen.
11882 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11883 return SDValue();
11884
11885 M = M % LaneSize;
11886 M = M * NumEltBytes + (i % NumEltBytes);
11887 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11888 }
11889 assert(V && "Failed to find a source input");
11890
11891 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11892 return DAG.getBitcast(
11893 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11894 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11895}
11896
11897static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11898 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11899 const SDLoc &dl);
11900
11901// X86 has a dedicated shuffle that can be lowered to VEXPAND
11902static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11903 const APInt &Zeroable,
11904 ArrayRef<int> Mask, SDValue &V1,
11905 SDValue &V2, SelectionDAG &DAG,
11906 const X86Subtarget &Subtarget) {
11907 bool IsLeftZeroSide = true;
11908 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11909 IsLeftZeroSide))
11910 return SDValue();
11911 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11912 MVT IntegerType =
11913 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11914 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11915 unsigned NumElts = VT.getVectorNumElements();
11916 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11917 "Unexpected number of vector elements");
11918 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11919 Subtarget, DAG, DL);
11920 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11921 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11922 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11923}
11924
11925static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11926 unsigned &UnpackOpcode, bool IsUnary,
11927 ArrayRef<int> TargetMask, const SDLoc &DL,
11928 SelectionDAG &DAG,
11929 const X86Subtarget &Subtarget) {
11930 int NumElts = VT.getVectorNumElements();
11931
11932 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11933 for (int i = 0; i != NumElts; i += 2) {
11934 int M1 = TargetMask[i + 0];
11935 int M2 = TargetMask[i + 1];
11936 Undef1 &= (SM_SentinelUndef == M1);
11937 Undef2 &= (SM_SentinelUndef == M2);
11938 Zero1 &= isUndefOrZero(M1);
11939 Zero2 &= isUndefOrZero(M2);
11940 }
11941 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11942 "Zeroable shuffle detected");
11943
11944 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11945 SmallVector<int, 64> Unpckl, Unpckh;
11946 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11947 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11948 (IsUnary ? V1 : V2))) {
11949 UnpackOpcode = X86ISD::UNPCKL;
11950 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11951 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11952 return true;
11953 }
11954
11955 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11956 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11957 (IsUnary ? V1 : V2))) {
11958 UnpackOpcode = X86ISD::UNPCKH;
11959 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11960 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11961 return true;
11962 }
11963
11964 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11965 if (IsUnary && (Zero1 || Zero2)) {
11966 // Don't bother if we can blend instead.
11967 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11968 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11969 return false;
11970
11971 bool MatchLo = true, MatchHi = true;
11972 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11973 int M = TargetMask[i];
11974
11975 // Ignore if the input is known to be zero or the index is undef.
11976 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11977 (M == SM_SentinelUndef))
11978 continue;
11979
11980 MatchLo &= (M == Unpckl[i]);
11981 MatchHi &= (M == Unpckh[i]);
11982 }
11983
11984 if (MatchLo || MatchHi) {
11985 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11986 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11987 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11988 return true;
11989 }
11990 }
11991
11992 // If a binary shuffle, commute and try again.
11993 if (!IsUnary) {
11994 ShuffleVectorSDNode::commuteMask(Unpckl);
11995 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11996 UnpackOpcode = X86ISD::UNPCKL;
11997 std::swap(V1, V2);
11998 return true;
11999 }
12000
12001 ShuffleVectorSDNode::commuteMask(Unpckh);
12002 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
12003 UnpackOpcode = X86ISD::UNPCKH;
12004 std::swap(V1, V2);
12005 return true;
12006 }
12007 }
12008
12009 return false;
12010}
12011
12012// X86 has dedicated unpack instructions that can handle specific blend
12013// operations: UNPCKH and UNPCKL.
12014static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12015 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12016 SelectionDAG &DAG) {
12017 SmallVector<int, 8> Unpckl;
12018 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12019 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12020 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12021
12022 SmallVector<int, 8> Unpckh;
12023 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12024 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12025 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12026
12027 // Commute and try again.
12028 ShuffleVectorSDNode::commuteMask(Unpckl);
12029 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12030 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12031
12032 ShuffleVectorSDNode::commuteMask(Unpckh);
12033 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12034 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12035
12036 return SDValue();
12037}
12038
12039/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12040/// followed by unpack 256-bit.
12041static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12042 ArrayRef<int> Mask, SDValue V1,
12043 SDValue V2, SelectionDAG &DAG) {
12044 SmallVector<int, 32> Unpckl, Unpckh;
12045 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12046 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12047
12048 unsigned UnpackOpcode;
12049 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12050 UnpackOpcode = X86ISD::UNPCKL;
12051 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12052 UnpackOpcode = X86ISD::UNPCKH;
12053 else
12054 return SDValue();
12055
12056 // This is a "natural" unpack operation (rather than the 128-bit sectored
12057 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12058 // input in order to use the x86 instruction.
12059 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12060 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12061 V1 = DAG.getBitcast(VT, V1);
12062 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12063}
12064
12065// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12066// source into the lower elements and zeroing the upper elements.
12067static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12068 ArrayRef<int> Mask, const APInt &Zeroable,
12069 const X86Subtarget &Subtarget) {
12070 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12071 return false;
12072
12073 unsigned NumElts = Mask.size();
12074 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12075 unsigned MaxScale = 64 / EltSizeInBits;
12076
12077 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12078 unsigned SrcEltBits = EltSizeInBits * Scale;
12079 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12080 continue;
12081 unsigned NumSrcElts = NumElts / Scale;
12082 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12083 continue;
12084 unsigned UpperElts = NumElts - NumSrcElts;
12085 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12086 continue;
12087 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12088 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12089 DstVT = MVT::getIntegerVT(EltSizeInBits);
12090 if ((NumSrcElts * EltSizeInBits) >= 128) {
12091 // ISD::TRUNCATE
12092 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12093 } else {
12094 // X86ISD::VTRUNC
12095 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12096 }
12097 return true;
12098 }
12099
12100 return false;
12101}
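
To make the accepted mask shape concrete, here is a standalone compile-time check specialised to VT = v16i8 and Scale = 2 (purely illustrative; ZeroableBits stands in for the Zeroable APInt):

// Low 8 elements must step by 2 (undef allowed); the upper 8 must be zeroable.
constexpr bool isVTRUNCMaskScale2(const int (&Mask)[16], unsigned ZeroableBits) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != 2 * i)
      return false;
  for (int i = 8; i != 16; ++i)
    if (!(ZeroableBits & (1u << i)))
      return false;
  return true;
}
constexpr int TruncMask[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                               -1, -1, -1, -1, -1, -1, -1, -1};
static_assert(isVTRUNCMaskScale2(TruncMask, 0xFF00u),
              "a v8i16 -> v8i8 truncate into the low half, upper half zeroed");
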
12102
12103// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12104// element padding to the final DstVT.
12105static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12106 const X86Subtarget &Subtarget,
12107 SelectionDAG &DAG, bool ZeroUppers) {
12108 MVT SrcVT = Src.getSimpleValueType();
12109 MVT DstSVT = DstVT.getScalarType();
12110 unsigned NumDstElts = DstVT.getVectorNumElements();
12111 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12112 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12113
12114 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12115 return SDValue();
12116
12117 // Perform a direct ISD::TRUNCATE if possible.
12118 if (NumSrcElts == NumDstElts)
12119 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12120
12121 if (NumSrcElts > NumDstElts) {
12122 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12123 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12124 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12125 }
12126
12127 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12128 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12129 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12130 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12131 DstVT.getSizeInBits());
12132 }
12133
12134 // Non-VLX targets must truncate from a 512-bit type, so we need to
12135 // widen, truncate and then possibly extract the original subvector.
12136 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12137 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12138 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12139 }
12140
12141 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12142 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12143 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12144 if (DstVT != TruncVT)
12145 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12146 DstVT.getSizeInBits());
12147 return Trunc;
12148}
12149
12150// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12151//
12152// An example is the following:
12153//
12154// t0: ch = EntryToken
12155// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12156// t25: v4i32 = truncate t2
12157// t41: v8i16 = bitcast t25
12158// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12159// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12160// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12161// t18: v2i64 = bitcast t51
12162//
12163// One can just use a single vpmovdw instruction; without avx512vl we need to
12164// use the zmm variant and extract the lower subvector, padding with zeroes.
12165// TODO: Merge with lowerShuffleAsVTRUNC.
12166static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12167 SDValue V2, ArrayRef<int> Mask,
12168 const APInt &Zeroable,
12169 const X86Subtarget &Subtarget,
12170 SelectionDAG &DAG) {
12171 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12172 if (!Subtarget.hasAVX512())
12173 return SDValue();
12174
12175 unsigned NumElts = VT.getVectorNumElements();
12176 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12177 unsigned MaxScale = 64 / EltSizeInBits;
12178 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12179 unsigned NumSrcElts = NumElts / Scale;
12180 unsigned UpperElts = NumElts - NumSrcElts;
12181 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12182 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12183 continue;
12184
12185 SDValue Src = V1;
12186 if (!Src.hasOneUse())
12187 return SDValue();
12188
12189 Src = peekThroughOneUseBitcasts(Src);
12190 if (Src.getOpcode() != ISD::TRUNCATE ||
12191 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
12192 return SDValue();
12193 Src = Src.getOperand(0);
12194
12195 // VPMOVWB is only available with avx512bw.
12196 MVT SrcVT = Src.getSimpleValueType();
12197 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
12198 !Subtarget.hasBWI())
12199 return SDValue();
12200
12201 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12202 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12203 }
12204
12205 return SDValue();
12206}
12207
12208// Attempt to match binary shuffle patterns as a truncate.
12209static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12210 SDValue V2, ArrayRef<int> Mask,
12211 const APInt &Zeroable,
12212 const X86Subtarget &Subtarget,
12213 SelectionDAG &DAG) {
12214 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12215 "Unexpected VTRUNC type");
12216 if (!Subtarget.hasAVX512())
12217 return SDValue();
12218
12219 unsigned NumElts = VT.getVectorNumElements();
12220 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12221 unsigned MaxScale = 64 / EltSizeInBits;
12222 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12223 // TODO: Support non-BWI VPMOVWB truncations?
12224 unsigned SrcEltBits = EltSizeInBits * Scale;
12225 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12226 continue;
12227
12228 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
12229 // Bail if the V2 elements are undef.
12230 unsigned NumHalfSrcElts = NumElts / Scale;
12231 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12232 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12233 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12234 continue;
12235
12236 // The elements beyond the truncation must be undef/zero.
12237 unsigned UpperElts = NumElts - NumSrcElts;
12238 if (UpperElts > 0 &&
12239 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12240 continue;
12241 bool UndefUppers =
12242 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12243
12244 // As we're using both sources we need to concat them together
12245 // and truncate from the double-sized src.
12246 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12247 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12248
12249 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12250 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12251 Src = DAG.getBitcast(SrcVT, Src);
12252 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12253 }
12254
12255 return SDValue();
12256}
12257
12258/// Check whether a compaction lowering can be done by dropping even/odd
12259/// elements and compute how many times even/odd elements must be dropped.
12260///
12261/// This handles shuffles which take every Nth element where N is a power of
12262/// two. Example shuffle masks:
12263///
12264/// (even)
12265/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12266/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12267/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12268/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12269/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12270/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12271///
12272/// (odd)
12273/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12274/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12275///
12276/// Any of these lanes can of course be undef.
12277///
12278/// This routine only supports N <= 3.
12279/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12280/// for larger N.
12281///
12282/// \returns N above, or the number of times even/odd elements must be dropped
12283/// if there is such a number. Otherwise returns zero.
12284static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12285 bool IsSingleInput) {
12286 // The modulus for the shuffle vector entries is based on whether this is
12287 // a single input or not.
12288 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12289 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12290 "We should only be called with masks with a power-of-2 size!");
12291
12292 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12293 int Offset = MatchEven ? 0 : 1;
12294
12295 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12296 // and 2^3 simultaneously. This is because we may have ambiguity with
12297 // partially undef inputs.
12298 bool ViableForN[3] = {true, true, true};
12299
12300 for (int i = 0, e = Mask.size(); i < e; ++i) {
12301 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12302 // want.
12303 if (Mask[i] < 0)
12304 continue;
12305
12306 bool IsAnyViable = false;
12307 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12308 if (ViableForN[j]) {
12309 uint64_t N = j + 1;
12310
12311 // The shuffle mask must be equal to (i * 2^N) % M.
12312 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12313 IsAnyViable = true;
12314 else
12315 ViableForN[j] = false;
12316 }
12317 // Early exit if we exhaust the possible powers of two.
12318 if (!IsAnyViable)
12319 break;
12320 }
12321
12322 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12323 if (ViableForN[j])
12324 return j + 1;
12325
12326 // Return 0 as there is no viable power of two.
12327 return 0;
12328}
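
A standalone sketch of the viability test above, specialised to a single-input v16i8 mask (so ShuffleModulus is 16 and ModMask is 15):

constexpr bool viableForN(const int (&Mask)[16], int N, int Offset) {
  for (int i = 0; i != 16; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes are optimistically accepted
    if (Mask[i] - Offset != ((i << N) & 15))
      return false;
  }
  return true;
}
constexpr int EveryOtherByte[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                                    0, 2, 4, 6, 8, 10, 12, 14};
static_assert(viableForN(EveryOtherByte, 1, /*Offset=*/0), "even elements, N = 1");
static_assert(!viableForN(EveryOtherByte, 2, /*Offset=*/0), "but not N = 2");
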
12329
12330// X86 has dedicated pack instructions that can handle specific truncation
12331// operations: PACKSS and PACKUS.
12332// Checks for compaction shuffle masks if MaxStages > 1.
12333// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12334static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12335 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12336 const SelectionDAG &DAG,
12337 const X86Subtarget &Subtarget,
12338 unsigned MaxStages = 1) {
12339 unsigned NumElts = VT.getVectorNumElements();
12340 unsigned BitSize = VT.getScalarSizeInBits();
12341 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12342 "Illegal maximum compaction");
12343
12344 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12345 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12346 unsigned NumPackedBits = NumSrcBits - BitSize;
12347 N1 = peekThroughBitcasts(N1);
12348 N2 = peekThroughBitcasts(N2);
12349 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12350 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12351 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12352 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12353 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12354 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12355 return false;
12356 if (Subtarget.hasSSE41() || BitSize == 8) {
12357 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12358 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12359 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12360 V1 = N1;
12361 V2 = N2;
12362 SrcVT = PackVT;
12363 PackOpcode = X86ISD::PACKUS;
12364 return true;
12365 }
12366 }
12367 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12368 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12369 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12370 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12371 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12372 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12373 V1 = N1;
12374 V2 = N2;
12375 SrcVT = PackVT;
12376 PackOpcode = X86ISD::PACKSS;
12377 return true;
12378 }
12379 return false;
12380 };
12381
12382 // Attempt to match against wider and wider compaction patterns.
12383 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12384 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12385 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12386
12387 // Try binary shuffle.
12388 SmallVector<int, 32> BinaryMask;
12389 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12390 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
12391 if (MatchPACK(V1, V2, PackVT))
12392 return true;
12393
12394 // Try unary shuffle.
12395 SmallVector<int, 32> UnaryMask;
12396 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12397 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
12398 if (MatchPACK(V1, V1, PackVT))
12399 return true;
12400 }
12401
12402 return false;
12403}
12404
12405static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12406 SDValue V1, SDValue V2, SelectionDAG &DAG,
12407 const X86Subtarget &Subtarget) {
12408 MVT PackVT;
12409 unsigned PackOpcode;
12410 unsigned SizeBits = VT.getSizeInBits();
12411 unsigned EltBits = VT.getScalarSizeInBits();
12412 unsigned MaxStages = Log2_32(64 / EltBits);
12413 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12414 Subtarget, MaxStages))
12415 return SDValue();
12416
12417 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12418 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12419
12420 // Don't lower multi-stage packs on AVX512, truncation is better.
12421 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12422 return SDValue();
12423
12424 // Pack to the largest type possible:
12425 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12426 unsigned MaxPackBits = 16;
12427 if (CurrentEltBits > 16 &&
12428 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12429 MaxPackBits = 32;
12430
12431 // Repeatedly pack down to the target size.
12432 SDValue Res;
12433 for (unsigned i = 0; i != NumStages; ++i) {
12434 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12435 unsigned NumSrcElts = SizeBits / SrcEltBits;
12436 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12437 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12438 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12439 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12440 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12441 DAG.getBitcast(SrcVT, V2));
12442 V1 = V2 = Res;
12443 CurrentEltBits /= 2;
12444 }
12445 assert(Res && Res.getValueType() == VT &&
12446 "Failed to lower compaction shuffle");
12447 return Res;
12448}
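
The stage count in the loop above follows directly from each PACK halving the element width; a small compile-time sketch:

constexpr unsigned packStages(unsigned SrcEltBits, unsigned DstEltBits) {
  unsigned Stages = 0;
  for (unsigned Bits = SrcEltBits; Bits > DstEltBits; Bits /= 2)
    ++Stages;
  return Stages;
}
static_assert(packStages(32, 8) == 2,
              "vXi32 -> vXi8 takes two packs, e.g. PACKSSDW then PACKSSWB");
static_assert(packStages(16, 8) == 1, "vXi16 -> vXi8 takes a single pack");
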
12449
12450/// Try to emit a bitmask instruction for a shuffle.
12451///
12452/// This handles cases where we can model a blend exactly as a bitmask due to
12453/// one of the inputs being zeroable.
12454static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12455 SDValue V2, ArrayRef<int> Mask,
12456 const APInt &Zeroable,
12457 const X86Subtarget &Subtarget,
12458 SelectionDAG &DAG) {
12459 MVT MaskVT = VT;
12460 MVT EltVT = VT.getVectorElementType();
12461 SDValue Zero, AllOnes;
12462 // Use f64 if i64 isn't legal.
12463 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12464 EltVT = MVT::f64;
12465 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12466 }
12467
12468 MVT LogicVT = VT;
12469 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12470 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12471 APFloat AllOnesValue =
12472 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12473 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12474 LogicVT =
12475 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12476 } else {
12477 Zero = DAG.getConstant(0, DL, EltVT);
12478 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12479 }
12480
12481 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12482 SDValue V;
12483 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12484 if (Zeroable[i])
12485 continue;
12486 if (Mask[i] % Size != i)
12487 return SDValue(); // Not a blend.
12488 if (!V)
12489 V = Mask[i] < Size ? V1 : V2;
12490 else if (V != (Mask[i] < Size ? V1 : V2))
12491 return SDValue(); // Can only let one input through the mask.
12492
12493 VMaskOps[i] = AllOnes;
12494 }
12495 if (!V)
12496 return SDValue(); // No non-zeroable elements!
12497
12498 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12499 VMask = DAG.getBitcast(LogicVT, VMask);
12500 V = DAG.getBitcast(LogicVT, V);
12501 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12502 return DAG.getBitcast(VT, And);
12503}
12504
12505/// Try to emit a blend instruction for a shuffle using bit math.
12506///
12507/// This is used as a fallback approach when first class blend instructions are
12508/// unavailable. Currently it is only suitable for integer vectors, but could
12509/// be generalized for floating point vectors if desirable.
12510static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12511 SDValue V2, ArrayRef<int> Mask,
12512 SelectionDAG &DAG) {
12513 assert(VT.isInteger() && "Only supports integer vector types!");
12514 MVT EltVT = VT.getVectorElementType();
12515 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12516 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12517 SmallVector<SDValue, 16> MaskOps;
12518 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12519 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12520 return SDValue(); // Shuffled input!
12521 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12522 }
12523
12524 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12525 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12526 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12527 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12528}
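
Per element, the fallback above computes (V1 & M) | (~M & V2) with M either all-ones or all-zeros; a scalar compile-time sketch of one element:

#include <cstdint>

// AND / ANDNP / OR on a single 32-bit element.
constexpr uint32_t bitBlendElt(uint32_t A, uint32_t B, bool TakeA) {
  return (A & (TakeA ? ~0u : 0u)) | (~(TakeA ? ~0u : 0u) & B);
}
static_assert(bitBlendElt(0x11111111u, 0x22222222u, true) == 0x11111111u, "");
static_assert(bitBlendElt(0x11111111u, 0x22222222u, false) == 0x22222222u, "");
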
12529
12530static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12531 SDValue PreservedSrc,
12532 const X86Subtarget &Subtarget,
12533 SelectionDAG &DAG);
12534
12535static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12536 MutableArrayRef<int> Mask,
12537 const APInt &Zeroable, bool &ForceV1Zero,
12538 bool &ForceV2Zero, uint64_t &BlendMask) {
12539 bool V1IsZeroOrUndef =
12540 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12541 bool V2IsZeroOrUndef =
12542 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12543
12544 BlendMask = 0;
12545 ForceV1Zero = false, ForceV2Zero = false;
12546 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12547
12548 // Attempt to generate the binary blend mask. If an input is zero then
12549 // we can use any lane.
12550 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12551 int M = Mask[i];
12552 if (M == SM_SentinelUndef)
12553 continue;
12554 if (M == i ||
12555 (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12556 Mask[i] = i;
12557 continue;
12558 }
12559 if (M == (i + Size) ||
12560 (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12561 BlendMask |= 1ull << i;
12562 Mask[i] = i + Size;
12563 continue;
12564 }
12565 if (Zeroable[i]) {
12566 if (V1IsZeroOrUndef) {
12567 ForceV1Zero = true;
12568 Mask[i] = i;
12569 continue;
12570 }
12571 if (V2IsZeroOrUndef) {
12572 ForceV2Zero = true;
12573 BlendMask |= 1ull << i;
12574 Mask[i] = i + Size;
12575 continue;
12576 }
12577 }
12578 return false;
12579 }
12580 return true;
12581}
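
A minimal sketch of the immediate derivation above, for a 4-element mask with no zeroable elements (bit i of the result is set exactly when element i is taken from V2; -1 means the mask is not a blend):

constexpr long long blendImm4(const int (&Mask)[4]) {
  long long Imm = 0;
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] < 0 || Mask[i] == i)
      continue;
    if (Mask[i] != i + 4)
      return -1; // the element moves within an input, so it is not a blend
    Imm |= 1ll << i;
  }
  return Imm;
}
constexpr int Blend[4] = {0, 5, 2, 7};
constexpr int NotABlend[4] = {1, 5, 2, 7};
static_assert(blendImm4(Blend) == 0xA, "elements 1 and 3 come from V2");
static_assert(blendImm4(NotABlend) == -1, "element 0 is shuffled, no BLENDI");
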
12582
12583static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12584 int Scale) {
12585 uint64_t ScaledMask = 0;
12586 for (int i = 0; i != Size; ++i)
12587 if (BlendMask & (1ull << i))
12588 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12589 return ScaledMask;
12590}
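
When each element is later split into Scale narrower elements, every set bit of the blend mask must be replicated Scale times; a compile-time sketch of the helper above:

constexpr unsigned long long scaleBlend(unsigned long long BlendMask, int Size,
                                        int Scale) {
  unsigned long long Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}
static_assert(scaleBlend(0x5, 4, 2) == 0x33,
              "a v4i32 blend 0b0101 becomes the v8i16 blend 0b00110011");
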
12591
12592/// Try to emit a blend instruction for a shuffle.
12593///
12594/// This doesn't do any checks for the availability of instructions for blending
12595/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12596/// be matched in the backend with the type given. What it does check for is
12597/// that the shuffle mask is a blend, or convertible into a blend with zero.
12598static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12599 SDValue V2, ArrayRef<int> Original,
12600 const APInt &Zeroable,
12601 const X86Subtarget &Subtarget,
12602 SelectionDAG &DAG) {
12603 uint64_t BlendMask = 0;
12604 bool ForceV1Zero = false, ForceV2Zero = false;
12605 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12606 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12607 BlendMask))
12608 return SDValue();
12609
12610 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12611 if (ForceV1Zero)
12612 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12613 if (ForceV2Zero)
12614 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12615
12616 unsigned NumElts = VT.getVectorNumElements();
12617
12618 switch (VT.SimpleTy) {
12619 case MVT::v4i64:
12620 case MVT::v8i32:
12621 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12622 LLVM_FALLTHROUGH;
12623 case MVT::v4f64:
12624 case MVT::v8f32:
12625 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12626 LLVM_FALLTHROUGH;
12627 case MVT::v2f64:
12628 case MVT::v2i64:
12629 case MVT::v4f32:
12630 case MVT::v4i32:
12631 case MVT::v8i16:
12632 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12633 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12634 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12635 case MVT::v16i16: {
12636 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12637 SmallVector<int, 8> RepeatedMask;
12638 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12639 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12640 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12641 BlendMask = 0;
12642 for (int i = 0; i < 8; ++i)
12643 if (RepeatedMask[i] >= 8)
12644 BlendMask |= 1ull << i;
12645 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12646 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12647 }
12648 // Use PBLENDW for lower/upper lanes and then blend lanes.
12649 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12650 // merge to VSELECT where useful.
12651 uint64_t LoMask = BlendMask & 0xFF;
12652 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12653 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12654 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12655 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12656 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12657 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12658 return DAG.getVectorShuffle(
12659 MVT::v16i16, DL, Lo, Hi,
12660 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12661 }
12662 LLVM_FALLTHROUGH;
12663 }
12664 case MVT::v32i8:
12665 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12666 LLVM_FALLTHROUGH;
12667 case MVT::v16i8: {
12668 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12669
12670 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12671 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12672 Subtarget, DAG))
12673 return Masked;
12674
12675 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12676 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12677 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12678 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12679 }
12680
12681 // If we have VPTERNLOG, we can use that as a bit blend.
12682 if (Subtarget.hasVLX())
12683 if (SDValue BitBlend =
12684 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12685 return BitBlend;
12686
12687 // Scale the blend by the number of bytes per element.
12688 int Scale = VT.getScalarSizeInBits() / 8;
12689
12690 // This form of blend is always done on bytes. Compute the byte vector
12691 // type.
12692 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12693
12694 // x86 allows load folding with blendvb from the 2nd source operand. But
12695 // we are still using LLVM select here (see comment below), so that's V1.
12696 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12697 // allow that load-folding possibility.
12698 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12699 ShuffleVectorSDNode::commuteMask(Mask);
12700 std::swap(V1, V2);
12701 }
12702
12703 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12704 // mix of LLVM's code generator and the x86 backend. We tell the code
12705 // generator that boolean values in the elements of an x86 vector register
12706 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12707 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12708 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12709 // of the element (the remaining are ignored) and 0 in that high bit would
12710 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12711 // the LLVM model for boolean values in vector elements gets the relevant
12712 // bit set, it is set backwards and over constrained relative to x86's
12713 // actual model.
12714 SmallVector<SDValue, 32> VSELECTMask;
12715 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12716 for (int j = 0; j < Scale; ++j)
12717 VSELECTMask.push_back(
12718 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12719 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12720 MVT::i8));
12721
12722 V1 = DAG.getBitcast(BlendVT, V1);
12723 V2 = DAG.getBitcast(BlendVT, V2);
12724 return DAG.getBitcast(
12725 VT,
12726 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12727 V1, V2));
12728 }
12729 case MVT::v16f32:
12730 case MVT::v8f64:
12731 case MVT::v8i64:
12732 case MVT::v16i32:
12733 case MVT::v32i16:
12734 case MVT::v64i8: {
12735 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12736 bool OptForSize = DAG.shouldOptForSize();
12737 if (!OptForSize) {
12738 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12739 Subtarget, DAG))
12740 return Masked;
12741 }
12742
12743 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12744 // masked move.
12745 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12746 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12747 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12748 }
12749 default:
12750 llvm_unreachable("Not a supported integer vector type!")::llvm::llvm_unreachable_internal("Not a supported integer vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 12750)
;
12751 }
12752}
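A side note on the vXi8/vXi16 byte-blend fallback above: the VSELECT condition is built by repeating each per-element decision once per byte of the element. The following standalone sketch (not part of X86ISelLowering.cpp; plain std::vector masks, -1 meaning undef in the shuffle mask, INT_MIN standing in for an undef byte) shows that expansion, which mirrors the VSELECTMask loop above:

#include <climits>
#include <cstdio>
#include <vector>

static std::vector<int> buildByteSelectMask(const std::vector<int> &Mask,
                                            int EltSizeInBits) {
  int Size = (int)Mask.size();
  int Scale = EltSizeInBits / 8;             // bytes per element
  std::vector<int> VSELECTMask;
  for (int i = 0; i < Size; ++i)
    for (int j = 0; j < Scale; ++j)
      VSELECTMask.push_back(Mask[i] < 0 ? INT_MIN            // undef byte
                                        : (Mask[i] < Size ? -1 : 0));
  return VSELECTMask;
}

int main() {
  // v4i32 blend of {V1[0], V2[1], V1[2], V2[3]} -> 16 byte-sized conditions.
  std::vector<int> Mask = {0, 5, 2, 7};
  for (int C : buildByteSelectMask(Mask, 32))
    std::printf("%d ", C);
  std::printf("\n");   // prints -1 x4, 0 x4, -1 x4, 0 x4
  return 0;
}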
12753
12754/// Try to lower as a blend of elements from two inputs followed by
12755/// a single-input permutation.
12756///
12757/// This matches the pattern where we can blend elements from two inputs and
12758/// then reduce the shuffle to a single-input permutation.
12759static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12760 SDValue V1, SDValue V2,
12761 ArrayRef<int> Mask,
12762 SelectionDAG &DAG,
12763 bool ImmBlends = false) {
12764 // We build up the blend mask while checking whether a blend is a viable way
12765 // to reduce the shuffle.
12766 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12767 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12768
12769 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12770 if (Mask[i] < 0)
12771 continue;
12772
12773 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12774
12775 if (BlendMask[Mask[i] % Size] < 0)
12776 BlendMask[Mask[i] % Size] = Mask[i];
12777 else if (BlendMask[Mask[i] % Size] != Mask[i])
12778 return SDValue(); // Can't blend in the needed input!
12779
12780 PermuteMask[i] = Mask[i] % Size;
12781 }
12782
12783 // If only immediate blends, then bail if the blend mask can't be widened to
12784 // i16.
12785 unsigned EltSize = VT.getScalarSizeInBits();
12786 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12787 return SDValue();
12788
12789 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12790 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12791}
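To make the decomposition above concrete, here is a standalone sketch (hypothetical helper names, plain std::vector masks with -1 as undef; not part of X86ISelLowering.cpp) that splits a two-input mask into the blend mask and the single-input permute mask, using a v4i32-sized example:

#include <cstdio>
#include <vector>

// Returns false when some lane would need elements from both inputs.
static bool decompose(const std::vector<int> &Mask,
                      std::vector<int> &Blend, std::vector<int> &Permute) {
  int Size = (int)Mask.size();
  Blend.assign(Size, -1);
  Permute.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Lane = Mask[i] % Size;                // lane the blend must supply
    if (Blend[Lane] < 0)
      Blend[Lane] = Mask[i];
    else if (Blend[Lane] != Mask[i])
      return false;                           // lane needed from both inputs
    Permute[i] = Lane;                        // single-input permute fixes order
  }
  return true;
}

int main() {
  std::vector<int> Blend, Permute;
  if (decompose({6, 1, 4, 3}, Blend, Permute)) {
    for (int M : Blend)
      std::printf("%d ", M);                  // 4 1 6 3
    std::printf("| ");
    for (int M : Permute)
      std::printf("%d ", M);                  // 2 1 0 3
    std::printf("\n");
  }
  return 0;
}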
12792
12793/// Try to lower as an unpack of elements from two inputs followed by
12794/// a single-input permutation.
12795///
12796/// This matches the pattern where we can unpack elements from two inputs and
12797/// then reduce the shuffle to a single-input (wider) permutation.
12798static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12799 SDValue V1, SDValue V2,
12800 ArrayRef<int> Mask,
12801 SelectionDAG &DAG) {
12802 int NumElts = Mask.size();
12803 int NumLanes = VT.getSizeInBits() / 128;
12804 int NumLaneElts = NumElts / NumLanes;
12805 int NumHalfLaneElts = NumLaneElts / 2;
12806
12807 bool MatchLo = true, MatchHi = true;
12808 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12809
12810 // Determine UNPCKL/UNPCKH type and operand order.
12811 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12812 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12813 int M = Mask[Lane + Elt];
12814 if (M < 0)
12815 continue;
12816
12817 SDValue &Op = Ops[Elt & 1];
12818 if (M < NumElts && (Op.isUndef() || Op == V1))
12819 Op = V1;
12820 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12821 Op = V2;
12822 else
12823 return SDValue();
12824
12825 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12826 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12827 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12828 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12829 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12830 if (!MatchLo && !MatchHi)
12831 return SDValue();
12832 }
12833 }
12834 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12835
12836 // Now check that each pair of elts come from the same unpack pair
12837 // and set the permute mask based on each pair.
12838 // TODO - Investigate cases where we permute individual elements.
12839 SmallVector<int, 32> PermuteMask(NumElts, -1);
12840 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12841 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12842 int M0 = Mask[Lane + Elt + 0];
12843 int M1 = Mask[Lane + Elt + 1];
12844 if (0 <= M0 && 0 <= M1 &&
12845 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12846 return SDValue();
12847 if (0 <= M0)
12848 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12849 if (0 <= M1)
12850 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12851 }
12852 }
12853
12854 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12855 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12856 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12857}
12858
12859/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12860/// permuting the elements of the result in place.
12861static SDValue lowerShuffleAsByteRotateAndPermute(
12862 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12863 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12864 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12865 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12866 (VT.is512BitVector() && !Subtarget.hasBWI()))
12867 return SDValue();
12868
12869 // We don't currently support lane crossing permutes.
12870 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12871 return SDValue();
12872
12873 int Scale = VT.getScalarSizeInBits() / 8;
12874 int NumLanes = VT.getSizeInBits() / 128;
12875 int NumElts = VT.getVectorNumElements();
12876 int NumEltsPerLane = NumElts / NumLanes;
12877
12878 // Determine range of mask elts.
12879 bool Blend1 = true;
12880 bool Blend2 = true;
12881 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12882 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12883 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12884 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12885 int M = Mask[Lane + Elt];
12886 if (M < 0)
12887 continue;
12888 if (M < NumElts) {
12889 Blend1 &= (M == (Lane + Elt));
12890 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12891 M = M % NumEltsPerLane;
12892 Range1.first = std::min(Range1.first, M);
12893 Range1.second = std::max(Range1.second, M);
12894 } else {
12895 M -= NumElts;
12896 Blend2 &= (M == (Lane + Elt));
12897 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12898 M = M % NumEltsPerLane;
12899 Range2.first = std::min(Range2.first, M);
12900 Range2.second = std::max(Range2.second, M);
12901 }
12902 }
12903 }
12904
12905 // Bail if we don't need both elements.
12906 // TODO - it might be worth doing this for unary shuffles if the permute
12907 // can be widened.
12908 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12909 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12910 return SDValue();
12911
12912 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12913 return SDValue();
12914
12915 // Rotate the 2 ops so we can access both ranges, then permute the result.
12916 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12917 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12918 SDValue Rotate = DAG.getBitcast(
12919 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12920 DAG.getBitcast(ByteVT, Lo),
12921 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12922 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12923 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12924 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12925 int M = Mask[Lane + Elt];
12926 if (M < 0)
12927 continue;
12928 if (M < NumElts)
12929 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12930 else
12931 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12932 }
12933 }
12934 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12935 };
12936
12937 // Check if the ranges are small enough to rotate from either direction.
12938 if (Range2.second < Range1.first)
12939 return RotateAndPermute(V1, V2, Range1.first, 0);
12940 if (Range1.second < Range2.first)
12941 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12942 return SDValue();
12943}
12944
12945static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
12946 return isUndefOrEqual(Mask, 0);
12947}
12948
12949static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
12950 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
12951}
12952
12953/// Generic routine to decompose a shuffle and blend into independent
12954/// blends and permutes.
12955///
12956/// This matches the extremely common pattern for handling combined
12957/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12958/// operations. It will try to pick the best arrangement of shuffles and
12959/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12960static SDValue lowerShuffleAsDecomposedShuffleMerge(
12961 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12962 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12963 int NumElts = Mask.size();
12964 int NumLanes = VT.getSizeInBits() / 128;
12965 int NumEltsPerLane = NumElts / NumLanes;
12966
12967 // Shuffle the input elements into the desired positions in V1 and V2 and
12968 // unpack/blend them together.
12969 bool IsAlternating = true;
12970 SmallVector<int, 32> V1Mask(NumElts, -1);
12971 SmallVector<int, 32> V2Mask(NumElts, -1);
12972 SmallVector<int, 32> FinalMask(NumElts, -1);
12973 for (int i = 0; i < NumElts; ++i) {
12974 int M = Mask[i];
12975 if (M >= 0 && M < NumElts) {
12976 V1Mask[i] = M;
12977 FinalMask[i] = i;
12978 IsAlternating &= (i & 1) == 0;
12979 } else if (M >= NumElts) {
12980 V2Mask[i] = M - NumElts;
12981 FinalMask[i] = i + NumElts;
12982 IsAlternating &= (i & 1) == 1;
12983 }
12984 }
12985
12986 // If we effectively only demand the 0'th element of \p Input, and not only
12987 // as 0'th element, then broadcast said input,
12988 // and change \p InputMask to be a no-op (identity) mask.
12989 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
12990 &DAG](SDValue &Input,
12991 MutableArrayRef<int> InputMask) {
12992 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
12993 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
12994 !X86::mayFoldLoad(Input, Subtarget)))
12995 return;
12996 if (isNoopShuffleMask(InputMask))
12997 return;
12998 assert(isBroadcastShuffleMask(InputMask) &&
12999        "Expected to demand only the 0'th element.");
13000 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13001 for (auto I : enumerate(InputMask)) {
13002 int &InputMaskElt = I.value();
13003 if (InputMaskElt >= 0)
13004 InputMaskElt = I.index();
13005 }
13006 };
13007
13008 // Currently, we may need to produce one shuffle per input, and blend results.
13009 // It is possible that the shuffle for one of the inputs is already a no-op.
13010 // See if we can simplify non-no-op shuffles into broadcasts,
13011 // which we consider to be strictly better than an arbitrary shuffle.
13012 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13013 isNoopOrBroadcastShuffleMask(V2Mask)) {
13014 canonicalizeBroadcastableInput(V1, V1Mask);
13015 canonicalizeBroadcastableInput(V2, V2Mask);
13016 }
13017
13018 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13019 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13020 // the shuffle may be able to fold with a load or other benefit. However, when
13021 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13022 // pre-shuffle first is a better strategy.
13023 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13024 // Only prefer immediate blends to unpack/rotate.
13025 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13026 DAG, true))
13027 return BlendPerm;
13028 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
13029 DAG))
13030 return UnpackPerm;
13031 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13032 DL, VT, V1, V2, Mask, Subtarget, DAG))
13033 return RotatePerm;
13034 // Unpack/rotate failed - try again with variable blends.
13035 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13036 DAG))
13037 return BlendPerm;
13038 }
13039
13040 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13041 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13042 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13043 // than half the elements coming from each source.
13044 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13045 V1Mask.assign(NumElts, -1);
13046 V2Mask.assign(NumElts, -1);
13047 FinalMask.assign(NumElts, -1);
13048 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13049 for (int j = 0; j != NumEltsPerLane; ++j) {
13050 int M = Mask[i + j];
13051 if (M >= 0 && M < NumElts) {
13052 V1Mask[i + (j / 2)] = M;
13053 FinalMask[i + j] = i + (j / 2);
13054 } else if (M >= NumElts) {
13055 V2Mask[i + (j / 2)] = M - NumElts;
13056 FinalMask[i + j] = i + (j / 2) + NumElts;
13057 }
13058 }
13059 }
13060
13061 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13062 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13063 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13064}
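The per-input split at the top of this routine can be illustrated with a small standalone example (assumed v4i32-sized mask, -1 as undef; not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

static void dump(const std::vector<int> &M) {
  for (int V : M)
    std::printf("%d ", V);
  std::printf("| ");
}

int main() {
  std::vector<int> Mask = {4, 1, 6, 3};            // v4i32 shuffle of two inputs
  int NumElts = (int)Mask.size();
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1), Final(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {                   // element comes from V1
      V1Mask[i] = M;
      Final[i] = i;
    } else if (M >= NumElts) {                     // element comes from V2
      V2Mask[i] = M - NumElts;
      Final[i] = i + NumElts;
    }
  }
  dump(V1Mask);   // -1 1 -1 3
  dump(V2Mask);   // 0 -1 2 -1
  dump(Final);    // 4 1 6 3
  std::printf("\n");
  return 0;
}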
13065
13066/// Try to lower a vector shuffle as a bit rotation.
13067///
13068/// Look for a repeated rotation pattern in each sub group.
13069/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13070static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13071 int NumElts = Mask.size();
13072 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13073
13074 int RotateAmt = -1;
13075 for (int i = 0; i != NumElts; i += NumSubElts) {
13076 for (int j = 0; j != NumSubElts; ++j) {
13077 int M = Mask[i + j];
13078 if (M < 0)
13079 continue;
13080 if (!isInRange(M, i, i + NumSubElts))
13081 return -1;
13082 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13083 if (0 <= RotateAmt && Offset != RotateAmt)
13084 return -1;
13085 RotateAmt = Offset;
13086 }
13087 }
13088 return RotateAmt;
13089}
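A standalone sketch of the matcher above (hypothetical helper, plain std::vector mask, negative entries treated as undef; not part of X86ISelLowering.cpp): every NumSubElts-sized group must encode the same rotation amount.

#include <cstdio>
#include <vector>

static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int NumElts = (int)Mask.size();
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1;                                 // crosses the sub group
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1;                                 // inconsistent rotation
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  // v16i8 mask repeating {3,0,1,2} per 4 bytes: ROTL of each i32 by 1 byte.
  std::vector<int> Mask = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
  std::printf("%d\n", matchBitRotate(Mask, 4));    // prints 1 (i.e. 8 bits)
  return 0;
}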
13090
13091static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13092 const X86Subtarget &Subtarget,
13093 ArrayRef<int> Mask) {
13094 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13095 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13096
13097 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13098 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13099 int MaxSubElts = 64 / EltSizeInBits;
13100 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13101 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13102 if (RotateAmt < 0)
13103 continue;
13104
13105 int NumElts = Mask.size();
13106 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13107 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13108 return RotateAmt * EltSizeInBits;
13109 }
13110
13111 return -1;
13112}
13113
13114/// Lower shuffle using X86ISD::VROTLI rotations.
13115static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13116 ArrayRef<int> Mask,
13117 const X86Subtarget &Subtarget,
13118 SelectionDAG &DAG) {
13119 // Only XOP + AVX512 targets have bit rotation instructions.
13120 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13121 bool IsLegal =
13122 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13123 if (!IsLegal && Subtarget.hasSSE3())
13124 return SDValue();
13125
13126 MVT RotateVT;
13127 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13128 Subtarget, Mask);
13129 if (RotateAmt < 0)
13130 return SDValue();
13131
13132 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13133 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13134 // widen to vXi16 or more then the existing lowering should be better.
13135 if (!IsLegal) {
13136 if ((RotateAmt % 16) == 0)
13137 return SDValue();
13138 // TODO: Use getTargetVShiftByConstNode.
13139 unsigned ShlAmt = RotateAmt;
13140 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13141 V1 = DAG.getBitcast(RotateVT, V1);
13142 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13143 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13144 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13145 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13146 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13147 return DAG.getBitcast(VT, Rot);
13148 }
13149
13150 SDValue Rot =
13151 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13152 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13153 return DAG.getBitcast(VT, Rot);
13154}
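For the pre-SSSE3 fallback above, the OR(SHL, SRL) expansion is the ordinary scalar rotate identity. A minimal sketch on a single 32-bit element (RotAmt assumed to be strictly between 0 and 32; not part of X86ISelLowering.cpp):

#include <cstdint>
#include <cstdio>

static uint32_t rotl32(uint32_t X, unsigned RotAmt) {
  // Valid only for 0 < RotAmt < 32; the vector lowering guarantees this.
  return (X << RotAmt) | (X >> (32 - RotAmt));
}

int main() {
  std::printf("0x%08x\n", rotl32(0x11223344u, 8));   // prints 0x22334411
  return 0;
}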
13155
13156/// Try to match a vector shuffle as an element rotation.
13157///
13158/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13159static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13160 ArrayRef<int> Mask) {
13161 int NumElts = Mask.size();
13162
13163 // We need to detect various ways of spelling a rotation:
13164 // [11, 12, 13, 14, 15, 0, 1, 2]
13165 // [-1, 12, 13, 14, -1, -1, 1, -1]
13166 // [-1, -1, -1, -1, -1, -1, 1, 2]
13167 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13168 // [-1, 4, 5, 6, -1, -1, 9, -1]
13169 // [-1, 4, 5, 6, -1, -1, -1, -1]
13170 int Rotation = 0;
13171 SDValue Lo, Hi;
13172 for (int i = 0; i < NumElts; ++i) {
13173 int M = Mask[i];
13174 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13175        "Unexpected mask index.");
13176 if (M < 0)
13177 continue;
13178
13179 // Determine where a rotated vector would have started.
13180 int StartIdx = i - (M % NumElts);
13181 if (StartIdx == 0)
13182 // The identity rotation isn't interesting, stop.
13183 return -1;
13184
13185 // If we found the tail of a vector the rotation must be the missing
13186 // front. If we found the head of a vector, it must be how much of the
13187 // head.
13188 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13189
13190 if (Rotation == 0)
13191 Rotation = CandidateRotation;
13192 else if (Rotation != CandidateRotation)
13193 // The rotations don't match, so we can't match this mask.
13194 return -1;
13195
13196 // Compute which value this mask is pointing at.
13197 SDValue MaskV = M < NumElts ? V1 : V2;
13198
13199 // Compute which of the two target values this index should be assigned
13200 // to. This reflects whether the high elements are remaining or the low
13201 // elements are remaining.
13202 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13203
13204 // Either set up this value if we've not encountered it before, or check
13205 // that it remains consistent.
13206 if (!TargetV)
13207 TargetV = MaskV;
13208 else if (TargetV != MaskV)
13209 // This may be a rotation, but it pulls from the inputs in some
13210 // unsupported interleaving.
13211 return -1;
13212 }
13213
13214 // Check that we successfully analyzed the mask, and normalize the results.
13215 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13216 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13217 if (!Lo)
13218 Lo = Hi;
13219 else if (!Hi)
13220 Hi = Lo;
13221
13222 V1 = Lo;
13223 V2 = Hi;
13224
13225 return Rotation;
13226}
13227
13228/// Try to lower a vector shuffle as a byte rotation.
13229///
13230/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13231/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13232/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13233 /// try to generically lower a vector shuffle through such a pattern. It
13234/// does not check for the profitability of lowering either as PALIGNR or
13235/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13236/// This matches shuffle vectors that look like:
13237///
13238/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13239///
13240/// Essentially it concatenates V1 and V2, shifts right by some number of
13241/// elements, and takes the low elements as the result. Note that while this is
13242/// specified as a *right shift* because x86 is little-endian, it is a *left
13243/// rotate* of the vector lanes.
13244static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13245 ArrayRef<int> Mask) {
13246 // Don't accept any shuffles with zero elements.
13247 if (isAnyZero(Mask))
13248 return -1;
13249
13250 // PALIGNR works on 128-bit lanes.
13251 SmallVector<int, 16> RepeatedMask;
13252 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13253 return -1;
13254
13255 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13256 if (Rotation <= 0)
13257 return -1;
13258
13259 // PALIGNR rotates bytes, so we need to scale the
13260 // rotation based on how many bytes are in the vector lane.
13261 int NumElts = RepeatedMask.size();
13262 int Scale = 16 / NumElts;
13263 return Rotation * Scale;
13264}
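Applying the matcher above to the v8i16 example from the comment gives a rotation of 3 elements, which PALIGNR encodes in bytes. A standalone sketch of just that computation (single-input rotation check only, no Lo/Hi operand tracking; not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {11, 12, 13, 14, 15, 0, 1, 2};   // v8i16
  int NumElts = (int)Mask.size();
  int Rotation = -1;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int StartIdx = i - (Mask[i] % NumElts);   // where a rotated vector would start
    int Cand = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation >= 0 && Cand != Rotation) {
      Rotation = -1;                          // inconsistent rotation amounts
      break;
    }
    Rotation = Cand;
  }
  int Scale = 16 / NumElts;                   // bytes per element in a 128-bit lane
  std::printf("rotate %d elts -> PALIGNR imm %d\n", Rotation, Rotation * Scale);
  return 0;                                   // prints: rotate 3 elts -> PALIGNR imm 6
}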
13265
13266static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13267 SDValue V2, ArrayRef<int> Mask,
13268 const X86Subtarget &Subtarget,
13269 SelectionDAG &DAG) {
13270 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13271
13272 SDValue Lo = V1, Hi = V2;
13273 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13274 if (ByteRotation <= 0)
13275 return SDValue();
13276
13277 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13278 // PSLLDQ/PSRLDQ.
13279 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13280 Lo = DAG.getBitcast(ByteVT, Lo);
13281 Hi = DAG.getBitcast(ByteVT, Hi);
13282
13283 // SSSE3 targets can use the palignr instruction.
13284 if (Subtarget.hasSSSE3()) {
13285 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13286        "512-bit PALIGNR requires BWI instructions");
13287 return DAG.getBitcast(
13288 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13289 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13290 }
13291
13292 assert(VT.is128BitVector() &&
13293        "Rotate-based lowering only supports 128-bit lowering!");
13294 assert(Mask.size() <= 16 &&
13295        "Can shuffle at most 16 bytes in a 128-bit vector!");
13296 assert(ByteVT == MVT::v16i8 &&
13297        "SSE2 rotate lowering only needed for v16i8!");
13298
13299 // Default SSE2 implementation
13300 int LoByteShift = 16 - ByteRotation;
13301 int HiByteShift = ByteRotation;
13302
13303 SDValue LoShift =
13304 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13305 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13306 SDValue HiShift =
13307 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13308 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13309 return DAG.getBitcast(VT,
13310 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13311}
13312
13313/// Try to lower a vector shuffle as a dword/qword rotation.
13314///
13315/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
13316/// rotation of the concatenation of two vectors; This routine will
13317 /// try to generically lower a vector shuffle through such a pattern.
13318///
13319/// Essentially it concatenates V1 and V2, shifts right by some number of
13320/// elements, and takes the low elements as the result. Note that while this is
13321/// specified as a *right shift* because x86 is little-endian, it is a *left
13322/// rotate* of the vector lanes.
13323static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13324 SDValue V2, ArrayRef<int> Mask,
13325 const X86Subtarget &Subtarget,
13326 SelectionDAG &DAG) {
13327 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13328        "Only 32-bit and 64-bit elements are supported!");
13329
13330 // 128/256-bit vectors are only supported with VLX.
13331 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13332        && "VLX required for 128/256-bit vectors");
13333
13334 SDValue Lo = V1, Hi = V2;
13335 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13336 if (Rotation <= 0)
13337 return SDValue();
13338
13339 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13340 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13341}
13342
13343/// Try to lower a vector shuffle as a byte shift sequence.
13344static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13345 SDValue V2, ArrayRef<int> Mask,
13346 const APInt &Zeroable,
13347 const X86Subtarget &Subtarget,
13348 SelectionDAG &DAG) {
13349 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13350 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13351
13352 // We need a shuffle that has zeros at one/both ends and a sequential
13353 // shuffle from one source within.
13354 unsigned ZeroLo = Zeroable.countTrailingOnes();
13355 unsigned ZeroHi = Zeroable.countLeadingOnes();
13356 if (!ZeroLo && !ZeroHi)
13357 return SDValue();
13358
13359 unsigned NumElts = Mask.size();
13360 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13361 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13362 return SDValue();
13363
13364 unsigned Scale = VT.getScalarSizeInBits() / 8;
13365 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13366 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13367 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13368 return SDValue();
13369
13370 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13371 Res = DAG.getBitcast(MVT::v16i8, Res);
13372
13373 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13374 // inner sequential set of elements, possibly offset:
13375 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13376 // 01234567 --> 4567zzzz --> zzzzz456
13377 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13378 if (ZeroLo == 0) {
13379 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13380 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13381 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13382 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13383 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13384 } else if (ZeroHi == 0) {
13385 unsigned Shift = Mask[ZeroLo] % NumElts;
13386 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13387 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13388 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13389 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13390 } else if (!Subtarget.hasSSSE3()) {
13391 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13392 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13393 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13394 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13395 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13396 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13397 Shift += Mask[ZeroLo] % NumElts;
13398 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13399 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13400 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13401 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13402 } else
13403 return SDValue();
13404
13405 return DAG.getBitcast(VT, Res);
13406}
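The three-shift case in the comment above ("01234567 --> z0123456 --> 3456zzzz --> zz3456zz") can be simulated on a plain character string; this standalone sketch (not part of X86ISelLowering.cpp) uses 'z' for a byte zeroed by the shift:

#include <cstdio>
#include <string>

static std::string vshldq(std::string V, int Amt) {   // shift toward higher indices
  return std::string(Amt, 'z') + V.substr(0, V.size() - Amt);
}
static std::string vsrldq(std::string V, int Amt) {   // shift toward lower indices
  return V.substr(Amt) + std::string(Amt, 'z');
}

int main() {
  std::string V = "01234567";   // want zz3456zz (ZeroLo = 2, ZeroHi = 2)
  V = vshldq(V, 1);             // z0123456
  V = vsrldq(V, 4);             // 3456zzzz
  V = vshldq(V, 2);             // zz3456zz
  std::printf("%s\n", V.c_str());
  return 0;
}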
13407
13408/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13409///
13410/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13411/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13412/// matches elements from one of the input vectors shuffled to the left or
13413/// right with zeroable elements 'shifted in'. It handles both the strictly
13414/// bit-wise element shifts and the byte shift across an entire 128-bit double
13415/// quad word lane.
13416///
13417/// PSHL : (little-endian) left bit shift.
13418/// [ zz, 0, zz, 2 ]
13419/// [ -1, 4, zz, -1 ]
13420/// PSRL : (little-endian) right bit shift.
13421/// [ 1, zz, 3, zz]
13422/// [ -1, -1, 7, zz]
13423/// PSLLDQ : (little-endian) left byte shift
13424/// [ zz, 0, 1, 2, 3, 4, 5, 6]
13425/// [ zz, zz, -1, -1, 2, 3, 4, -1]
13426/// [ zz, zz, zz, zz, zz, zz, -1, 1]
13427/// PSRLDQ : (little-endian) right byte shift
13428/// [ 5, 6, 7, zz, zz, zz, zz, zz]
13429/// [ -1, 5, 6, 7, zz, zz, zz, zz]
13430/// [ 1, 2, -1, -1, -1, -1, zz, zz]
13431static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13432 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13433 int MaskOffset, const APInt &Zeroable,
13434 const X86Subtarget &Subtarget) {
13435 int Size = Mask.size();
13436 unsigned SizeInBits = Size * ScalarSizeInBits;
13437
13438 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13439 for (int i = 0; i < Size; i += Scale)
13440 for (int j = 0; j < Shift; ++j)
13441 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13442 return false;
13443
13444 return true;
13445 };
13446
13447 auto MatchShift = [&](int Shift, int Scale, bool Left) {
13448 for (int i = 0; i != Size; i += Scale) {
13449 unsigned Pos = Left ? i + Shift : i;
13450 unsigned Low = Left ? i : i + Shift;
13451 unsigned Len = Scale - Shift;
13452 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13453 return -1;
13454 }
13455
13456 int ShiftEltBits = ScalarSizeInBits * Scale;
13457 bool ByteShift = ShiftEltBits > 64;
13458 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13459 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13460 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
13461
13462 // Normalize the scale for byte shifts to still produce an i64 element
13463 // type.
13464 Scale = ByteShift ? Scale / 2 : Scale;
13465
13466 // We need to round trip through the appropriate type for the shift.
13467 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
13468 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
13469 : MVT::getVectorVT(ShiftSVT, Size / Scale);
13470 return (int)ShiftAmt;
13471 };
13472
13473 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
13474 // keep doubling the size of the integer elements up to that. We can
13475 // then shift the elements of the integer vector by whole multiples of
13476 // their width within the elements of the larger integer vector. Test each
13477 // multiple to see if we can find a match with the moved element indices
13478 // and that the shifted in elements are all zeroable.
13479 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
13480 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
13481 for (int Shift = 1; Shift != Scale; ++Shift)
13482 for (bool Left : {true, false})
13483 if (CheckZeros(Shift, Scale, Left)) {
13484 int ShiftAmt = MatchShift(Shift, Scale, Left);
13485 if (0 < ShiftAmt)
13486 return ShiftAmt;
13487 }
13488
13489 // no match
13490 return -1;
13491}
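Restricting the matcher above to whole-lane byte shifts, the PSRLDQ example from the comment ([5, 6, 7, zz, zz, zz, zz, zz] for v8i16) resolves to a 10-byte right shift. A standalone sketch of that check (hypothetical encoding: -1 mask entries plus a separate zeroable vector; not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

int main() {
  // v8i16: [5, 6, 7, zz, zz, zz, zz, zz]
  std::vector<int> Mask = {5, 6, 7, -1, -1, -1, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, true, true, true, true, true};
  int NumElts = 8, EltBytes = 2;

  for (int Shift = 1; Shift < NumElts; ++Shift) {
    // Right shift: kept elements are sequential starting at Shift, zeros above.
    bool Ok = true;
    for (int i = 0; i < NumElts - Shift; ++i)
      Ok &= (Mask[i] < 0 || Mask[i] == i + Shift);
    for (int i = NumElts - Shift; i < NumElts; ++i)
      Ok &= Zeroable[i];
    if (Ok) {
      std::printf("PSRLDQ by %d bytes\n", Shift * EltBytes);   // prints 10
      break;
    }
  }
  return 0;
}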
13492
13493static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
13494 SDValue V2, ArrayRef<int> Mask,
13495 const APInt &Zeroable,
13496 const X86Subtarget &Subtarget,
13497 SelectionDAG &DAG) {
13498 int Size = Mask.size();
13499 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13500
13501 MVT ShiftVT;
13502 SDValue V = V1;
13503 unsigned Opcode;
13504
13505 // Try to match shuffle against V1 shift.
13506 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13507 Mask, 0, Zeroable, Subtarget);
13508
13509 // If V1 failed, try to match shuffle against V2 shift.
13510 if (ShiftAmt < 0) {
13511 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13512 Mask, Size, Zeroable, Subtarget);
13513 V = V2;
13514 }
13515
13516 if (ShiftAmt < 0)
13517 return SDValue();
13518
13519 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
13520        "Illegal integer vector type");
13521 V = DAG.getBitcast(ShiftVT, V);
13522 V = DAG.getNode(Opcode, DL, ShiftVT, V,
13523 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
13524 return DAG.getBitcast(VT, V);
13525}
13526
13527// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
13528// Remainder of lower half result is zero and upper half is all undef.
13529static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
13530 ArrayRef<int> Mask, uint64_t &BitLen,
13531 uint64_t &BitIdx, const APInt &Zeroable) {
13532 int Size = Mask.size();
13533 int HalfSize = Size / 2;
13534 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13535 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
13536
13537 // Upper half must be undefined.
13538 if (!isUndefUpperHalf(Mask))
13539 return false;
13540
13541 // Determine the extraction length from the part of the
13542 // lower half that isn't zeroable.
13543 int Len = HalfSize;
13544 for (; Len > 0; --Len)
13545 if (!Zeroable[Len - 1])
13546 break;
13547 assert(Len > 0 && "Zeroable shuffle mask");
13548
13549 // Attempt to match first Len sequential elements from the lower half.
13550 SDValue Src;
13551 int Idx = -1;
13552 for (int i = 0; i != Len; ++i) {
13553 int M = Mask[i];
13554 if (M == SM_SentinelUndef)
13555 continue;
13556 SDValue &V = (M < Size ? V1 : V2);
13557 M = M % Size;
13558
13559 // The extracted elements must start at a valid index and all mask
13560 // elements must be in the lower half.
13561 if (i > M || M >= HalfSize)
13562 return false;
13563
13564 if (Idx < 0 || (Src == V && Idx == (M - i))) {
13565 Src = V;
13566 Idx = M - i;
13567 continue;
13568 }
13569 return false;
13570 }
13571
13572 if (!Src || Idx < 0)
13573 return false;
13574
13575 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13576 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13577 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13578 V1 = Src;
13579 return true;
13580}
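For the EXTRQ match above, the immediate operands are simply the element run length and start index scaled to bits and truncated to 6 bits. A standalone sketch for a v8i16 lower half of [2, 3, zz, zz] with the upper half assumed undef (not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

int main() {
  int HalfSize = 4, EltBits = 16;
  std::vector<int> Mask      = {2, 3, -1, -1};             // lower half of the mask
  std::vector<bool> Zeroable = {false, false, true, true};

  int Len = HalfSize;
  while (Len > 0 && Zeroable[Len - 1])                     // trim zeroable tail
    --Len;

  int Idx = -1;
  bool Ok = true;
  for (int i = 0; i != Len; ++i) {
    if (Mask[i] < 0)
      continue;
    if (Idx < 0)
      Idx = Mask[i] - i;                                   // extraction start index
    Ok &= (Mask[i] - i == Idx) && (Mask[i] < HalfSize);    // sequential, in lower half
  }

  if (Ok && Idx >= 0)
    std::printf("EXTRQI BitLen=%d BitIdx=%d\n",            // prints 32 and 32
                (Len * EltBits) & 0x3f, (Idx * EltBits) & 0x3f);
  return 0;
}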
13581
13582// INSERTQ: Extract lowest Len elements from lower half of second source and
13583// insert over first source, starting at Idx.
13584// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13585static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13586 ArrayRef<int> Mask, uint64_t &BitLen,
13587 uint64_t &BitIdx) {
13588 int Size = Mask.size();
13589 int HalfSize = Size / 2;
13590 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13591
13592 // Upper half must be undefined.
13593 if (!isUndefUpperHalf(Mask))
13594 return false;
13595
13596 for (int Idx = 0; Idx != HalfSize; ++Idx) {
13597 SDValue Base;
13598
13599 // Attempt to match first source from mask before insertion point.
13600 if (isUndefInRange(Mask, 0, Idx)) {
13601 /* EMPTY */
13602 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13603 Base = V1;
13604 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13605 Base = V2;
13606 } else {
13607 continue;
13608 }
13609
13610 // Extend the extraction length looking to match both the insertion of
13611 // the second source and the remaining elements of the first.
13612 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13613 SDValue Insert;
13614 int Len = Hi - Idx;
13615
13616 // Match insertion.
13617 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13618 Insert = V1;
13619 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13620 Insert = V2;
13621 } else {
13622 continue;
13623 }
13624
13625 // Match the remaining elements of the lower half.
13626 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13627 /* EMPTY */
13628 } else if ((!Base || (Base == V1)) &&
13629 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13630 Base = V1;
13631 } else if ((!Base || (Base == V2)) &&
13632 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13633 Size + Hi)) {
13634 Base = V2;
13635 } else {
13636 continue;
13637 }
13638
13639 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13640 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13641 V1 = Base;
13642 V2 = Insert;
13643 return true;
13644 }
13645 }
13646
13647 return false;
13648}
13649
13650/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13651static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13652 SDValue V2, ArrayRef<int> Mask,
13653 const APInt &Zeroable, SelectionDAG &DAG) {
13654 uint64_t BitLen, BitIdx;
13655 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13656 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13657 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13658 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13659
13660 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13661 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13662 V2 ? V2 : DAG.getUNDEF(VT),
13663 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13664 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13665
13666 return SDValue();
13667}
13668
13669/// Lower a vector shuffle as a zero or any extension.
13670///
13671/// Given a specific number of elements, element bit width, and extension
13672/// stride, produce either a zero or any extension based on the available
13673 /// features of the subtarget. The extended elements are consecutive and
13674 /// can start from an offsetted element index in the input; to avoid
13675 /// excess shuffling, the offset must either be in the bottom lane or at
13676 /// the start of a higher lane. All extended elements must be from
13677/// the same lane.
13678static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13679 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13680 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13681 assert(Scale > 1 && "Need a scale to extend.");
13682 int EltBits = VT.getScalarSizeInBits();
13683 int NumElements = VT.getVectorNumElements();
13684 int NumEltsPerLane = 128 / EltBits;
13685 int OffsetLane = Offset / NumEltsPerLane;
13686 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13687        "Only 8, 16, and 32 bit elements can be extended.");
13688 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13689 assert(0 <= Offset && "Extension offset must be positive.");
13690 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13691        "Extension offset must be in the first lane or start an upper lane.");
13692
13693 // Check that an index is in same lane as the base offset.
13694 auto SafeOffset = [&](int Idx) {
13695 return OffsetLane == (Idx / NumEltsPerLane);
13696 };
13697
13698 // Shift along an input so that the offset base moves to the first element.
13699 auto ShuffleOffset = [&](SDValue V) {
13700 if (!Offset)
13701 return V;
13702
13703 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13704 for (int i = 0; i * Scale < NumElements; ++i) {
13705 int SrcIdx = i + Offset;
13706 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13707 }
13708 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13709 };
13710
13711 // Found a valid a/zext mask! Try various lowering strategies based on the
13712 // input type and available ISA extensions.
13713 if (Subtarget.hasSSE41()) {
13714 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13715 // PUNPCK will catch this in a later shuffle match.
13716 if (Offset && Scale == 2 && VT.is128BitVector())
13717 return SDValue();
13718 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13719 NumElements / Scale);
13720 InputV = ShuffleOffset(InputV);
13721 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13722 DL, ExtVT, InputV, DAG);
13723 return DAG.getBitcast(VT, InputV);
13724 }
13725
13726 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13727
13728 // For any extends we can cheat for larger element sizes and use shuffle
13729 // instructions that can fold with a load and/or copy.
13730 if (AnyExt && EltBits == 32) {
13731 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13732 -1};
13733 return DAG.getBitcast(
13734 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13735 DAG.getBitcast(MVT::v4i32, InputV),
13736 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13737 }
13738 if (AnyExt && EltBits == 16 && Scale > 2) {
13739 int PSHUFDMask[4] = {Offset / 2, -1,
13740 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13741 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13742 DAG.getBitcast(MVT::v4i32, InputV),
13743 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13744 int PSHUFWMask[4] = {1, -1, -1, -1};
13745 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13746 return DAG.getBitcast(
13747 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13748 DAG.getBitcast(MVT::v8i16, InputV),
13749 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13750 }
13751
13752 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13753 // to 64-bits.
13754 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13755 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13756 assert(VT.is128BitVector() && "Unexpected vector width!");
13757
13758 int LoIdx = Offset * EltBits;
13759 SDValue Lo = DAG.getBitcast(
13760 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13761 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13762 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13763
13764 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13765 return DAG.getBitcast(VT, Lo);
13766
13767 int HiIdx = (Offset + 1) * EltBits;
13768 SDValue Hi = DAG.getBitcast(
13769 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13770 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13771 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13772 return DAG.getBitcast(VT,
13773 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13774 }
13775
13776 // If this would require more than 2 unpack instructions to expand, use
13777 // pshufb when available. We can only use more than 2 unpack instructions
13778 // when zero extending i8 elements which also makes it easier to use pshufb.
13779 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13780 assert(NumElements == 16 && "Unexpected byte vector width!");
13781 SDValue PSHUFBMask[16];
13782 for (int i = 0; i < 16; ++i) {
13783 int Idx = Offset + (i / Scale);
13784 if ((i % Scale == 0 && SafeOffset(Idx))) {
13785 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13786 continue;
13787 }
13788 PSHUFBMask[i] =
13789 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13790 }
13791 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13792 return DAG.getBitcast(
13793 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13794 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13795 }
13796
13797 // If we are extending from an offset, ensure we start on a boundary that
13798 // we can unpack from.
13799 int AlignToUnpack = Offset % (NumElements / Scale);
13800 if (AlignToUnpack) {
13801 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13802 for (int i = AlignToUnpack; i < NumElements; ++i)
13803 ShMask[i - AlignToUnpack] = i;
13804 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13805 Offset -= AlignToUnpack;
13806 }
13807
13808 // Otherwise emit a sequence of unpacks.
13809 do {
13810 unsigned UnpackLoHi = X86ISD::UNPCKL;
13811 if (Offset >= (NumElements / 2)) {
13812 UnpackLoHi = X86ISD::UNPCKH;
13813 Offset -= (NumElements / 2);
13814 }
13815
13816 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13817 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13818 : getZeroVector(InputVT, Subtarget, DAG, DL);
13819 InputV = DAG.getBitcast(InputVT, InputV);
13820 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13821 Scale /= 2;
13822 EltBits *= 2;
13823 NumElements /= 2;
13824 } while (Scale > 1);
13825 return DAG.getBitcast(VT, InputV);
13826}
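The final unpack loop above widens the elements by repeatedly interleaving the input with zeros (or undef for any-extends). For illustration only, here is a small standalone C++ sketch of one UNPCKL-with-zero step; it is not LLVM code, and plain arrays stand in for vector registers.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical v16i8 input, In[i] = i + 1.
  std::array<uint8_t, 16> In;
  for (int i = 0; i < 16; ++i)
    In[i] = static_cast<uint8_t>(i + 1);

  // One UNPCKLBW(In, Zero) step: byte 2*i of the result is In[i], byte
  // 2*i+1 is 0, i.e. the low eight bytes get zero-extended in place.
  std::array<uint8_t, 16> Out;
  for (int i = 0; i < 8; ++i) {
    Out[2 * i] = In[i];
    Out[2 * i + 1] = 0;
  }

  // Read back as eight little-endian 16-bit lanes: prints 1 2 3 4 5 6 7 8.
  for (int i = 0; i < 8; ++i)
    std::printf("%d ", Out[2 * i] | (Out[2 * i + 1] << 8));
  std::printf("\n");
  return 0;
}

Repeating the step doubles the element width each time, which is why the loop halves Scale and NumElements until Scale reaches 1.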
13827
13828/// Try to lower a vector shuffle as a zero extension on any microarch.
13829///
13830/// This routine will try to do everything in its power to cleverly lower
13831/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13832/// check for the profitability of this lowering, it tries to aggressively
13833/// match this pattern. It will use all of the micro-architectural details it
13834/// can to emit an efficient lowering. It handles both blends with all-zero
13835/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13836/// masking out later).
13837///
13838/// The reason we have dedicated lowering for zext-style shuffles is that they
13839/// are both incredibly common and often quite performance sensitive.
13840static SDValue lowerShuffleAsZeroOrAnyExtend(
13841 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13842 const APInt &Zeroable, const X86Subtarget &Subtarget,
13843 SelectionDAG &DAG) {
13844 int Bits = VT.getSizeInBits();
13845 int NumLanes = Bits / 128;
13846 int NumElements = VT.getVectorNumElements();
13847 int NumEltsPerLane = NumElements / NumLanes;
13848 assert(VT.getScalarSizeInBits() <= 32 &&
13849        "Exceeds 32-bit integer zero extension limit");
13850 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13851
13852 // Define a helper function to check a particular ext-scale and lower to it if
13853 // valid.
13854 auto Lower = [&](int Scale) -> SDValue {
13855 SDValue InputV;
13856 bool AnyExt = true;
13857 int Offset = 0;
13858 int Matches = 0;
13859 for (int i = 0; i < NumElements; ++i) {
13860 int M = Mask[i];
13861 if (M < 0)
13862 continue; // Valid anywhere but doesn't tell us anything.
13863 if (i % Scale != 0) {
13864 // Each of the extended elements needs to be zeroable.
13865 if (!Zeroable[i])
13866 return SDValue();
13867
13868 // We no longer are in the anyext case.
13869 AnyExt = false;
13870 continue;
13871 }
13872
13873 // The base elements need to be consecutive indices into the
13874 // same input vector.
13875 SDValue V = M < NumElements ? V1 : V2;
13876 M = M % NumElements;
13877 if (!InputV) {
13878 InputV = V;
13879 Offset = M - (i / Scale);
13880 } else if (InputV != V)
13881 return SDValue(); // Flip-flopping inputs.
13882
13883 // Offset must start in the lowest 128-bit lane or at the start of an
13884 // upper lane.
13885 // FIXME: Is it ever worth allowing a negative base offset?
13886 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13887 (Offset % NumEltsPerLane) == 0))
13888 return SDValue();
13889
13890 // If we are offsetting, all referenced entries must come from the same
13891 // lane.
13892 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13893 return SDValue();
13894
13895 if ((M % NumElements) != (Offset + (i / Scale)))
13896 return SDValue(); // Non-consecutive strided elements.
13897 Matches++;
13898 }
13899
13900 // If we fail to find an input, we have a zero-shuffle which should always
13901 // have already been handled.
13902 // FIXME: Maybe handle this here in case during blending we end up with one?
13903 if (!InputV)
13904 return SDValue();
13905
13906 // If we are offsetting, don't extend if we only match a single input, we
13907 // can always do better by using a basic PSHUF or PUNPCK.
13908 if (Offset != 0 && Matches < 2)
13909 return SDValue();
13910
13911 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13912 InputV, Mask, Subtarget, DAG);
13913 };
13914
13915 // The widest scale possible for extending is to a 64-bit integer.
13916 assert(Bits % 64 == 0 &&
13917        "The number of bits in a vector must be divisible by 64 on x86!");
13918 int NumExtElements = Bits / 64;
13919
13920 // Each iteration, try extending the elements half as much, but into twice as
13921 // many elements.
13922 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13923 assert(NumElements % NumExtElements == 0 &&
13924        "The input vector size must be divisible by the extended size.");
13925 if (SDValue V = Lower(NumElements / NumExtElements))
13926 return V;
13927 }
13928
13929 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13930 if (Bits != 128)
13931 return SDValue();
13932
13933 // Returns one of the source operands if the shuffle can be reduced to a
13934 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13935 auto CanZExtLowHalf = [&]() {
13936 for (int i = NumElements / 2; i != NumElements; ++i)
13937 if (!Zeroable[i])
13938 return SDValue();
13939 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13940 return V1;
13941 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13942 return V2;
13943 return SDValue();
13944 };
13945
13946 if (SDValue V = CanZExtLowHalf()) {
13947 V = DAG.getBitcast(MVT::v2i64, V);
13948 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13949 return DAG.getBitcast(VT, V);
13950 }
13951
13952 // No viable ext lowering found.
13953 return SDValue();
13954}
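As a rough standalone restatement of the mask shape the Lower lambda accepts (simplified: single input, zero offset, no lane bookkeeping; this is not the LLVM routine itself):

#include <cstdio>
#include <vector>

// Simplified zext-mask check: mask entries < 0 are undef, every Scale-th
// entry must form a consecutive run, everything in between must be zeroable.
static bool isZExtMask(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int Scale) {
  for (int i = 0, e = (int)Mask.size(); i < e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // undef is acceptable anywhere
    if (i % Scale != 0) {
      if (!Zeroable[i])
        return false;           // the extended bits must be known zero
      continue;
    }
    if (M != i / Scale)
      return false;             // base elements must be consecutive
  }
  return true;
}

int main() {
  // Hypothetical v8i16 case: zero-extending the low four elements to 32 bits
  // looks like <0, z, 1, z, 2, z, 3, z>, where z (here 8..11) indexes an
  // all-zero second operand, so those positions are zeroable.
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<bool> Zeroable = {false, true, false, true,
                                false, true, false, true};
  std::printf("%d\n", isZExtMask(Mask, Zeroable, /*Scale=*/2)); // prints 1
  return 0;
}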
13955
13956/// Try to get a scalar value for a specific element of a vector.
13957///
13958/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13959static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13960 SelectionDAG &DAG) {
13961 MVT VT = V.getSimpleValueType();
13962 MVT EltVT = VT.getVectorElementType();
13963 V = peekThroughBitcasts(V);
13964
13965 // If the bitcasts shift the element size, we can't extract an equivalent
13966 // element from it.
13967 MVT NewVT = V.getSimpleValueType();
13968 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13969 return SDValue();
13970
13971 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13972 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13973 // Ensure the scalar operand is the same size as the destination.
13974 // FIXME: Add support for scalar truncation where possible.
13975 SDValue S = V.getOperand(Idx);
13976 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13977 return DAG.getBitcast(EltVT, S);
13978 }
13979
13980 return SDValue();
13981}
13982
13983/// Helper to test for a load that can be folded with x86 shuffles.
13984///
13985/// This is particularly important because the set of instructions varies
13986/// significantly based on whether the operand is a load or not.
13987static bool isShuffleFoldableLoad(SDValue V) {
13988 V = peekThroughBitcasts(V);
13989 return ISD::isNON_EXTLoad(V.getNode());
13990}
13991
13992/// Try to lower insertion of a single element into a zero vector.
13993///
13994 /// This is a common pattern for which we have especially efficient patterns to lower
13995/// across all subtarget feature sets.
13996static SDValue lowerShuffleAsElementInsertion(
13997 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13998 const APInt &Zeroable, const X86Subtarget &Subtarget,
13999 SelectionDAG &DAG) {
14000 MVT ExtVT = VT;
14001 MVT EltVT = VT.getVectorElementType();
14002
14003 int V2Index =
14004 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14005 Mask.begin();
14006 bool IsV1Zeroable = true;
14007 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14008 if (i != V2Index && !Zeroable[i]) {
14009 IsV1Zeroable = false;
14010 break;
14011 }
14012
14013 // Check for a single input from a SCALAR_TO_VECTOR node.
14014 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14015 // all the smarts here sunk into that routine. However, the current
14016 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14017 // vector shuffle lowering is dead.
14018 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14019 DAG);
14020 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14021 // We need to zext the scalar if it is smaller than an i32.
14022 V2S = DAG.getBitcast(EltVT, V2S);
14023 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14024 // Using zext to expand a narrow element won't work for non-zero
14025 // insertions.
14026 if (!IsV1Zeroable)
14027 return SDValue();
14028
14029 // Zero-extend directly to i32.
14030 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14031 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14032 }
14033 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14034 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14035 EltVT == MVT::i16) {
14036 // Either not inserting from the low element of the input or the input
14037 // element size is too small to use VZEXT_MOVL to clear the high bits.
14038 return SDValue();
14039 }
14040
14041 if (!IsV1Zeroable) {
14042 // If V1 can't be treated as a zero vector we have fewer options to lower
14043 // this. We can't support integer vectors or non-zero targets cheaply, and
14044 // the V1 elements can't be permuted in any way.
14045 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14046 if (!VT.isFloatingPoint() || V2Index != 0)
14047 return SDValue();
14048 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
14049 V1Mask[V2Index] = -1;
14050 if (!isNoopShuffleMask(V1Mask))
14051 return SDValue();
14052 if (!VT.is128BitVector())
14053 return SDValue();
14054
14055 // Otherwise, use MOVSD, MOVSS or MOVSH.
14056 unsigned MovOpc = 0;
14057 if (EltVT == MVT::f16)
14058 MovOpc = X86ISD::MOVSH;
14059 else if (EltVT == MVT::f32)
14060 MovOpc = X86ISD::MOVSS;
14061 else if (EltVT == MVT::f64)
14062 MovOpc = X86ISD::MOVSD;
14063 else
14064 llvm_unreachable("Unsupported floating point element type to handle!");
14065 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14066 }
14067
14068 // This lowering only works for the low element with floating point vectors.
14069 if (VT.isFloatingPoint() && V2Index != 0)
14070 return SDValue();
14071
14072 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14073 if (ExtVT != VT)
14074 V2 = DAG.getBitcast(VT, V2);
14075
14076 if (V2Index != 0) {
14077 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14078 // the desired position. Otherwise it is more efficient to do a vector
14079 // shift left. We know that we can do a vector shift left because all
14080 // the inputs are zero.
14081 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14082 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14083 V2Shuffle[V2Index] = 0;
14084 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14085 } else {
14086 V2 = DAG.getBitcast(MVT::v16i8, V2);
14087 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14088 DAG.getTargetConstant(
14089 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14090 V2 = DAG.getBitcast(VT, V2);
14091 }
14092 }
14093 return V2;
14094}
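For the non-trivial V2Index case above, the element is first moved to lane 0 (VZEXT_MOVL) and then shifted into position; the byte count for the VSHLDQ fallback is V2Index * EltSize / 8. A tiny standalone check with hypothetical values:

#include <cstdio>

int main() {
  // Hypothetical case: inserting into lane 3 of a v8i16 (16-bit elements).
  int V2Index = 3;
  int EltSizeInBits = 16;
  // The zero-extended element sits in lane 0 and is shifted left by whole
  // bytes to reach lane V2Index.
  std::printf("%d bytes\n", V2Index * EltSizeInBits / 8); // prints "6 bytes"
  return 0;
}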
14095
14096/// Try to lower broadcast of a single - truncated - integer element,
14097/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14098///
14099/// This assumes we have AVX2.
14100static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14101 int BroadcastIdx,
14102 const X86Subtarget &Subtarget,
14103 SelectionDAG &DAG) {
14104 assert(Subtarget.hasAVX2() &&
14105        "We can only lower integer broadcasts with AVX2!");
14106
14107 MVT EltVT = VT.getVectorElementType();
14108 MVT V0VT = V0.getSimpleValueType();
14109
14110 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14111 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14112
14113 MVT V0EltVT = V0VT.getVectorElementType();
14114 if (!V0EltVT.isInteger())
14115 return SDValue();
14116
14117 const unsigned EltSize = EltVT.getSizeInBits();
14118 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14119
14120 // This is only a truncation if the original element type is larger.
14121 if (V0EltSize <= EltSize)
14122 return SDValue();
14123
14124 assert(((V0EltSize % EltSize) == 0) &&
14125        "Scalar type sizes must all be powers of 2 on x86!");
14126
14127 const unsigned V0Opc = V0.getOpcode();
14128 const unsigned Scale = V0EltSize / EltSize;
14129 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14130
14131 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14132 V0Opc != ISD::BUILD_VECTOR)
14133 return SDValue();
14134
14135 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14136
14137 // If we're extracting non-least-significant bits, shift so we can truncate.
14138 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14139 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14140 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14141 if (const int OffsetIdx = BroadcastIdx % Scale)
14142 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14143 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14144
14145 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14146 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14147}
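The index arithmetic above (Scale, V0BroadcastIdx, OffsetIdx) just locates which scalar operand holds the requested narrow element and how far to shift it down before truncating. A standalone sketch with hypothetical widths, not LLVM code:

#include <cstdio>

int main() {
  unsigned EltSize = 16;      // broadcast element width
  unsigned V0EltSize = 64;    // width of the build_vector scalars
  unsigned BroadcastIdx = 5;  // narrow element requested by the shuffle

  unsigned Scale = V0EltSize / EltSize;           // 4 narrow elts per scalar
  unsigned V0BroadcastIdx = BroadcastIdx / Scale; // scalar operand 1
  unsigned OffsetIdx = BroadcastIdx % Scale;      // narrow elt 1 within it

  // The scalar is shifted right by OffsetIdx * EltSize bits, then truncated.
  std::printf("operand %u, srl %u, trunc to i%u\n", V0BroadcastIdx,
              OffsetIdx * EltSize, EltSize); // operand 1, srl 16, trunc to i16
  return 0;
}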
14148
14149/// Test whether this can be lowered with a single SHUFPS instruction.
14150///
14151/// This is used to disable more specialized lowerings when the shufps lowering
14152/// will happen to be efficient.
14153static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14154 // This routine only handles 128-bit shufps.
14155 assert(Mask.size() == 4 && "Unsupported mask size!");
14156 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14157 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14158 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14159 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14160
14161 // To lower with a single SHUFPS we need to have the low half and high half
14162 // each requiring a single input.
14163 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14164 return false;
14165 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14166 return false;
14167
14168 return true;
14169}
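Restated outside of LLVM for illustration: SHUFPS builds the low two result elements from one source and the high two from the other, so a mask qualifies iff each half reads from a single input. A minimal standalone version of the same check:

#include <cstdio>

static bool singleSHUFPS(const int M[4]) {
  // Undef elements (-1) are unconstrained; defined elements in each half
  // must agree on which input (0..3 vs 4..7) they come from.
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  int A[4] = {0, 3, 6, 5}; // low pair from V1, high pair from V2 -> ok
  int B[4] = {0, 4, 2, 6}; // low pair mixes V1 and V2 -> not ok
  std::printf("%d %d\n", singleSHUFPS(A), singleSHUFPS(B)); // prints: 1 0
  return 0;
}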
14170
14171/// If we are extracting two 128-bit halves of a vector and shuffling the
14172/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14173/// multi-shuffle lowering.
14174static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14175 SDValue N1, ArrayRef<int> Mask,
14176 SelectionDAG &DAG) {
14177 MVT VT = N0.getSimpleValueType();
14178 assert((VT.is128BitVector() &&
14179         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14180        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14181
14182 // Check that both sources are extracts of the same source vector.
14183 if (!N0.hasOneUse() || !N1.hasOneUse() ||
14184 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14185 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14186 N0.getOperand(0) != N1.getOperand(0))
14187 return SDValue();
14188
14189 SDValue WideVec = N0.getOperand(0);
14190 MVT WideVT = WideVec.getSimpleValueType();
14191 if (!WideVT.is256BitVector())
14192 return SDValue();
14193
14194 // Match extracts of each half of the wide source vector. Commute the shuffle
14195 // if the extract of the low half is N1.
14196 unsigned NumElts = VT.getVectorNumElements();
14197 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14198 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14199 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14200 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14201 ShuffleVectorSDNode::commuteMask(NewMask);
14202 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14203 return SDValue();
14204
14205 // Final bailout: if the mask is simple, we are better off using an extract
14206 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14207 // because that avoids a constant load from memory.
14208 if (NumElts == 4 &&
14209 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
14210 return SDValue();
14211
14212 // Extend the shuffle mask with undef elements.
14213 NewMask.append(NumElts, -1);
14214
14215 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14216 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14217 NewMask);
14218 // This is free: ymm -> xmm.
14219 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14220 DAG.getIntPtrConstant(0, DL));
14221}
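The mask rewrite above amounts to two small steps: if the low-half extract arrives as the second operand, commute the narrow mask so indices 0..3 refer to the low half of the wide vector; the narrow mask is then directly valid over the wide vector and only needs undef padding. A standalone sketch with a hypothetical v4f32-of-v8f32 mask:

#include <cstdio>
#include <vector>

// Swap which operand each defined mask index refers to (the effect of
// ShuffleVectorSDNode::commuteMask on a two-input mask).
static void commuteMask(std::vector<int> &M) {
  int N = (int)M.size();
  for (int &E : M)
    if (E >= 0)
      E = (E < N) ? E + N : E - N;
}

int main() {
  std::vector<int> Mask = {0, 7, 2, 5}; // shuffle of two extracted halves
  // Suppose the low-half extract showed up as the second operand: commute.
  commuteMask(Mask);                    // -> {4, 3, 6, 1}
  Mask.resize(8, -1);                   // widen with undef lanes
  for (int E : Mask)
    std::printf("%d ", E);              // prints: 4 3 6 1 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}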
14222
14223/// Try to lower broadcast of a single element.
14224///
14225/// For convenience, this code also bundles all of the subtarget feature set
14226/// filtering. While a little annoying to re-dispatch on type here, there isn't
14227/// a convenient way to factor it out.
14228static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14229 SDValue V2, ArrayRef<int> Mask,
14230 const X86Subtarget &Subtarget,
14231 SelectionDAG &DAG) {
14232 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14233 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14234 (Subtarget.hasAVX2() && VT.isInteger())))
14235 return SDValue();
14236
14237 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14238 // we can only broadcast from a register with AVX2.
14239 unsigned NumEltBits = VT.getScalarSizeInBits();
14240 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14241 ? X86ISD::MOVDDUP
14242 : X86ISD::VBROADCAST;
14243 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14244
14245 // Check that the mask is a broadcast.
14246 int BroadcastIdx = getSplatIndex(Mask);
14247 if (BroadcastIdx < 0)
14248 return SDValue();
14249 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14250                                           "a sorted mask where the broadcast "
14251                                           "comes from V1.");
14252
14253 // Go up the chain of (vector) values to find a scalar load that we can
14254 // combine with the broadcast.
14255 // TODO: Combine this logic with findEltLoadSrc() used by
14256 // EltsFromConsecutiveLoads().
14257 int BitOffset = BroadcastIdx * NumEltBits;
14258 SDValue V = V1;
14259 for (;;) {
14260 switch (V.getOpcode()) {
14261 case ISD::BITCAST: {
14262 V = V.getOperand(0);
14263 continue;
14264 }
14265 case ISD::CONCAT_VECTORS: {
14266 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14267 int OpIdx = BitOffset / OpBitWidth;
14268 V = V.getOperand(OpIdx);
14269 BitOffset %= OpBitWidth;
14270 continue;
14271 }
14272 case ISD::EXTRACT_SUBVECTOR: {
14273 // The extraction index adds to the existing offset.
14274 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14275 unsigned Idx = V.getConstantOperandVal(1);
14276 unsigned BeginOffset = Idx * EltBitWidth;
14277 BitOffset += BeginOffset;
14278 V = V.getOperand(0);
14279 continue;
14280 }
14281 case ISD::INSERT_SUBVECTOR: {
14282 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14283 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14284 int Idx = (int)V.getConstantOperandVal(2);
14285 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14286 int BeginOffset = Idx * EltBitWidth;
14287 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14288 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14289 BitOffset -= BeginOffset;
14290 V = VInner;
14291 } else {
14292 V = VOuter;
14293 }
14294 continue;
14295 }
14296 }
14297 break;
14298 }
14299 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14300 BroadcastIdx = BitOffset / NumEltBits;
14301
14302 // Do we need to bitcast the source to retrieve the original broadcast index?
14303 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14304
14305 // Check if this is a broadcast of a scalar. We special case lowering
14306 // for scalars so that we can more effectively fold with loads.
14307 // If the original value has a larger element type than the shuffle, the
14308 // broadcast element is in essence truncated. Make that explicit to ease
14309 // folding.
14310 if (BitCastSrc && VT.isInteger())
14311 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14312 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14313 return TruncBroadcast;
14314
14315 // Also check the simpler case, where we can directly reuse the scalar.
14316 if (!BitCastSrc &&
14317 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14318 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14319 V = V.getOperand(BroadcastIdx);
14320
14321 // If we can't broadcast from a register, check that the input is a load.
14322 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14323 return SDValue();
14324 } else if (ISD::isNormalLoad(V.getNode()) &&
14325 cast<LoadSDNode>(V)->isSimple()) {
14326 // We do not check for one-use of the vector load because a broadcast load
14327 // is expected to be a win for code size, register pressure, and possibly
14328 // uops even if the original vector load is not eliminated.
14329
14330 // Reduce the vector load and shuffle to a broadcasted scalar load.
14331 LoadSDNode *Ld = cast<LoadSDNode>(V);
14332 SDValue BaseAddr = Ld->getOperand(1);
14333 MVT SVT = VT.getScalarType();
14334 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14335 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14336 SDValue NewAddr =
14337 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14338
14339 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14340 // than MOVDDUP.
14341 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14342 if (Opcode == X86ISD::VBROADCAST) {
14343 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14344 SDValue Ops[] = {Ld->getChain(), NewAddr};
14345 V = DAG.getMemIntrinsicNode(
14346 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14347 DAG.getMachineFunction().getMachineMemOperand(
14348 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14349 DAG.makeEquivalentMemoryOrdering(Ld, V);
14350 return DAG.getBitcast(VT, V);
14351 }
14352 assert(SVT == MVT::f64 && "Unexpected VT!");
14353 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14354 DAG.getMachineFunction().getMachineMemOperand(
14355 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14356 DAG.makeEquivalentMemoryOrdering(Ld, V);
14357 } else if (!BroadcastFromReg) {
14358 // We can't broadcast from a vector register.
14359 return SDValue();
14360 } else if (BitOffset != 0) {
14361 // We can only broadcast from the zero-element of a vector register,
14362 // but it can be advantageous to broadcast from the zero-element of a
14363 // subvector.
14364 if (!VT.is256BitVector() && !VT.is512BitVector())
14365 return SDValue();
14366
14367 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14368 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14369 return SDValue();
14370
14371 // Only broadcast the zero-element of a 128-bit subvector.
14372 if ((BitOffset % 128) != 0)
14373 return SDValue();
14374
14375 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14376        "Unexpected bit-offset");
14377 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14378        "Unexpected vector size");
14379 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14380 V = extract128BitVector(V, ExtractIdx, DAG, DL);
14381 }
14382
14383 // On AVX we can use VBROADCAST directly for scalar sources.
14384 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14385 V = DAG.getBitcast(MVT::f64, V);
14386 if (Subtarget.hasAVX()) {
14387 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14388 return DAG.getBitcast(VT, V);
14389 }
14390 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14391 }
14392
14393 // If this is a scalar, do the broadcast on this type and bitcast.
14394 if (!V.getValueType().isVector()) {
14395 assert(V.getScalarValueSizeInBits() == NumEltBits &&
14396        "Unexpected scalar size");
14397 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14398 VT.getVectorNumElements());
14399 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14400 }
14401
14402 // We only support broadcasting from 128-bit vectors to minimize the
14403 // number of patterns we need to deal with in isel. So extract down to
14404 // 128-bits, removing as many bitcasts as possible.
14405 if (V.getValueSizeInBits() > 128)
14406 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14407
14408 // Otherwise cast V to a vector with the same element type as VT, but
14409 // possibly narrower than VT. Then perform the broadcast.
14410 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14411 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14412 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14413}
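The entry check above only asks whether the mask is a broadcast, i.e. whether every defined element names the same source element. A minimal standalone version of that idea (not the LLVM getSplatIndex helper itself):

#include <cstdio>
#include <vector>

// Return the common source index of a splat mask, or -1 if the defined
// elements disagree.
static int splatIndex(const std::vector<int> &Mask) {
  int Idx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;        // undef matches anything
    if (Idx >= 0 && M != Idx)
      return -1;       // two different sources: not a broadcast
    Idx = M;
  }
  return Idx;
}

int main() {
  std::printf("%d\n", splatIndex({2, -1, 2, 2})); // prints 2
  std::printf("%d\n", splatIndex({0, 1, 0, 0}));  // prints -1
  return 0;
}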
14414
14415// Check for whether we can use INSERTPS to perform the shuffle. We only use
14416// INSERTPS when the V1 elements are already in the correct locations
14417// because otherwise we can just always use two SHUFPS instructions which
14418// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
14419// perform INSERTPS if a single V1 element is out of place and all V2
14420// elements are zeroable.
14421static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14422 unsigned &InsertPSMask,
14423 const APInt &Zeroable,
14424 ArrayRef<int> Mask, SelectionDAG &DAG) {
14425 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14426 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14427 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14428
14429 // Attempt to match INSERTPS with one element from VA or VB being
14430 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14431 // are updated.
14432 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
14433 ArrayRef<int> CandidateMask) {
14434 unsigned ZMask = 0;
14435 int VADstIndex = -1;
14436 int VBDstIndex = -1;
14437 bool VAUsedInPlace = false;
14438
14439 for (int i = 0; i < 4; ++i) {
14440 // Synthesize a zero mask from the zeroable elements (includes undefs).
14441 if (Zeroable[i]) {
14442 ZMask |= 1 << i;
14443 continue;
14444 }
14445
14446 // Flag if we use any VA inputs in place.
14447 if (i == CandidateMask[i]) {
14448 VAUsedInPlace = true;
14449 continue;
14450 }
14451
14452 // We can only insert a single non-zeroable element.
14453 if (VADstIndex >= 0 || VBDstIndex >= 0)
14454 return false;
14455
14456 if (CandidateMask[i] < 4) {
14457 // VA input out of place for insertion.
14458 VADstIndex = i;
14459 } else {
14460 // VB input for insertion.
14461 VBDstIndex = i;
14462 }
14463 }
14464
14465 // Don't bother if we have no (non-zeroable) element for insertion.
14466 if (VADstIndex < 0 && VBDstIndex < 0)
14467 return false;
14468
14469 // Determine element insertion src/dst indices. The src index is from the
14470 // start of the inserted vector, not the start of the concatenated vector.
14471 unsigned VBSrcIndex = 0;
14472 if (VADstIndex >= 0) {
14473 // If we have a VA input out of place, we use VA as the V2 element
14474 // insertion and don't use the original V2 at all.
14475 VBSrcIndex = CandidateMask[VADstIndex];
14476 VBDstIndex = VADstIndex;
14477 VB = VA;
14478 } else {
14479 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
14480 }
14481
14482 // If no V1 inputs are used in place, then the result is created only from
14483 // the zero mask and the V2 insertion - so remove V1 dependency.
14484 if (!VAUsedInPlace)
14485 VA = DAG.getUNDEF(MVT::v4f32);
14486
14487 // Update V1, V2 and InsertPSMask accordingly.
14488 V1 = VA;
14489 V2 = VB;
14490
14491 // Insert the V2 element into the desired position.
14492 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
14493 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
14494 return true;
14495 };
14496
14497 if (matchAsInsertPS(V1, V2, Mask))
14498 return true;
14499
14500 // Commute and try again.
14501 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
14502 ShuffleVectorSDNode::commuteMask(CommutedMask);
14503 if (matchAsInsertPS(V2, V1, CommutedMask))
14504 return true;
14505
14506 return false;
14507}
14508
14509static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
14510 ArrayRef<int> Mask, const APInt &Zeroable,
14511 SelectionDAG &DAG) {
14512 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14513 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14514
14515 // Attempt to match the insertps pattern.
14516 unsigned InsertPSMask = 0;
14517 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
14518 return SDValue();
14519
14520 // Insert the V2 element into the desired position.
14521 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
14522 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
14523}
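The 8-bit INSERTPS immediate assembled in matchShuffleAsInsertPS packs the source element into bits [7:6], the destination lane into bits [5:4], and a zero mask into bits [3:0]. A standalone sketch with hypothetical field values:

#include <cstdio>

int main() {
  unsigned VBSrcIndex = 2; // take element 2 of the inserted vector
  unsigned VBDstIndex = 1; // write it into destination lane 1
  unsigned ZMask = 0b1000; // additionally zero destination lane 3

  unsigned InsertPSMask = (VBSrcIndex << 6) | (VBDstIndex << 4) | ZMask;
  std::printf("0x%02X\n", InsertPSMask); // prints 0x98
  return 0;
}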
14524
14525/// Try to lower a shuffle as a permute of the inputs followed by an
14526/// UNPCK instruction.
14527///
14528 /// This specifically targets cases where we end up alternating between
14529/// the two inputs, and so can permute them into something that feeds a single
14530/// UNPCK instruction. Note that this routine only targets integer vectors
14531/// because for floating point vectors we have a generalized SHUFPS lowering
14532/// strategy that handles everything that doesn't *exactly* match an unpack,
14533/// making this clever lowering unnecessary.
14534static SDValue lowerShuffleAsPermuteAndUnpack(
14535 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14536 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14537 assert(!VT.isFloatingPoint() &&
14538        "This routine only supports integer vectors.");
14539 assert(VT.is128BitVector() &&
14540        "This routine only works on 128-bit vectors.");
14541 assert(!V2.isUndef() &&
14542        "This routine should only be used when blending two inputs.");
14543 assert(Mask.size() >= 2 && "Single element masks are invalid.");
14544
14545 int Size = Mask.size();
14546
14547 int NumLoInputs =
14548 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
14549 int NumHiInputs =
14550 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
14551
14552 bool UnpackLo = NumLoInputs >= NumHiInputs;
14553
14554 auto TryUnpack = [&](int ScalarSize, int Scale) {
14555 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
14556 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
14557
14558 for (int i = 0; i < Size; ++i) {
14559 if (Mask[i] < 0)
14560 continue;
14561
14562 // Each element of the unpack contains Scale elements from this mask.
14563 int UnpackIdx = i / Scale;
14564
14565 // We only handle the case where V1 feeds the first slots of the unpack.
14566 // We rely on canonicalization to ensure this is the case.
14567 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14568 return SDValue();
14569
14570 // Setup the mask for this input. The indexing is tricky as we have to
14571 // handle the unpack stride.
14572 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14573 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14574 Mask[i] % Size;
14575 }
14576
14577 // If we will have to shuffle both inputs to use the unpack, check whether
14578 // we can just unpack first and shuffle the result. If so, skip this unpack.
14579 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14580 !isNoopShuffleMask(V2Mask))
14581 return SDValue();
14582
14583 // Shuffle the inputs into place.
14584 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14585 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14586
14587 // Cast the inputs to the type we will use to unpack them.
14588 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14589 V1 = DAG.getBitcast(UnpackVT, V1);
14590 V2 = DAG.getBitcast(UnpackVT, V2);
14591
14592 // Unpack the inputs and cast the result back to the desired type.
14593 return DAG.getBitcast(
14594 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14595 UnpackVT, V1, V2));
14596 };
14597
14598 // We try each unpack from the largest to the smallest to try and find one
14599 // that fits this mask.
14600 int OrigScalarSize = VT.getScalarSizeInBits();
14601 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14602 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14603 return Unpack;
14604
14605 // If we're shuffling with a zero vector then we're better off not doing
14606 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14607 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14608 ISD::isBuildVectorAllZeros(V2.getNode()))
14609 return SDValue();
14610
14611 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14612 // initial unpack.
14613 if (NumLoInputs == 0 || NumHiInputs == 0) {
14614 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14615        "We have to have *some* inputs!");
14616 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14617
14618 // FIXME: We could consider the total complexity of the permute of each
14619 // possible unpacking. Or at the least we should consider how many
14620 // half-crossings are created.
14621 // FIXME: We could consider commuting the unpacks.
14622
14623 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14624 for (int i = 0; i < Size; ++i) {
14625 if (Mask[i] < 0)
14626 continue;
14627
14628 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14629
14630 PermMask[i] =
14631 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14632 }
14633 return DAG.getVectorShuffle(
14634 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14635 DL, VT, V1, V2),
14636 DAG.getUNDEF(VT), PermMask);
14637 }
14638
14639 return SDValue();
14640}
14641
14642/// Handle lowering of 2-lane 64-bit floating point shuffles.
14643///
14644/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14645/// support for floating point shuffles but not integer shuffles. These
14646/// instructions will incur a domain crossing penalty on some chips though so
14647/// it is better to avoid lowering through this for integer vectors where
14648/// possible.
14649static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14650 const APInt &Zeroable, SDValue V1, SDValue V2,
14651 const X86Subtarget &Subtarget,
14652 SelectionDAG &DAG) {
14653 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14654 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14655 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14656
14657 if (V2.isUndef()) {
14658 // Check for being able to broadcast a single element.
14659 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14660 Mask, Subtarget, DAG))
14661 return Broadcast;
14662
14663 // Straight shuffle of a single input vector. Simulate this by using the
14664 // single input as both of the "inputs" to this instruction.
14665 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14666
14667 if (Subtarget.hasAVX()) {
14668 // If we have AVX, we can use VPERMILPS which will allow folding a load
14669 // into the shuffle.
14670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14671 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14672 }
14673
14674 return DAG.getNode(
14675 X86ISD::SHUFP, DL, MVT::v2f64,
14676 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14677 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14678 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14679 }
14680 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14681 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14682 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14683 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14684
14685 if (Subtarget.hasAVX2())
14686 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14687 return Extract;
14688
14689 // When loading a scalar and then shuffling it into a vector we can often do
14690 // the insertion cheaply.
14691 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14692 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14693 return Insertion;
14694 // Try inverting the insertion since for v2 masks it is easy to do and we
14695 // can't reliably sort the mask one way or the other.
14696 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14697 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14698 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14699 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14700 return Insertion;
14701
14702 // Try to use one of the special instruction patterns to handle two common
14703 // blend patterns if a zero-blend above didn't work.
14704 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14705 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14706 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14707 // We can either use a special instruction to load over the low double or
14708 // to move just the low double.
14709 return DAG.getNode(
14710 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14711 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14712
14713 if (Subtarget.hasSSE41())
14714 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14715 Zeroable, Subtarget, DAG))
14716 return Blend;
14717
14718 // Use dedicated unpack instructions for masks that match their pattern.
14719 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14720 return V;
14721
14722 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14723 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14724 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14725}
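
For illustration, here is a minimal standalone sketch of the SHUFPD immediate math used above at lines 14665 and 14722 (plain C++, not part of X86ISelLowering.cpp; the helper names are invented). Bit 0 of the immediate picks the lane taken from the first source for result element 0, and bit 1 picks the lane taken from the second source for result element 1.

// Standalone illustration of the SHUFPD immediate computed above.
#include <cassert>

// Single-input case (line 14665): both SHUFPD "sources" are V1, and each
// mask element is 0 or 1 (undef elements simply test false and become 0).
static unsigned shufpdImmUnary(int M0, int M1) {
  return (M0 == 1) | ((M1 == 1) << 1);
}

// Two-input case (line 14722): Mask[1] indexes into V2 (values 2 or 3), so
// it is rebased by 2 before testing which lane of V2 is wanted.
static unsigned shufpdImmBinary(int M0, int M1) {
  return (M0 == 1) | (((M1 - 2) == 1) << 1);
}

int main() {
  assert(shufpdImmUnary(1, 0) == 0x1);  // <V1[1], V1[0]>
  assert(shufpdImmUnary(0, 0) == 0x0);  // splat of V1[0]
  assert(shufpdImmBinary(0, 3) == 0x2); // <V1[0], V2[1]>
  assert(shufpdImmBinary(1, 2) == 0x1); // <V1[1], V2[0]>
  return 0;
}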
14726
14727/// Handle lowering of 2-lane 64-bit integer shuffles.
14728///
14729/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14730/// the integer unit to minimize domain crossing penalties. However, for blends
14731/// it falls back to the floating point shuffle operation with appropriate bit
14732/// casting.
14733static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14734 const APInt &Zeroable, SDValue V1, SDValue V2,
14735 const X86Subtarget &Subtarget,
14736 SelectionDAG &DAG) {
14737 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14738 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14739 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14740
14741 if (V2.isUndef()) {
14742 // Check for being able to broadcast a single element.
14743 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14744 Mask, Subtarget, DAG))
14745 return Broadcast;
14746
14747 // Straight shuffle of a single input vector. For everything from SSE2
14748 // onward this has a single fast instruction with no scary immediates.
14749 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14750 V1 = DAG.getBitcast(MVT::v4i32, V1);
14751 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14752 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14753 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14754 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14755 return DAG.getBitcast(
14756 MVT::v2i64,
14757 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14758 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14759 }
14760 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14761 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14762 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14763 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14764
14765 if (Subtarget.hasAVX2())
14766 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14767 return Extract;
14768
14769 // Try to use shift instructions.
14770 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14771 Zeroable, Subtarget, DAG))
14772 return Shift;
14773
14774 // When loading a scalar and then shuffling it into a vector we can often do
14775 // the insertion cheaply.
14776 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14777 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14778 return Insertion;
14779 // Try inverting the insertion since for v2 masks it is easy to do and we
14780 // can't reliably sort the mask one way or the other.
14781 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14782 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14783 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14784 return Insertion;
14785
14786 // We have different paths for blend lowering, but they all must use the
14787 // *exact* same predicate.
14788 bool IsBlendSupported = Subtarget.hasSSE41();
14789 if (IsBlendSupported)
14790 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14791 Zeroable, Subtarget, DAG))
14792 return Blend;
14793
14794 // Use dedicated unpack instructions for masks that match their pattern.
14795 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14796 return V;
14797
14798 // Try to use byte rotation instructions.
14799 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14800 if (Subtarget.hasSSSE3()) {
14801 if (Subtarget.hasVLX())
14802 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14803 Subtarget, DAG))
14804 return Rotate;
14805
14806 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14807 Subtarget, DAG))
14808 return Rotate;
14809 }
14810
14811 // If we have direct support for blends, we should lower by decomposing into
14812 // a permute. That will be faster than the domain cross.
14813 if (IsBlendSupported)
14814 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14815 Subtarget, DAG);
14816
14817 // We implement this with SHUFPD which is pretty lame because it will likely
14818 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14819 // However, all the alternatives are still more cycles and newer chips don't
14820 // have this problem. It would be really nice if x86 had better shuffles here.
14821 V1 = DAG.getBitcast(MVT::v2f64, V1);
14822 V2 = DAG.getBitcast(MVT::v2f64, V2);
14823 return DAG.getBitcast(MVT::v2i64,
14824 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14825}
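
The single-input v2i64 path above (lines 14750-14758) widens the 64-bit mask into a 32-bit mask so PSHUFD can do the work. A standalone sketch of that widening, using an invented helper name rather than the LLVM API:

// Standalone sketch of the v2i64 -> v4i32 mask widening.
#include <array>
#include <cassert>

// Each 64-bit lane index M becomes the pair of 32-bit lane indices 2*M and
// 2*M+1; undef (-1) lanes stay undef.
static std::array<int, 4> widenV2ToV4(int M0, int M1) {
  return {M0 < 0 ? -1 : 2 * M0, M0 < 0 ? -1 : 2 * M0 + 1,
          M1 < 0 ? -1 : 2 * M1, M1 < 0 ? -1 : 2 * M1 + 1};
}

int main() {
  assert((widenV2ToV4(1, 0) == std::array<int, 4>{2, 3, 0, 1}));
  assert((widenV2ToV4(0, -1) == std::array<int, 4>{0, 1, -1, -1}));
  return 0;
}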
14826
14827/// Lower a vector shuffle using the SHUFPS instruction.
14828///
14829/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14830/// It makes no assumptions about whether this is the *best* lowering, it simply
14831/// uses it.
14832static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14833 ArrayRef<int> Mask, SDValue V1,
14834 SDValue V2, SelectionDAG &DAG) {
14835 SDValue LowV = V1, HighV = V2;
14836 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14837 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14838
14839 if (NumV2Elements == 1) {
14840 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14841
14842 // Compute the index adjacent to V2Index and in the same half by toggling
14843 // the low bit.
14844 int V2AdjIndex = V2Index ^ 1;
14845
14846 if (Mask[V2AdjIndex] < 0) {
14847 // Handles all the cases where we have a single V2 element and an undef.
14848 // This will only ever happen in the high lanes because we commute the
14849 // vector otherwise.
14850 if (V2Index < 2)
14851 std::swap(LowV, HighV);
14852 NewMask[V2Index] -= 4;
14853 } else {
14854 // Handle the case where the V2 element ends up adjacent to a V1 element.
14855 // To make this work, blend them together as the first step.
14856 int V1Index = V2AdjIndex;
14857 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14858 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14859 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14860
14861 // Now proceed to reconstruct the final blend as we have the necessary
14862 // high or low half formed.
14863 if (V2Index < 2) {
14864 LowV = V2;
14865 HighV = V1;
14866 } else {
14867 HighV = V2;
14868 }
14869 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14870 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14871 }
14872 } else if (NumV2Elements == 2) {
14873 if (Mask[0] < 4 && Mask[1] < 4) {
14874 // Handle the easy case where we have V1 in the low lanes and V2 in the
14875 // high lanes.
14876 NewMask[2] -= 4;
14877 NewMask[3] -= 4;
14878 } else if (Mask[2] < 4 && Mask[3] < 4) {
14879 // We also handle the reversed case because this utility may get called
14880 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14881 // arrange things in the right direction.
14882 NewMask[0] -= 4;
14883 NewMask[1] -= 4;
14884 HighV = V1;
14885 LowV = V2;
14886 } else {
14887 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14888 // trying to place elements directly, just blend them and set up the final
14889 // shuffle to place them.
14890
14891 // The first two blend mask elements are for V1, the second two are for
14892 // V2.
14893 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14894 Mask[2] < 4 ? Mask[2] : Mask[3],
14895 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14896 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14897 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14898 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14899
14900 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14901 // a blend.
14902 LowV = HighV = V1;
14903 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14904 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14905 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14906 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14907 }
14908 } else if (NumV2Elements == 3) {
14909 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14910 // we can get here due to other paths (e.g. repeated mask matching) where we
14911 // don't want to do another round of lowerVECTOR_SHUFFLE.
14912 ShuffleVectorSDNode::commuteMask(NewMask);
14913 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14914 }
14915 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14916 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14917}
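
The 4-lane shuffle paths in this file feed getV4X86ShuffleImm8ForMask, which packs one 2-bit source index per result element into an 8-bit immediate, element 0 in the low bits. A hedged standalone sketch of that packing (packShuffleImm8 is an invented name; the real helper also has to pick values for undef lanes and wraps the result in a target constant):

// Standalone sketch of the 2-bits-per-lane immediate packing. Undef lanes
// simply default to 0 here; the real helper is free to pick any value.
#include <cassert>

static unsigned packShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef (-1) as lane 0
    Imm |= (unsigned)M << (2 * i);     // element i lives in bits [2*i+1 : 2*i]
  }
  return Imm;
}

int main() {
  const int Identity[4] = {0, 1, 2, 3};
  const int Reverse[4] = {3, 2, 1, 0};
  assert(packShuffleImm8(Identity) == 0xE4); // 11 10 01 00
  assert(packShuffleImm8(Reverse) == 0x1B);  // 00 01 10 11
  return 0;
}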
14918
14919/// Lower 4-lane 32-bit floating point shuffles.
14920///
14921/// Uses instructions exclusively from the floating point unit to minimize
14922/// domain crossing penalties, as these are sufficient to implement all v4f32
14923/// shuffles.
14924static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14925 const APInt &Zeroable, SDValue V1, SDValue V2,
14926 const X86Subtarget &Subtarget,
14927 SelectionDAG &DAG) {
14928 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14929 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14930 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14931
14932 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14933
14934 if (NumV2Elements == 0) {
14935 // Check for being able to broadcast a single element.
14936 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14937 Mask, Subtarget, DAG))
14938 return Broadcast;
14939
14940 // Use even/odd duplicate instructions for masks that match their pattern.
14941 if (Subtarget.hasSSE3()) {
14942 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14943 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14944 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14945 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14946 }
14947
14948 if (Subtarget.hasAVX()) {
14949 // If we have AVX, we can use VPERMILPS which will allow folding a load
14950 // into the shuffle.
14951 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14952 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14953 }
14954
14955 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14956 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14957 if (!Subtarget.hasSSE2()) {
14958 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14959 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14960 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14961 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14962 }
14963
14964 // Otherwise, use a straight shuffle of a single input vector. We pass the
14965 // input vector to both operands to simulate this with a SHUFPS.
14966 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14967 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14968 }
14969
14970 if (Subtarget.hasAVX2())
14971 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14972 return Extract;
14973
14974 // There are special ways we can lower some single-element blends. However, we
14975 // also have custom lowerings for more complex single-element blends below,
14976 // which we defer to if both this and BLENDPS fail to match. So restrict this
14977 // to when the V2 input is targeting element 0 of the mask -- that is the fast
14978 // case here.
14979 if (NumV2Elements == 1 && Mask[0] >= 4)
14980 if (SDValue V = lowerShuffleAsElementInsertion(
14981 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14982 return V;
14983
14984 if (Subtarget.hasSSE41()) {
14985 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14986 Zeroable, Subtarget, DAG))
14987 return Blend;
14988
14989 // Use INSERTPS if we can complete the shuffle efficiently.
14990 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14991 return V;
14992
14993 if (!isSingleSHUFPSMask(Mask))
14994 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14995 V2, Mask, DAG))
14996 return BlendPerm;
14997 }
14998
14999 // Use low/high mov instructions. These are only valid in SSE1 because
15000 // otherwise they are widened to v2f64 and never get here.
15001 if (!Subtarget.hasSSE2()) {
15002 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15003 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15004 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15005 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15006 }
15007
15008 // Use dedicated unpack instructions for masks that match their pattern.
15009 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15010 return V;
15011
15012 // Otherwise fall back to a SHUFPS lowering strategy.
15013 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15014}
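
The MOVSLDUP/MOVSHDUP checks above compare the incoming mask against the fixed patterns {0,0,2,2} and {1,1,3,3}, treating undef lanes as wildcards. A simplified stand-in for that comparison (masksEquivalent is an invented name; the real isShuffleEquivalent also folds in knowledge about the shuffle operands):

// Simplified take on the mask-equivalence test used above.
#include <cassert>
#include <vector>

// A mask matches a reference pattern if every defined element agrees; undef
// (-1) elements act as wildcards.
static bool masksEquivalent(const std::vector<int> &Mask,
                            const std::vector<int> &Ref) {
  if (Mask.size() != Ref.size())
    return false;
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Ref[i])
      return false;
  return true;
}

int main() {
  const std::vector<int> EvenDup = {0, 0, 2, 2}; // MOVSLDUP pattern
  const std::vector<int> OddDup = {1, 1, 3, 3};  // MOVSHDUP pattern
  const std::vector<int> A = {0, -1, 2, -1};     // still even-duplicate
  const std::vector<int> B = {1, 1, 3, 2};       // not odd-duplicate
  assert(masksEquivalent(A, EvenDup));
  assert(!masksEquivalent(B, OddDup));
  return 0;
}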
15015
15016/// Lower 4-lane i32 vector shuffles.
15017///
15018/// We try to handle these with integer-domain shuffles where we can, but for
15019/// blends we use the floating point domain blend instructions.
15020static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15021 const APInt &Zeroable, SDValue V1, SDValue V2,
15022 const X86Subtarget &Subtarget,
15023 SelectionDAG &DAG) {
15024 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15025 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15026 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15027
15028 // Whenever we can lower this as a zext, that instruction is strictly faster
15029 // than any alternative. It also allows us to fold memory operands into the
15030 // shuffle in many cases.
15031 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15032 Zeroable, Subtarget, DAG))
15033 return ZExt;
15034
15035 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15036
15037 if (NumV2Elements == 0) {
15038 // Try to use broadcast unless the mask only has one non-undef element.
15039 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15040 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15041 Mask, Subtarget, DAG))
15042 return Broadcast;
15043 }
15044
15045 // Straight shuffle of a single input vector. For everything from SSE2
15046 // onward this has a single fast instruction with no scary immediates.
15047 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15048 // but we aren't actually going to use the UNPCK instruction because doing
15049 // so prevents folding a load into this instruction or making a copy.
15050 const int UnpackLoMask[] = {0, 0, 1, 1};
15051 const int UnpackHiMask[] = {2, 2, 3, 3};
15052 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15053 Mask = UnpackLoMask;
15054 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15055 Mask = UnpackHiMask;
15056
15057 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15058 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15059 }
15060
15061 if (Subtarget.hasAVX2())
15062 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15063 return Extract;
15064
15065 // Try to use shift instructions.
15066 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
15067 Zeroable, Subtarget, DAG))
15068 return Shift;
15069
15070 // There are special ways we can lower some single-element blends.
15071 if (NumV2Elements == 1)
15072 if (SDValue V = lowerShuffleAsElementInsertion(
15073 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15074 return V;
15075
15076 // We have different paths for blend lowering, but they all must use the
15077 // *exact* same predicate.
15078 bool IsBlendSupported = Subtarget.hasSSE41();
15079 if (IsBlendSupported)
15080 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15081 Zeroable, Subtarget, DAG))
15082 return Blend;
15083
15084 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15085 Zeroable, Subtarget, DAG))
15086 return Masked;
15087
15088 // Use dedicated unpack instructions for masks that match their pattern.
15089 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15090 return V;
15091
15092 // Try to use byte rotation instructions.
15093 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15094 if (Subtarget.hasSSSE3()) {
15095 if (Subtarget.hasVLX())
15096 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15097 Subtarget, DAG))
15098 return Rotate;
15099
15100 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15101 Subtarget, DAG))
15102 return Rotate;
15103 }
15104
15105 // Assume that a single SHUFPS is faster than an alternative sequence of
15106 // multiple instructions (even if the CPU has a domain penalty).
15107 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15108 if (!isSingleSHUFPSMask(Mask)) {
15109 // If we have direct support for blends, we should lower by decomposing into
15110 // a permute. That will be faster than the domain cross.
15111 if (IsBlendSupported)
15112 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15113 Subtarget, DAG);
15114
15115 // Try to lower by permuting the inputs into an unpack instruction.
15116 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15117 Mask, Subtarget, DAG))
15118 return Unpack;
15119 }
15120
15121 // We implement this with SHUFPS because it can blend from two vectors.
15122 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15123 // up the inputs, bypassing domain shift penalties that we would incur if we
15124 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15125 // relevant.
15126 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15127 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15128 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15129 return DAG.getBitcast(MVT::v4i32, ShufPS);
15130}
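
In the unary v4i32 path above (lines 15045-15058), a mask equivalent to {0,0,1,1} or {2,2,3,3} is canonicalized to exactly that pattern and then emitted as a PSHUFD rather than an UNPCK so a load can still be folded. The resulting immediates, worked out with the same two-bits-per-lane packing shown earlier:

// Worked immediates for the two coerced unary masks above, with lane 0 in
// the low two bits of the immediate.
#include <cassert>

int main() {
  unsigned ImmLo = 0 | (0 << 2) | (1 << 4) | (1 << 6); // mask {0, 0, 1, 1}
  unsigned ImmHi = 2 | (2 << 2) | (3 << 4) | (3 << 6); // mask {2, 2, 3, 3}
  assert(ImmLo == 0x50); // PSHUFD $0x50
  assert(ImmHi == 0xFA); // PSHUFD $0xFA
  return 0;
}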
15131
15132/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15133/// shuffle lowering, and the most complex part.
15134///
15135/// The lowering strategy is to try to form pairs of input lanes which are
15136/// targeted at the same half of the final vector, and then use a dword shuffle
15137/// to place them onto the right half, and finally unpack the paired lanes into
15138/// their final position.
15139///
15140/// The exact breakdown of how to form these dword pairs and align them on the
15141/// correct sides is really tricky. See the comments within the function for
15142/// more of the details.
15143///
15144/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15145/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15146/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15147/// vector, form the analogous 128-bit 8-element Mask.
15148static SDValue lowerV8I16GeneralSingleInputShuffle(
15149 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15150 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15151 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15152 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15153
15154 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15155 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15156 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15157
15158 // Attempt to directly match PSHUFLW or PSHUFHW.
15159 if (isUndefOrInRange(LoMask, 0, 4) &&
15160 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15161 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15162 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15163 }
15164 if (isUndefOrInRange(HiMask, 4, 8) &&
15165 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15166 for (int i = 0; i != 4; ++i)
15167 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15168 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15169 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15170 }
15171
15172 SmallVector<int, 4> LoInputs;
15173 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15174 array_pod_sort(LoInputs.begin(), LoInputs.end());
15175 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15176 SmallVector<int, 4> HiInputs;
15177 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15178 array_pod_sort(HiInputs.begin(), HiInputs.end());
15179 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15180 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15181 int NumHToL = LoInputs.size() - NumLToL;
15182 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15183 int NumHToH = HiInputs.size() - NumLToH;
15184 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15185 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15186 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15187 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15188
15189 // If we are shuffling values from one half - check how many different DWORD
15190 // pairs we need to create. If only 1 or 2 then we can perform this as a
15191 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15192 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15193 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15194 V = DAG.getNode(ShufWOp, DL, VT, V,
15195 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15196 V = DAG.getBitcast(PSHUFDVT, V);
15197 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15198 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15199 return DAG.getBitcast(VT, V);
15200 };
15201
15202 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15203 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15204 SmallVector<std::pair<int, int>, 4> DWordPairs;
15205 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15206
15207 // Collect the different DWORD pairs.
15208 for (int DWord = 0; DWord != 4; ++DWord) {
15209 int M0 = Mask[2 * DWord + 0];
15210 int M1 = Mask[2 * DWord + 1];
15211 M0 = (M0 >= 0 ? M0 % 4 : M0);
15212 M1 = (M1 >= 0 ? M1 % 4 : M1);
15213 if (M0 < 0 && M1 < 0)
15214 continue;
15215
15216 bool Match = false;
15217 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15218 auto &DWordPair = DWordPairs[j];
15219 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15220 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15221 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15222 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15223 PSHUFDMask[DWord] = DOffset + j;
15224 Match = true;
15225 break;
15226 }
15227 }
15228 if (!Match) {
15229 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15230 DWordPairs.push_back(std::make_pair(M0, M1));
15231 }
15232 }
15233
15234 if (DWordPairs.size() <= 2) {
15235 DWordPairs.resize(2, std::make_pair(-1, -1));
15236 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15237 DWordPairs[1].first, DWordPairs[1].second};
15238 if ((NumHToL + NumHToH) == 0)
15239 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15240 if ((NumLToL + NumLToH) == 0)
15241 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15242 }
15243 }
15244
15245 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15246 // such inputs we can swap two of the dwords across the half mark and end up
15247 // with <=2 inputs to each half in each half. Once there, we can fall through
15248 // to the generic code below. For example:
15249 //
15250 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15251 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15252 //
15253 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15254 // and an existing 2-into-2 on the other half. In this case we may have to
15255 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15256 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15257 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15258 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15259 // half than the one we target for fixing) will be fixed when we re-enter this
15260 // path. We will also combine any resulting sequence of PSHUFD instructions
15261 // into a single instruction. Here is an example of the tricky case:
15262 //
15263 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15264 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15265 //
15266 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15267 //
15268 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15269 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15270 //
15271 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15272 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15273 //
15274 // The result is fine to be handled by the generic logic.
15275 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15276 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15277 int AOffset, int BOffset) {
15278 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15279 "Must call this with A having 3 or 1 inputs from the A half.");
15280 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15281 "Must call this with B having 1 or 3 inputs from the B half.");
15282 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15283 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15284
15285 bool ThreeAInputs = AToAInputs.size() == 3;
15286
15287 // Compute the index of the dword with only one word among the three inputs in
15288 // a half by taking the sum of the half with three inputs and subtracting
15289 // the sum of the actual three inputs. The difference is the remaining
15290 // slot.
15291 int ADWord = 0, BDWord = 0;
15292 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15293 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15294 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15295 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15296 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15297 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15298 int TripleNonInputIdx =
15299 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15300 TripleDWord = TripleNonInputIdx / 2;
15301
15302 // We use xor with one to compute the adjacent DWord to whichever one the
15303 // OneInput is in.
15304 OneInputDWord = (OneInput / 2) ^ 1;
15305
15306 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15307 // and BToA inputs. If there is also such a problem with the BToB and AToB
15308 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15309 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15310 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15311 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15312 // Compute how many inputs will be flipped by swapping these DWords.
15313 // We need to balance this to ensure we don't form a 3-1 shuffle in
15314 // the other half.
15316 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15317 llvm::count(AToBInputs, 2 * ADWord + 1);
15318 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15319 llvm::count(BToBInputs, 2 * BDWord + 1);
15320 if ((NumFlippedAToBInputs == 1 &&
15321 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15322 (NumFlippedBToBInputs == 1 &&
15323 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15324 // We choose whether to fix the A half or B half based on whether that
15325 // half has zero flipped inputs. At zero, we may not be able to fix it
15326 // with that half. We also bias towards fixing the B half because that
15327 // will more commonly be the high half, and we have to bias one way.
15328 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15329 ArrayRef<int> Inputs) {
15330 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15331 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15332 // Determine whether the free index is in the flipped dword or the
15333 // unflipped dword based on where the pinned index is. We use this bit
15334 // in an xor to conditionally select the adjacent dword.
15335 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15336 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15337 if (IsFixIdxInput == IsFixFreeIdxInput)
15338 FixFreeIdx += 1;
15339 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15340 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15341 "We need to be changing the number of flipped inputs!");
15342 int PSHUFHalfMask[] = {0, 1, 2, 3};
15343 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15344 V = DAG.getNode(
15345 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15346 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15347 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15348
15349 for (int &M : Mask)
15350 if (M >= 0 && M == FixIdx)
15351 M = FixFreeIdx;
15352 else if (M >= 0 && M == FixFreeIdx)
15353 M = FixIdx;
15354 };
15355 if (NumFlippedBToBInputs != 0) {
15356 int BPinnedIdx =
15357 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15358 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15359 } else {
15360 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15361 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15362 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15363 }
15364 }
15365 }
15366
15367 int PSHUFDMask[] = {0, 1, 2, 3};
15368 PSHUFDMask[ADWord] = BDWord;
15369 PSHUFDMask[BDWord] = ADWord;
15370 V = DAG.getBitcast(
15371 VT,
15372 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15373 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15374
15375 // Adjust the mask to match the new locations of A and B.
15376 for (int &M : Mask)
15377 if (M >= 0 && M/2 == ADWord)
15378 M = 2 * BDWord + M % 2;
15379 else if (M >= 0 && M/2 == BDWord)
15380 M = 2 * ADWord + M % 2;
15381
15382 // Recurse back into this routine to re-compute state now that this isn't
15383 // a 3 and 1 problem.
15384 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15385 };
15386 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15387 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15388 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15389 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15390
15391 // At this point there are at most two inputs to the low and high halves from
15392 // each half. That means the inputs can always be grouped into dwords and
15393 // those dwords can then be moved to the correct half with a dword shuffle.
15394 // We use at most one low and one high word shuffle to collect these paired
15395 // inputs into dwords, and finally a dword shuffle to place them.
15396 int PSHUFLMask[4] = {-1, -1, -1, -1};
15397 int PSHUFHMask[4] = {-1, -1, -1, -1};
15398 int PSHUFDMask[4] = {-1, -1, -1, -1};
15399
15400 // First fix the masks for all the inputs that are staying in their
15401 // original halves. This will then dictate the targets of the cross-half
15402 // shuffles.
15403 auto fixInPlaceInputs =
15404 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15405 MutableArrayRef<int> SourceHalfMask,
15406 MutableArrayRef<int> HalfMask, int HalfOffset) {
15407 if (InPlaceInputs.empty())
15408 return;
15409 if (InPlaceInputs.size() == 1) {
15410 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15411 InPlaceInputs[0] - HalfOffset;
15412 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15413 return;
15414 }
15415 if (IncomingInputs.empty()) {
15416 // Just fix all of the in place inputs.
15417 for (int Input : InPlaceInputs) {
15418 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15419 PSHUFDMask[Input / 2] = Input / 2;
15420 }
15421 return;
15422 }
15423
15424 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15425 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15426 InPlaceInputs[0] - HalfOffset;
15427 // Put the second input next to the first so that they are packed into
15428 // a dword. We find the adjacent index by toggling the low bit.
15429 int AdjIndex = InPlaceInputs[0] ^ 1;
15430 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15431 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15432 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15433 };
15434 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15435 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15436
15437 // Now gather the cross-half inputs and place them into a free dword of
15438 // their target half.
15439 // FIXME: This operation could almost certainly be simplified dramatically to
15440 // look more like the 3-1 fixing operation.
15441 auto moveInputsToRightHalf = [&PSHUFDMask](
15442 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15443 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15444 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15445 int DestOffset) {
15446 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15447 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15448 };
15449 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15450 int Word) {
15451 int LowWord = Word & ~1;
15452 int HighWord = Word | 1;
15453 return isWordClobbered(SourceHalfMask, LowWord) ||
15454 isWordClobbered(SourceHalfMask, HighWord);
15455 };
15456
15457 if (IncomingInputs.empty())
15458 return;
15459
15460 if (ExistingInputs.empty()) {
15461 // Map any dwords with inputs from them into the right half.
15462 for (int Input : IncomingInputs) {
15463 // If the source half mask maps over the inputs, turn those into
15464 // swaps and use the swapped lane.
15465 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15466 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15467 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15468 Input - SourceOffset;
15469 // We have to swap the uses in our half mask in one sweep.
15470 for (int &M : HalfMask)
15471 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15472 M = Input;
15473 else if (M == Input)
15474 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15475 } else {
15476 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15477 Input - SourceOffset &&
15478 "Previous placement doesn't match!");
15479 }
15480 // Note that this correctly re-maps both when we do a swap and when
15481 // we observe the other side of the swap above. We rely on that to
15482 // avoid swapping the members of the input list directly.
15483 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15484 }
15485
15486 // Map the input's dword into the correct half.
15487 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15488 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15489 else
15490 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
15491 Input / 2 &&
15492 "Previous placement doesn't match!");
15493 }
15494
15495 // And just directly shift any other-half mask elements to be same-half
15496 // as we will have mirrored the dword containing the element into the
15497 // same position within that half.
15498 for (int &M : HalfMask)
15499 if (M >= SourceOffset && M < SourceOffset + 4) {
15500 M = M - SourceOffset + DestOffset;
15501 assert(M >= 0 && "This should never wrap below zero!");
15502 }
15503 return;
15504 }
15505
15506 // Ensure we have the input in a viable dword of its current half. This
15507 // is particularly tricky because the original position may be clobbered
15508 // by inputs being moved and *staying* in that half.
15509 if (IncomingInputs.size() == 1) {
15510 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15511 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15512 SourceOffset;
15513 SourceHalfMask[InputFixed - SourceOffset] =
15514 IncomingInputs[0] - SourceOffset;
15515 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15516 InputFixed);
15517 IncomingInputs[0] = InputFixed;
15518 }
15519 } else if (IncomingInputs.size() == 2) {
15520 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15521 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15522 // We have two non-adjacent or clobbered inputs we need to extract from
15523 // the source half. To do this, we need to map them into some adjacent
15524 // dword slot in the source mask.
15525 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15526 IncomingInputs[1] - SourceOffset};
15527
15528 // If there is a free slot in the source half mask adjacent to one of
15529 // the inputs, place the other input in it. We use (Index XOR 1) to
15530 // compute an adjacent index.
15531 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15532 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15533 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15534 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15535 InputsFixed[1] = InputsFixed[0] ^ 1;
15536 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15537 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15538 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15539 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15540 InputsFixed[0] = InputsFixed[1] ^ 1;
15541 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15542 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15543 // The two inputs are in the same DWord but it is clobbered and the
15544 // adjacent DWord isn't used at all. Move both inputs to the free
15545 // slot.
15546 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15547 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15548 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15549 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
15550 } else {
15551 // The only way we hit this point is if there is no clobbering
15552 // (because there are no off-half inputs to this half) and there is no
15553 // free slot adjacent to one of the inputs. In this case, we have to
15554 // swap an input with a non-input.
15555 for (int i = 0; i < 4; ++i)
15556 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
15557 "We can't handle any clobbers here!");
15558 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15559 "Cannot have adjacent inputs here!");
15560
15561 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15562 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15563
15564 // We also have to update the final source mask in this case because
15565 // it may need to undo the above swap.
15566 for (int &M : FinalSourceHalfMask)
15567 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15568 M = InputsFixed[1] + SourceOffset;
15569 else if (M == InputsFixed[1] + SourceOffset)
15570 M = (InputsFixed[0] ^ 1) + SourceOffset;
15571
15572 InputsFixed[1] = InputsFixed[0] ^ 1;
15573 }
15574
15575 // Point everything at the fixed inputs.
15576 for (int &M : HalfMask)
15577 if (M == IncomingInputs[0])
15578 M = InputsFixed[0] + SourceOffset;
15579 else if (M == IncomingInputs[1])
15580 M = InputsFixed[1] + SourceOffset;
15581
15582 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15583 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15584 }
15585 } else {
15586 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15586)
;
15587 }
15588
15589 // Now hoist the DWord down to the right half.
15590 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15591 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15592 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15593 for (int &M : HalfMask)
15594 for (int Input : IncomingInputs)
15595 if (M == Input)
15596 M = FreeDWord * 2 + Input % 2;
15597 };
15598 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15599 /*SourceOffset*/ 4, /*DestOffset*/ 0);
15600 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15601 /*SourceOffset*/ 0, /*DestOffset*/ 4);
15602
15603 // Now enact all the shuffles we've computed to move the inputs into their
15604 // target half.
15605 if (!isNoopShuffleMask(PSHUFLMask))
15606 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15607 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15608 if (!isNoopShuffleMask(PSHUFHMask))
15609 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15610 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15611 if (!isNoopShuffleMask(PSHUFDMask))
15612 V = DAG.getBitcast(
15613 VT,
15614 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15615 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15616
15617 // At this point, each half should contain all its inputs, and we can then
15618 // just shuffle them into their final position.
15619 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15620 "Failed to lift all the high half inputs to the low mask!");
15621 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15622 "Failed to lift all the low half inputs to the high mask!");
15623
15624 // Do a half shuffle for the low mask.
15625 if (!isNoopShuffleMask(LoMask))
15626 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15627 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15628
15629 // Do a half shuffle with the high mask after shifting its values down.
15630 for (int &M : HiMask)
15631 if (M >= 0)
15632 M -= 4;
15633 if (!isNoopShuffleMask(HiMask))
15634 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15635 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15636
15637 return V;
15638}
15639
15640/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15641/// blend if only one input is used.
15642static SDValue lowerShuffleAsBlendOfPSHUFBs(
15643 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15644 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15645 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15646 "Lane crossing shuffle masks not supported");
15647
15648 int NumBytes = VT.getSizeInBits() / 8;
15649 int Size = Mask.size();
15650 int Scale = NumBytes / Size;
15651
15652 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15653 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15654 V1InUse = false;
15655 V2InUse = false;
15656
15657 for (int i = 0; i < NumBytes; ++i) {
15658 int M = Mask[i / Scale];
15659 if (M < 0)
15660 continue;
15661
15662 const int ZeroMask = 0x80;
15663 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15664 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15665 if (Zeroable[i / Scale])
15666 V1Idx = V2Idx = ZeroMask;
15667
15668 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15669 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15670 V1InUse |= (ZeroMask != V1Idx);
15671 V2InUse |= (ZeroMask != V2Idx);
15672 }
15673
15674 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15675 if (V1InUse)
15676 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15677 DAG.getBuildVector(ShufVT, DL, V1Mask));
15678 if (V2InUse)
15679 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15680 DAG.getBuildVector(ShufVT, DL, V2Mask));
15681
15682 // If we need shuffled inputs from both, blend the two.
15683 SDValue V;
15684 if (V1InUse && V2InUse)
15685 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15686 else
15687 V = V1InUse ? V1 : V2;
15688
15689 // Cast the result back to the correct type.
15690 return DAG.getBitcast(VT, V);
15691}
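// Illustrative sketch (hypothetical mask values and helper name, plain ints
// instead of SDValues) of how the two PSHUFB byte-selector masks above are
// derived from a word-level shuffle mask. The 0x80 value mirrors ZeroMask:
// PSHUFB zeroes any byte whose selector has its top bit set.
inline void sketchPSHUFBByteMasks() {
  const int Size = 8;                // v8i16 -> 8 mask elements
  const int NumBytes = 16;           // 128 bits / 8
  const int Scale = NumBytes / Size; // 2 bytes per i16 element
  const int ZeroMask = 0x80;
  int Mask[Size] = {0, 9, 2, 11, 4, 13, 6, 15}; // hypothetical two-input blend
  int V1Mask[NumBytes], V2Mask[NumBytes];
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    // Elements < Size come from V1; the rest come from V2, rebased by Size.
    V1Mask[i] = M < Size ? M * Scale + i % Scale : ZeroMask;
    V2Mask[i] = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
  }
  // V1Mask == {0,1, 0x80,0x80, 4,5, 0x80,0x80, ...} and V2Mask selects the
  // complementary bytes, so ORing PSHUFB(V1) with PSHUFB(V2) forms the blend.
  (void)V1Mask;
  (void)V2Mask;
}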
15692
15693/// Generic lowering of 8-lane i16 shuffles.
15694///
15695/// This handles both single-input shuffles and combined shuffle/blends with
15696/// two inputs. The single input shuffles are immediately delegated to
15697/// a dedicated lowering routine.
15698///
15699/// The blends are lowered in one of three fundamental ways. If there are few
15700/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15701/// of the input is significantly cheaper when lowered as an interleaving of
15702/// the two inputs, try to interleave them. Otherwise, blend the low and high
15703/// halves of the inputs separately (making them have relatively few inputs)
15704/// and then concatenate them.
15705static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15706 const APInt &Zeroable, SDValue V1, SDValue V2,
15707 const X86Subtarget &Subtarget,
15708 SelectionDAG &DAG) {
15709 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15710 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15711 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15712
15713 // Whenever we can lower this as a zext, that instruction is strictly faster
15714 // than any alternative.
15715 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15716 Zeroable, Subtarget, DAG))
15717 return ZExt;
15718
15719 // Try to lower using a truncation.
15720 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15721 Subtarget, DAG))
15722 return V;
15723
15724 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15725
15726 if (NumV2Inputs == 0) {
15727 // Try to use shift instructions.
15728 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15729 Zeroable, Subtarget, DAG))
15730 return Shift;
15731
15732 // Check for being able to broadcast a single element.
15733 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15734 Mask, Subtarget, DAG))
15735 return Broadcast;
15736
15737 // Try to use bit rotation instructions.
15738 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15739 Subtarget, DAG))
15740 return Rotate;
15741
15742 // Use dedicated unpack instructions for masks that match their pattern.
15743 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15744 return V;
15745
15746 // Use dedicated pack instructions for masks that match their pattern.
15747 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15748 Subtarget))
15749 return V;
15750
15751 // Try to use byte rotation instructions.
15752 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15753 Subtarget, DAG))
15754 return Rotate;
15755
15756 // Make a copy of the mask so it can be modified.
15757 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15758 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15759 Subtarget, DAG);
15760 }
15761
15762 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15763 "All single-input shuffles should be canonicalized to be V1-input "
15764 "shuffles.");
15765
15766 // Try to use shift instructions.
15767 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15768 Zeroable, Subtarget, DAG))
15769 return Shift;
15770
15771 // See if we can use SSE4A Extraction / Insertion.
15772 if (Subtarget.hasSSE4A())
15773 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15774 Zeroable, DAG))
15775 return V;
15776
15777 // There are special ways we can lower some single-element blends.
15778 if (NumV2Inputs == 1)
15779 if (SDValue V = lowerShuffleAsElementInsertion(
15780 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15781 return V;
15782
15783 // We have different paths for blend lowering, but they all must use the
15784 // *exact* same predicate.
15785 bool IsBlendSupported = Subtarget.hasSSE41();
15786 if (IsBlendSupported)
15787 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15788 Zeroable, Subtarget, DAG))
15789 return Blend;
15790
15791 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15792 Zeroable, Subtarget, DAG))
15793 return Masked;
15794
15795 // Use dedicated unpack instructions for masks that match their pattern.
15796 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15797 return V;
15798
15799 // Use dedicated pack instructions for masks that match their pattern.
15800 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15801 Subtarget))
15802 return V;
15803
15804 // Try to lower using a truncation.
15805 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15806 Subtarget, DAG))
15807 return V;
15808
15809 // Try to use byte rotation instructions.
15810 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15811 Subtarget, DAG))
15812 return Rotate;
15813
15814 if (SDValue BitBlend =
15815 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15816 return BitBlend;
15817
15818 // Try to use byte shift instructions to mask.
15819 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15820 Zeroable, Subtarget, DAG))
15821 return V;
15822
15823 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15824 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15825 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15826 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
15827 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15828 !Subtarget.hasVLX()) {
15829 // Check if this is part of a 256-bit vector truncation.
15830 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
15831 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15832 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
15833 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
15834 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
15835 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
15836 DAG.getTargetConstant(0xEE, DL, MVT::i8));
15837 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
15838 V1 = extract128BitVector(V1V2, 0, DAG, DL);
15839 V2 = extract128BitVector(V1V2, 4, DAG, DL);
15840 } else {
15841 SmallVector<SDValue, 4> DWordClearOps(4,
15842 DAG.getConstant(0, DL, MVT::i32));
15843 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15844 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15845 SDValue DWordClearMask =
15846 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15847 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15848 DWordClearMask);
15849 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15850 DWordClearMask);
15851 }
15852 // Now pack things back together.
15853 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15854 if (NumEvenDrops == 2) {
15855 Result = DAG.getBitcast(MVT::v4i32, Result);
15856 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15857 }
15858 return Result;
15859 }
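// A rough worked example (hypothetical values) of the NumEvenDrops == 1 path
// above: ANDing each dword with 0x0000FFFF keeps only the even i16 element,
// so the unsigned-saturating PACKUSDW cannot overflow and simply compacts the
// survivors.
inline void sketchEvenDropCompaction() {
  unsigned V1[4] = {0x11112222, 0x33334444, 0x55556666, 0x77778888};
  unsigned V2[4] = {0x9999AAAA, 0xBBBBCCCC, 0xDDDDEEEE, 0xFFFF0000};
  unsigned short Packed[8];
  for (int i = 0; i != 4; ++i) {
    Packed[i] = (unsigned short)(V1[i] & 0xFFFF);     // low half from V1
    Packed[i + 4] = (unsigned short)(V2[i] & 0xFFFF); // high half from V2
  }
  // Packed == {0x2222,0x4444,0x6666,0x8888, 0xAAAA,0xCCCC,0xEEEE,0x0000},
  // i.e. the even elements of V1 followed by the even elements of V2.
  (void)Packed;
}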
15860
15861 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
15862 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
15863 if (NumOddDrops == 1) {
15864 bool HasSSE41 = Subtarget.hasSSE41();
15865 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15866 DAG.getBitcast(MVT::v4i32, V1),
15867 DAG.getTargetConstant(16, DL, MVT::i8));
15868 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15869 DAG.getBitcast(MVT::v4i32, V2),
15870 DAG.getTargetConstant(16, DL, MVT::i8));
15871 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
15872 MVT::v8i16, V1, V2);
15873 }
15874
15875 // Try to lower by permuting the inputs into an unpack instruction.
15876 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15877 Mask, Subtarget, DAG))
15878 return Unpack;
15879
15880 // If we can't directly blend but can use PSHUFB, that will be better as it
15881 // can both shuffle and set up the inefficient blend.
15882 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15883 bool V1InUse, V2InUse;
15884 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15885 Zeroable, DAG, V1InUse, V2InUse);
15886 }
15887
15888 // We can always bit-blend if we have to so the fallback strategy is to
15889 // decompose into single-input permutes and blends/unpacks.
15890 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15891 Mask, Subtarget, DAG);
15892}
15893
15894/// Lower 8-lane 16-bit floating point shuffles.
15895static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15896 const APInt &Zeroable, SDValue V1, SDValue V2,
15897 const X86Subtarget &Subtarget,
15898 SelectionDAG &DAG) {
15899 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15900 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15901 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15902 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15903
15904 if (NumV2Elements == 0) {
15905 // Check for being able to broadcast a single element.
15906 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15907 Mask, Subtarget, DAG))
15908 return Broadcast;
15909 }
15910 if (NumV2Elements == 1 && Mask[0] >= 8)
15911 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask,
15912 Zeroable, Subtarget, DAG))
15913 return V;
15914
15915 V1 = DAG.getBitcast(MVT::v8i16, V1);
15916 V2 = DAG.getBitcast(MVT::v8i16, V2);
15917 return DAG.getBitcast(MVT::v8f16,
15918 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15919}
15920
15921 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
15922 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
15923 // the active subvector is extracted.
15924static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15925 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15926 const X86Subtarget &Subtarget,
15927 SelectionDAG &DAG) {
15928 MVT MaskVT = VT.changeTypeToInteger();
15929 SDValue MaskNode;
15930 MVT ShuffleVT = VT;
15931 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15932 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15933 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15934 ShuffleVT = V1.getSimpleValueType();
15935
15936 // Adjust mask to correct indices for the second input.
15937 int NumElts = VT.getVectorNumElements();
15938 unsigned Scale = 512 / VT.getSizeInBits();
15939 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15940 for (int &M : AdjustedMask)
15941 if (NumElts <= M)
15942 M += (Scale - 1) * NumElts;
15943 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15944 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15945 } else {
15946 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15947 }
15948
15949 SDValue Result;
15950 if (V2.isUndef())
15951 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15952 else
15953 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15954
15955 if (VT != ShuffleVT)
15956 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15957
15958 return Result;
15959}
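// A minimal sketch (hypothetical sizes and helper name) of the mask rebasing
// above: when a v8i32 shuffle is widened to v16i32 for VPERMV3, indices that
// referred to the second input must be shifted so they still land in V2 after
// widening.
inline void sketchPermvMaskWidening() {
  const int NumElts = 8;       // original v8i32 element count
  const int Scale = 512 / 256; // widened by 2x
  int Mask[8] = {0, 9, 2, 11, 4, 13, 6, 15}; // indices >= NumElts read V2
  for (int &M : Mask)
    if (NumElts <= M)
      M += (Scale - 1) * NumElts; // 9 -> 17, 11 -> 19, ...
  // In the widened index space, V2 begins at element 16, so what used to be
  // "V2 element 1" (index 9) is now index 17, and so on.
  (void)Mask;
}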
15960
15961/// Generic lowering of v16i8 shuffles.
15962///
15963/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15964/// detect any complexity reducing interleaving. If that doesn't help, it uses
15965/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15966/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15967/// back together.
15968static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15969 const APInt &Zeroable, SDValue V1, SDValue V2,
15970 const X86Subtarget &Subtarget,
15971 SelectionDAG &DAG) {
15972 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15973 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15974 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15975
15976 // Try to use shift instructions.
15977 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15978 Zeroable, Subtarget, DAG))
15979 return Shift;
15980
15981 // Try to use byte rotation instructions.
15982 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15983 Subtarget, DAG))
15984 return Rotate;
15985
15986 // Use dedicated pack instructions for masks that match their pattern.
15987 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15988 Subtarget))
15989 return V;
15990
15991 // Try to use a zext lowering.
15992 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15993 Zeroable, Subtarget, DAG))
15994 return ZExt;
15995
15996 // Try to lower using a truncation.
15997 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15998 Subtarget, DAG))
15999 return V;
16000
16001 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16002 Subtarget, DAG))
16003 return V;
16004
16005 // See if we can use SSE4A Extraction / Insertion.
16006 if (Subtarget.hasSSE4A())
16007 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16008 Zeroable, DAG))
16009 return V;
16010
16011 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16012
16013 // For single-input shuffles, there are some nicer lowering tricks we can use.
16014 if (NumV2Elements == 0) {
16015 // Check for being able to broadcast a single element.
16016 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16017 Mask, Subtarget, DAG))
16018 return Broadcast;
16019
16020 // Try to use bit rotation instructions.
16021 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16022 Subtarget, DAG))
16023 return Rotate;
16024
16025 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16026 return V;
16027
16028 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16029 // Notably, this handles splat and partial-splat shuffles more efficiently.
16030 // However, it only makes sense if the pre-duplication shuffle simplifies
16031 // things significantly. Currently, this means we need to be able to
16032 // express the pre-duplication shuffle as an i16 shuffle.
16033 //
16034 // FIXME: We should check for other patterns which can be widened into an
16035 // i16 shuffle as well.
16036 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16037 for (int i = 0; i < 16; i += 2)
16038 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16039 return false;
16040
16041 return true;
16042 };
16043 auto tryToWidenViaDuplication = [&]() -> SDValue {
16044 if (!canWidenViaDuplication(Mask))
16045 return SDValue();
16046 SmallVector<int, 4> LoInputs;
16047 copy_if(Mask, std::back_inserter(LoInputs),
16048 [](int M) { return M >= 0 && M < 8; });
16049 array_pod_sort(LoInputs.begin(), LoInputs.end());
16050 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16051 LoInputs.end());
16052 SmallVector<int, 4> HiInputs;
16053 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16054 array_pod_sort(HiInputs.begin(), HiInputs.end());
16055 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16056 HiInputs.end());
16057
16058 bool TargetLo = LoInputs.size() >= HiInputs.size();
16059 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16060 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16061
16062 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16063 SmallDenseMap<int, int, 8> LaneMap;
16064 for (int I : InPlaceInputs) {
16065 PreDupI16Shuffle[I/2] = I/2;
16066 LaneMap[I] = I;
16067 }
16068 int j = TargetLo ? 0 : 4, je = j + 4;
16069 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16070 // Check if j is already a shuffle of this input. This happens when
16071 // there are two adjacent bytes after we move the low one.
16072 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16073 // If we haven't yet mapped the input, search for a slot into which
16074 // we can map it.
16075 while (j < je && PreDupI16Shuffle[j] >= 0)
16076 ++j;
16077
16078 if (j == je)
16079 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16080 return SDValue();
16081
16082 // Map this input with the i16 shuffle.
16083 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16084 }
16085
16086 // Update the lane map based on the mapping we ended up with.
16087 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16088 }
16089 V1 = DAG.getBitcast(
16090 MVT::v16i8,
16091 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16092 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16093
16094 // Unpack the bytes to form the i16s that will be shuffled into place.
16095 bool EvenInUse = false, OddInUse = false;
16096 for (int i = 0; i < 16; i += 2) {
16097 EvenInUse |= (Mask[i + 0] >= 0);
16098 OddInUse |= (Mask[i + 1] >= 0);
16099 if (EvenInUse && OddInUse)
16100 break;
16101 }
16102 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16103 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16104 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16105
16106 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16107 for (int i = 0; i < 16; ++i)
16108 if (Mask[i] >= 0) {
16109 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16110 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16111 if (PostDupI16Shuffle[i / 2] < 0)
16112 PostDupI16Shuffle[i / 2] = MappedMask;
16113 else
16114 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16115 "Conflicting entries in the original shuffle!");
16116 }
16117 return DAG.getBitcast(
16118 MVT::v16i8,
16119 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16120 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16121 };
16122 if (SDValue V = tryToWidenViaDuplication())
16123 return V;
16124 }
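// Small standalone sketch (hypothetical helper) of the canWidenViaDuplication
// test used above: a v16i8 mask can be widened to a v8i16 shuffle only when
// every byte pair wants the same source byte (or is undef).
inline bool sketchCanWidenViaDuplication(const int (&Mask)[16]) {
  for (int i = 0; i < 16; i += 2)
    if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
      return false; // the pair needs two different bytes, so duplication fails
  return true;
}
// For example, a byte splat {3,3,3,3,...} passes, while an interleave such as
// {0,8,1,9,...} fails because each pair mixes two different source bytes.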
16125
16126 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16127 Zeroable, Subtarget, DAG))
16128 return Masked;
16129
16130 // Use dedicated unpack instructions for masks that match their pattern.
16131 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16132 return V;
16133
16134 // Try to use byte shift instructions to mask.
16135 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16136 Zeroable, Subtarget, DAG))
16137 return V;
16138
16139 // Check for compaction patterns.
16140 bool IsSingleInput = V2.isUndef();
16141 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16142
16143 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16144 // with PSHUFB. It is important to do this before we attempt to generate any
16145 // blends but after all of the single-input lowerings. If the single input
16146 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16147 // want to preserve that and we can DAG combine any longer sequences into
16148 // a PSHUFB in the end. But once we start blending from multiple inputs,
16149 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16150 // and there are *very* few patterns that would actually be faster than the
16151 // PSHUFB approach because of its ability to zero lanes.
16152 //
16153 // If the mask is a binary compaction, we can more efficiently perform this
16154 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16155 //
16156 // FIXME: The only exceptions to the above are blends which are exact
16157 // interleavings with direct instructions supporting them. We currently don't
16158 // handle those well here.
16159 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16160 bool V1InUse = false;
16161 bool V2InUse = false;
16162
16163 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16164 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16165
16166 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16167 // do so. This avoids using them to handle blends-with-zero which is
16168 // important as a single pshufb is significantly faster for that.
16169 if (V1InUse && V2InUse) {
16170 if (Subtarget.hasSSE41())
16171 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16172 Zeroable, Subtarget, DAG))
16173 return Blend;
16174
16175 // We can use an unpack to do the blending rather than an or in some
16176 // cases. Even though the or may be (very minorly) more efficient, we
16177 // prefer this lowering because there are common cases where part of
16178 // the complexity of the shuffles goes away when we do the final blend as
16179 // an unpack.
16180 // FIXME: It might be worth trying to detect if the unpack-feeding
16181 // shuffles will both be pshufb, in which case we shouldn't bother with
16182 // this.
16183 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16184 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16185 return Unpack;
16186
16187 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16188 if (Subtarget.hasVBMI())
16189 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16190 DAG);
16191
16192 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16193 if (Subtarget.hasXOP()) {
16194 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16195 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16196 }
16197
16198 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16199 // PALIGNR will be cheaper than the second PSHUFB+OR.
16200 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16201 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16202 return V;
16203 }
16204
16205 return PSHUFB;
16206 }
16207
16208 // There are special ways we can lower some single-element blends.
16209 if (NumV2Elements == 1)
16210 if (SDValue V = lowerShuffleAsElementInsertion(
16211 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16212 return V;
16213
16214 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16215 return Blend;
16216
16217 // Check whether a compaction lowering can be done. This handles shuffles
16218 // which take every Nth element for some even N. See the helper function for
16219 // details.
16220 //
16221 // We special case these as they can be particularly efficiently handled with
16222 // the PACKUSWB instruction on x86 and they show up in common patterns of
16223 // rearranging bytes to truncate wide elements.
16224 if (NumEvenDrops) {
16225 // NumEvenDrops is the power of two stride of the elements. Another way of
16226 // thinking about it is that we need to drop the even elements this many
16227 // times to get the original input.
16228
16229 // First we need to zero all the dropped bytes.
16230 assert(NumEvenDrops <= 3 &&
16231 "No support for dropping even elements more than 3 times.");
16232 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16233 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16234 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16235 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16236 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16237 WordClearMask);
16238 if (!IsSingleInput)
16239 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16240 WordClearMask);
16241
16242 // Now pack things back together.
16243 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16244 IsSingleInput ? V1 : V2);
16245 for (int i = 1; i < NumEvenDrops; ++i) {
16246 Result = DAG.getBitcast(MVT::v8i16, Result);
16247 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16248 }
16249 return Result;
16250 }
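// A rough illustration (hypothetical drop count) of the repeated-pack loop
// above: NumEvenDrops is the log2 of the element stride, so keeping every 4th
// byte (NumEvenDrops == 2) needs the word-clear AND plus two PACKUS rounds.
inline void sketchRepeatedPack() {
  const int NumEvenDrops = 2;
  unsigned short WordClearOps[8] = {0};
  // Mark which i16 lanes keep their low byte before the first pack: one lane
  // out of every (1 << (NumEvenDrops - 1)) == 2.
  for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
    WordClearOps[i] = 0xFF;
  // The first PACKUS packs the low byte of every word (the masked vector's
  // even bytes); packing that intermediate result again leaves bytes
  // 0, 4, 8, 12 of the original vector in the low lanes.
  (void)WordClearOps;
}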
16251
16252 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16253 if (NumOddDrops == 1) {
16254 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16255 DAG.getBitcast(MVT::v8i16, V1),
16256 DAG.getTargetConstant(8, DL, MVT::i8));
16257 if (!IsSingleInput)
16258 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16259 DAG.getBitcast(MVT::v8i16, V2),
16260 DAG.getTargetConstant(8, DL, MVT::i8));
16261 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16262 IsSingleInput ? V1 : V2);
16263 }
16264
16265 // Handle multi-input cases by blending/unpacking single-input shuffles.
16266 if (NumV2Elements > 0)
16267 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16268 Subtarget, DAG);
16269
16270 // The fallback path for single-input shuffles widens this into two v8i16
16271 // vectors with unpacks, shuffles those, and then pulls them back together
16272 // with a pack.
16273 SDValue V = V1;
16274
16275 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16276 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16277 for (int i = 0; i < 16; ++i)
16278 if (Mask[i] >= 0)
16279 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16280
16281 SDValue VLoHalf, VHiHalf;
16282 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16283 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16284 // i16s.
16285 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16286 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16287 // Use a mask to drop the high bytes.
16288 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16289 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16290 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16291
16292 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16293 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16294
16295 // Squash the masks to point directly into VLoHalf.
16296 for (int &M : LoBlendMask)
16297 if (M >= 0)
16298 M /= 2;
16299 for (int &M : HiBlendMask)
16300 if (M >= 0)
16301 M /= 2;
16302 } else {
16303 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16304 // VHiHalf so that we can blend them as i16s.
16305 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16306
16307 VLoHalf = DAG.getBitcast(
16308 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16309 VHiHalf = DAG.getBitcast(
16310 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16311 }
16312
16313 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16314 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16315
16316 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16317}
16318
16319/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16320///
16321/// This routine breaks down the specific type of 128-bit shuffle and
16322/// dispatches to the lowering routines accordingly.
16323static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16324 MVT VT, SDValue V1, SDValue V2,
16325 const APInt &Zeroable,
16326 const X86Subtarget &Subtarget,
16327 SelectionDAG &DAG) {
16328 switch (VT.SimpleTy) {
16329 case MVT::v2i64:
16330 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16331 case MVT::v2f64:
16332 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16333 case MVT::v4i32:
16334 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16335 case MVT::v4f32:
16336 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16337 case MVT::v8i16:
16338 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16339 case MVT::v8f16:
16340 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16341 case MVT::v16i8:
16342 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16343
16344 default:
16345 llvm_unreachable("Unimplemented!");
16346 }
16347}
16348
16349/// Generic routine to split vector shuffle into half-sized shuffles.
16350///
16351/// This routine just extracts two subvectors, shuffles them independently, and
16352/// then concatenates them back together. This should work effectively with all
16353/// AVX vector shuffle types.
16354static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16355 SDValue V2, ArrayRef<int> Mask,
16356 SelectionDAG &DAG) {
16357 assert(VT.getSizeInBits() >= 256 &&
16358 "Only for 256-bit or wider vector shuffles!");
16359 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16360 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16361
16362 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16363 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16364
16365 int NumElements = VT.getVectorNumElements();
16366 int SplitNumElements = NumElements / 2;
16367 MVT ScalarVT = VT.getVectorElementType();
16368 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16369
16370 // Use splitVector/extractSubVector so that split build-vectors just build two
16371 // narrower build vectors. This helps shuffling with splats and zeros.
16372 auto SplitVector = [&](SDValue V) {
16373 SDValue LoV, HiV;
16374 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16375 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16376 DAG.getBitcast(SplitVT, HiV));
16377 };
16378
16379 SDValue LoV1, HiV1, LoV2, HiV2;
16380 std::tie(LoV1, HiV1) = SplitVector(V1);
16381 std::tie(LoV2, HiV2) = SplitVector(V2);
16382
16383 // Now create two 4-way blends of these half-width vectors.
16384 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16385 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16386 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16387 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16388 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16389 for (int i = 0; i < SplitNumElements; ++i) {
16390 int M = HalfMask[i];
16391 if (M >= NumElements) {
16392 if (M >= NumElements + SplitNumElements)
16393 UseHiV2 = true;
16394 else
16395 UseLoV2 = true;
16396 V2BlendMask[i] = M - NumElements;
16397 BlendMask[i] = SplitNumElements + i;
16398 } else if (M >= 0) {
16399 if (M >= SplitNumElements)
16400 UseHiV1 = true;
16401 else
16402 UseLoV1 = true;
16403 V1BlendMask[i] = M;
16404 BlendMask[i] = i;
16405 }
16406 }
16407
16408 // Because the lowering happens after all combining takes place, we need to
16409 // manually combine these blend masks as much as possible so that we create
16410 // a minimal number of high-level vector shuffle nodes.
16411
16412 // First try just blending the halves of V1 or V2.
16413 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16414 return DAG.getUNDEF(SplitVT);
16415 if (!UseLoV2 && !UseHiV2)
16416 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16417 if (!UseLoV1 && !UseHiV1)
16418 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16419
16420 SDValue V1Blend, V2Blend;
16421 if (UseLoV1 && UseHiV1) {
16422 V1Blend =
16423 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16424 } else {
16425 // We only use half of V1 so map the usage down into the final blend mask.
16426 V1Blend = UseLoV1 ? LoV1 : HiV1;
16427 for (int i = 0; i < SplitNumElements; ++i)
16428 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16429 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16430 }
16431 if (UseLoV2 && UseHiV2) {
16432 V2Blend =
16433 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16434 } else {
16435 // We only use half of V2 so map the usage down into the final blend mask.
16436 V2Blend = UseLoV2 ? LoV2 : HiV2;
16437 for (int i = 0; i < SplitNumElements; ++i)
16438 if (BlendMask[i] >= SplitNumElements)
16439 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16440 }
16441 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16442 };
16443 SDValue Lo = HalfBlend(LoMask);
16444 SDValue Hi = HalfBlend(HiMask);
16445 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16446}
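// A rough standalone sketch (hypothetical 8-element mask) of the half-blend
// bookkeeping above: each half mask is rewritten against the split operands,
// and BlendMask records which per-operand blend feeds each output element.
inline void sketchHalfBlendMasks() {
  const int NumElements = 8, SplitNumElements = 4;
  int HalfMask[4] = {1, 10, 5, 12}; // low-half mask of a two-input shuffle
  int V1BlendMask[4] = {-1, -1, -1, -1};
  int V2BlendMask[4] = {-1, -1, -1, -1};
  int BlendMask[4] = {-1, -1, -1, -1};
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      V2BlendMask[i] = M - NumElements;    // index into V2's half pair
      BlendMask[i] = SplitNumElements + i; // take lane i of the V2 blend
    } else if (M >= 0) {
      V1BlendMask[i] = M;                  // index into V1's half pair
      BlendMask[i] = i;                    // take lane i of the V1 blend
    }
  }
  // V1BlendMask == {1,-1,5,-1}, V2BlendMask == {-1,2,-1,4}, and
  // BlendMask == {0,5,2,7}: shuffle each operand's halves, then blend them.
  (void)V1BlendMask; (void)V2BlendMask; (void)BlendMask;
}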
16447
16448/// Either split a vector in halves or decompose the shuffles and the
16449/// blend/unpack.
16450///
16451/// This is provided as a good fallback for many lowerings of non-single-input
16452/// shuffles with more than one 128-bit lane. In those cases, we want to select
16453/// between splitting the shuffle into 128-bit components and stitching those
16454/// back together vs. extracting the single-input shuffles and blending those
16455/// results.
16456static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16457 SDValue V2, ArrayRef<int> Mask,
16458 const X86Subtarget &Subtarget,
16459 SelectionDAG &DAG) {
16460 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16461 "shuffles as it could then recurse on itself.");
16462 int Size = Mask.size();
16463
16464 // If this can be modeled as a broadcast of two elements followed by a blend,
16465 // prefer that lowering. This is especially important because broadcasts can
16466 // often fold with memory operands.
16467 auto DoBothBroadcast = [&] {
16468 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16469 for (int M : Mask)
16470 if (M >= Size) {
16471 if (V2BroadcastIdx < 0)
16472 V2BroadcastIdx = M - Size;
16473 else if (M - Size != V2BroadcastIdx)
16474 return false;
16475 } else if (M >= 0) {
16476 if (V1BroadcastIdx < 0)
16477 V1BroadcastIdx = M;
16478 else if (M != V1BroadcastIdx)
16479 return false;
16480 }
16481 return true;
16482 };
16483 if (DoBothBroadcast())
16484 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16485 DAG);
16486
16487 // If the inputs all stem from a single 128-bit lane of each input, then we
16488 // split them rather than blending because the split will decompose to
16489 // unusually few instructions.
16490 int LaneCount = VT.getSizeInBits() / 128;
16491 int LaneSize = Size / LaneCount;
16492 SmallBitVector LaneInputs[2];
16493 LaneInputs[0].resize(LaneCount, false);
16494 LaneInputs[1].resize(LaneCount, false);
16495 for (int i = 0; i < Size; ++i)
16496 if (Mask[i] >= 0)
16497 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
16498 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16499 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16500
16501 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16502 // requires that the decomposed single-input shuffles don't end up here.
16503 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16504 DAG);
16505}
16506
16507// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16508// TODO: Extend to support v8f32 (+ 512-bit shuffles).
16509static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16510 SDValue V1, SDValue V2,
16511 ArrayRef<int> Mask,
16512 SelectionDAG &DAG) {
16513 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
16514
16515 int LHSMask[4] = {-1, -1, -1, -1};
16516 int RHSMask[4] = {-1, -1, -1, -1};
16517 unsigned SHUFPMask = 0;
16518
16519 // As SHUFPD uses a single LHS/RHS element per lane, we can always
16520 // perform the shuffle once the lanes have been shuffled in place.
16521 for (int i = 0; i != 4; ++i) {
16522 int M = Mask[i];
16523 if (M < 0)
16524 continue;
16525 int LaneBase = i & ~1;
16526 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16527 LaneMask[LaneBase + (M & 1)] = M;
16528 SHUFPMask |= (M & 1) << i;
16529 }
16530
16531 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16532 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16533 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16534 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16535}
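// A minimal sketch (hypothetical v4f64 mask) of the SHUFPD immediate built
// above: bit i selects the odd (1) or even (0) element of whichever shuffled
// lane feeds output element i.
inline void sketchShufpdImmediate() {
  int Mask[4] = {1, 6, 3, 4}; // hypothetical two-input v4f64 shuffle mask
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;                       // 0 for lanes {0,1}, 2 for {2,3}
    int *LaneMask = (i & 1) ? RHSMask : LHSMask; // even outputs read LHS
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (unsigned)(M & 1) << i;
  }
  // LHSMask == {-1,1,-1,3}, RHSMask == {6,-1,4,-1}, SHUFPMask == 0b0101:
  // two lane shuffles build LHS/RHS, then SHUFPD selects per lane.
  (void)SHUFPMask;
}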
16536
16537/// Lower a vector shuffle crossing multiple 128-bit lanes as
16538/// a lane permutation followed by a per-lane permutation.
16539///
16540/// This is mainly for cases where we can have non-repeating permutes
16541/// in each lane.
16542///
16543/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
16544/// we should investigate merging them.
16545static SDValue lowerShuffleAsLanePermuteAndPermute(
16546 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16547 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16548 int NumElts = VT.getVectorNumElements();
16549 int NumLanes = VT.getSizeInBits() / 128;
16550 int NumEltsPerLane = NumElts / NumLanes;
16551 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
16552
16553 /// Attempts to find a sublane permute with the given size
16554 /// that gets all elements into their target lanes.
16555 ///
16556 /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffled result.
16557 /// If unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
16558 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
16559 int NumSublanesPerLane = NumSublanes / NumLanes;
16560 int NumEltsPerSublane = NumElts / NumSublanes;
16561
16562 SmallVector<int, 16> CrossLaneMask;
16563 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
16564 // CrossLaneMask but one entry == one sublane.
16565 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
16566
16567 for (int i = 0; i != NumElts; ++i) {
16568 int M = Mask[i];
16569 if (M < 0)
16570 continue;
16571
16572 int SrcSublane = M / NumEltsPerSublane;
16573 int DstLane = i / NumEltsPerLane;
16574
16575 // We only need to get the elements into the right lane, not sublane.
16576 // So search all sublanes that make up the destination lane.
16577 bool Found = false;
16578 int DstSubStart = DstLane * NumSublanesPerLane;
16579 int DstSubEnd = DstSubStart + NumSublanesPerLane;
16580 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
16581 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16582 continue;
16583
16584 Found = true;
16585 CrossLaneMaskLarge[DstSublane] = SrcSublane;
16586 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16587 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16588 break;
16589 }
16590 if (!Found)
16591 return SDValue();
16592 }
16593
16594 // Fill CrossLaneMask using CrossLaneMaskLarge.
16595 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
16596
16597 if (!CanUseSublanes) {
16598 // If we're only shuffling a single lowest lane and the rest are identity
16599 // then don't bother.
16600 // TODO - isShuffleMaskInputInPlace could be extended to something like
16601 // this.
16602 int NumIdentityLanes = 0;
16603 bool OnlyShuffleLowestLane = true;
16604 for (int i = 0; i != NumLanes; ++i) {
16605 int LaneOffset = i * NumEltsPerLane;
16606 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
16607 i * NumEltsPerLane))
16608 NumIdentityLanes++;
16609 else if (CrossLaneMask[LaneOffset] != 0)
16610 OnlyShuffleLowestLane = false;
16611 }
16612 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16613 return SDValue();
16614 }
16615
16616 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16617 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16618 InLaneMask);
16619 };
16620
16621 // First attempt a solution with full lanes.
16622 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16623 return V;
16624
16625 // The rest of the solutions use sublanes.
16626 if (!CanUseSublanes)
16627 return SDValue();
16628
16629 // Then attempt a solution with 64-bit sublanes (vpermq).
16630 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16631 return V;
16632
16633 // If that doesn't work and we have fast variable cross-lane shuffle,
16634 // attempt 32-bit sublanes (vpermd).
16635 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16636 return SDValue();
16637
16638 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16639}
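// A rough worked example (hypothetical v8i32 single-input mask) of the
// two-step decomposition above. For Mask == {6,7,4,5, 1,0,3,2}, destination
// lane 0 only wants source lane 1 and destination lane 1 only wants source
// lane 0, so CrossLaneMaskLarge == {1, 0}; the whole-lane permute produces
// {e4,e5,e6,e7, e0,e1,e2,e3}, and InLaneMask == {2,3,0,1, 5,4,7,6} then
// reorders within each lane so that the composition of the two shuffles
// reproduces the original mask.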
16640
16641/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16642/// source with a lane permutation.
16643///
16644/// This lowering strategy results in four instructions in the worst case for a
16645/// single-input cross lane shuffle which is lower than any other fully general
16646/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16647/// shuffle pattern should be handled prior to trying this lowering.
16648static SDValue lowerShuffleAsLanePermuteAndShuffle(
16649 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16650 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16651 // FIXME: This should probably be generalized for 512-bit vectors as well.
16652 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16653 int Size = Mask.size();
16654 int LaneSize = Size / 2;
16655
16656 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16657 // Only do this if the elements aren't all from the lower lane,
16658 // otherwise we're (probably) better off doing a split.
16659 if (VT == MVT::v4f64 &&
16660 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16661 if (SDValue V =
16662 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16663 return V;
16664
16665 // If there are only inputs from one 128-bit lane, splitting will in fact be
16666 // less expensive. The flags track whether the given lane contains an element
16667 // that crosses to another lane.
16668 bool AllLanes;
16669 if (!Subtarget.hasAVX2()) {
16670 bool LaneCrossing[2] = {false, false};
16671 for (int i = 0; i < Size; ++i)
16672 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16673 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16674 AllLanes = LaneCrossing[0] && LaneCrossing[1];
16675 } else {
16676 bool LaneUsed[2] = {false, false};
16677 for (int i = 0; i < Size; ++i)
16678 if (Mask[i] >= 0)
16679 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16680 AllLanes = LaneUsed[0] && LaneUsed[1];
16681 }
16682
16683 // TODO - we could support shuffling V2 in the Flipped input.
16684 assert(V2.isUndef() &&
16685 "This last part of this routine only works on single input shuffles");
16686
16687 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16688 for (int i = 0; i < Size; ++i) {
16689 int &M = InLaneMask[i];
16690 if (M < 0)
16691 continue;
16692 if (((M % Size) / LaneSize) != (i / LaneSize))
16693 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16694 }
16695 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16696 "In-lane shuffle mask expected");
16697
16698 // If we aren't using both lanes and the in-lane mask is not
16699 // repeating, then we're better off splitting.
16700 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
16701 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16702
16703 // Flip the lanes, and shuffle the results which should now be in-lane.
16704 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16705 SDValue Flipped = DAG.getBitcast(PVT, V1);
16706 Flipped =
16707 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16708 Flipped = DAG.getBitcast(VT, Flipped);
16709 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16710}
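// A small worked example (hypothetical indices, illustrating only the
// InLaneMask arithmetic above) for a 4-element, 2-lane mask {2, 0, 1, 3}:
// elements 0 and 2 cross lanes, so they are redirected to the lane-swapped
// "Flipped" operand (indices offset by Size == 4), giving
// InLaneMask == {4, 0, 7, 3}. Index 4 reads Flipped element 0, which holds
// original element 2, and index 7 reads Flipped element 3, which holds
// original element 1, so the final shuffle never crosses a lane.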
16711
16712/// Handle lowering 2-lane 128-bit shuffles.
16713static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16714 SDValue V2, ArrayRef<int> Mask,
16715 const APInt &Zeroable,
16716 const X86Subtarget &Subtarget,
16717 SelectionDAG &DAG) {
16718 if (V2.isUndef()) {
16719 // Attempt to match VBROADCAST*128 subvector broadcast load.
16720 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16721 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16722 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16723 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
16724 MVT MemVT = VT.getHalfNumVectorElementsVT();
16725 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16726 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16727 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
16728 VT, MemVT, Ld, Ofs, DAG))
16729 return BcstLd;
16730 }
16731
16732 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16733 if (Subtarget.hasAVX2())
16734 return SDValue();
16735 }
16736
16737 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16738
16739 SmallVector<int, 4> WidenedMask;
16740 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16741 return SDValue();
16742
16743 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16744 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16745
16746 // Try to use an insert into a zero vector.
16747 if (WidenedMask[0] == 0 && IsHighZero) {
16748 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16749 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16750 DAG.getIntPtrConstant(0, DL));
16751 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16752 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16753 DAG.getIntPtrConstant(0, DL));
16754 }
16755
16756 // TODO: If minimizing size and one of the inputs is a zero vector and the
16757 // zero vector has only one use, we could use a VPERM2X128 to save the
16758 // instruction bytes needed to explicitly generate the zero vector.
16759
16760 // Blends are faster and handle all the non-lane-crossing cases.
16761 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16762 Subtarget, DAG))
16763 return Blend;
16764
16765 // If either input operand is a zero vector, use VPERM2X128 because its mask
16766 // allows us to replace the zero input with an implicit zero.
16767 if (!IsLowZero && !IsHighZero) {
16768 // Check for patterns which can be matched with a single insert of a 128-bit
16769 // subvector.
16770 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16771 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16772
16773 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16774 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16775 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16776 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16777 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16778 OnlyUsesV1 ? V1 : V2,
16779 DAG.getIntPtrConstant(0, DL));
16780 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16781 DAG.getIntPtrConstant(2, DL));
16782 }
16783 }
16784
16785 // Try to use SHUF128 if possible.
16786 if (Subtarget.hasVLX()) {
16787 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16788 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16789 ((WidenedMask[1] % 2) << 1);
16790 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16791 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16792 }
16793 }
16794 }
16795
16796 // Otherwise form a 128-bit permutation. After accounting for undefs,
16797 // convert the 64-bit shuffle mask selection values into 128-bit
16798 // selection bits by dividing the indexes by 2 and shifting into positions
16799 // defined by a vperm2*128 instruction's immediate control byte.
16800
16801 // The immediate permute control byte looks like this:
16802 // [1:0] - select 128 bits from sources for low half of destination
16803 // [2] - ignore
16804 // [3] - zero low half of destination
16805 // [5:4] - select 128 bits from sources for high half of destination
16806 // [6] - ignore
16807 // [7] - zero high half of destination
16808
16809 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16810        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16811
16812 unsigned PermMask = 0;
16813 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16814 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16815
16816 // Check the immediate mask and replace unused sources with undef.
16817 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16818 V1 = DAG.getUNDEF(VT);
16819 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16820 V2 = DAG.getUNDEF(VT);
16821
16822 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16823 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16824}
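For reference, a small standalone sketch of the VPERM2X128 immediate construction at lines 16812-16814, following the control-byte layout documented above (the helper name and example values are illustrative, not from the LLVM sources):

#include <cassert>
#include <cstdio>

// WidenedMask entries select one of four 128-bit halves (0/1 from V1,
// 2/3 from V2); the zero flags set immediate bits 3 and 7.
unsigned vperm2x128Imm(int WidenedMask0, int WidenedMask1,
                       bool IsLowZero, bool IsHighZero) {
  assert((WidenedMask0 >= 0 || IsLowZero) &&
         (WidenedMask1 >= 0 || IsHighZero) && "Undef half?");
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (unsigned)(WidenedMask0 << 0);
  PermMask |= IsHighZero ? 0x80 : (unsigned)(WidenedMask1 << 4);
  return PermMask;
}

int main() {
  // <2, 3, 4, 5>: low half of the result = high half of V1 (1),
  // high half of the result = low half of V2 (2) -> imm 0x21.
  std::printf("0x%02x\n", vperm2x128Imm(1, 2, false, false));
  // Low half zeroed, high half = low half of V1 -> imm 0x08.
  std::printf("0x%02x\n", vperm2x128Imm(-1, 0, true, false));
  return 0;
}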
16825
16826/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16827/// shuffling each lane.
16828///
16829/// This attempts to create a repeated lane shuffle where each lane uses one
16830/// or two of the lanes of the inputs. The lanes of the input vectors are
16831/// shuffled in one or two independent shuffles to get the lanes into the
16832/// position needed by the final shuffle.
16833static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16834 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16835 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16836 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16837
16838 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16839 return SDValue();
16840
16841 int NumElts = Mask.size();
16842 int NumLanes = VT.getSizeInBits() / 128;
16843 int NumLaneElts = 128 / VT.getScalarSizeInBits();
16844 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16845 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16846
16847 // First pass will try to fill in the RepeatMask from lanes that need two
16848 // sources.
16849 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16850 int Srcs[2] = {-1, -1};
16851 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16852 for (int i = 0; i != NumLaneElts; ++i) {
16853 int M = Mask[(Lane * NumLaneElts) + i];
16854 if (M < 0)
16855 continue;
16856 // Determine which of the possible input lanes (NumLanes from each source)
16857 // this element comes from. Assign that as one of the sources for this
16858 // lane. We can assign up to 2 sources for this lane. If we run out of
16859 // sources we can't do anything.
16860 int LaneSrc = M / NumLaneElts;
16861 int Src;
16862 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16863 Src = 0;
16864 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16865 Src = 1;
16866 else
16867 return SDValue();
16868
16869 Srcs[Src] = LaneSrc;
16870 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16871 }
16872
16873 // If this lane has two sources, see if it fits with the repeat mask so far.
16874 if (Srcs[1] < 0)
16875 continue;
16876
16877 LaneSrcs[Lane][0] = Srcs[0];
16878 LaneSrcs[Lane][1] = Srcs[1];
16879
16880 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16881 assert(M1.size() == M2.size() && "Unexpected mask size");
16882 for (int i = 0, e = M1.size(); i != e; ++i)
16883 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16884 return false;
16885 return true;
16886 };
16887
16888 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16889 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16890 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16891 int M = Mask[i];
16892 if (M < 0)
16893 continue;
16894 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16895        "Unexpected mask element");
16896 MergedMask[i] = M;
16897 }
16898 };
16899
16900 if (MatchMasks(InLaneMask, RepeatMask)) {
16901 // Merge this lane mask into the final repeat mask.
16902 MergeMasks(InLaneMask, RepeatMask);
16903 continue;
16904 }
16905
16906 // Didn't find a match. Swap the operands and try again.
16907 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16908 ShuffleVectorSDNode::commuteMask(InLaneMask);
16909
16910 if (MatchMasks(InLaneMask, RepeatMask)) {
16911 // Merge this lane mask into the final repeat mask.
16912 MergeMasks(InLaneMask, RepeatMask);
16913 continue;
16914 }
16915
16916 // Couldn't find a match with the operands in either order.
16917 return SDValue();
16918 }
16919
16920 // Now handle any lanes with only one source.
16921 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16922 // If this lane has already been processed, skip it.
16923 if (LaneSrcs[Lane][0] >= 0)
16924 continue;
16925
16926 for (int i = 0; i != NumLaneElts; ++i) {
16927 int M = Mask[(Lane * NumLaneElts) + i];
16928 if (M < 0)
16929 continue;
16930
16931 // If RepeatMask isn't defined yet we can define it ourselves.
16932 if (RepeatMask[i] < 0)
16933 RepeatMask[i] = M % NumLaneElts;
16934
16935 if (RepeatMask[i] < NumElts) {
16936 if (RepeatMask[i] != M % NumLaneElts)
16937 return SDValue();
16938 LaneSrcs[Lane][0] = M / NumLaneElts;
16939 } else {
16940 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16941 return SDValue();
16942 LaneSrcs[Lane][1] = M / NumLaneElts;
16943 }
16944 }
16945
16946 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16947 return SDValue();
16948 }
16949
16950 SmallVector<int, 16> NewMask(NumElts, -1);
16951 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16952 int Src = LaneSrcs[Lane][0];
16953 for (int i = 0; i != NumLaneElts; ++i) {
16954 int M = -1;
16955 if (Src >= 0)
16956 M = Src * NumLaneElts + i;
16957 NewMask[Lane * NumLaneElts + i] = M;
16958 }
16959 }
16960 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16961 // Ensure we didn't get back the shuffle we started with.
16962 // FIXME: This is a hack to make up for some splat handling code in
16963 // getVectorShuffle.
16964 if (isa<ShuffleVectorSDNode>(NewV1) &&
16965 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16966 return SDValue();
16967
16968 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16969 int Src = LaneSrcs[Lane][1];
16970 for (int i = 0; i != NumLaneElts; ++i) {
16971 int M = -1;
16972 if (Src >= 0)
16973 M = Src * NumLaneElts + i;
16974 NewMask[Lane * NumLaneElts + i] = M;
16975 }
16976 }
16977 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16978 // Ensure we didn't get back the shuffle we started with.
16979 // FIXME: This is a hack to make up for some splat handling code in
16980 // getVectorShuffle.
16981 if (isa<ShuffleVectorSDNode>(NewV2) &&
16982 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16983 return SDValue();
16984
16985 for (int i = 0; i != NumElts; ++i) {
16986 NewMask[i] = RepeatMask[i % NumLaneElts];
16987 if (NewMask[i] < 0)
16988 continue;
16989
16990 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16991 }
16992 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16993}
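A minimal standalone sketch of the first-pass source assignment above (lines 16849-16871): for one 128-bit destination lane, at most two source lanes are recorded and the mask is rewritten relative to those sources (hypothetical values, not the LLVM helper itself).

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8, NumLaneElts = 4;   // v8f32: two 128-bit lanes
  // Destination lane 0 pulls from the high lane of V1 (elts 4-7) and the
  // high lane of V2 (elts 12-15); destination lane 1 is left undef here.
  std::vector<int> Mask = {4, 12, 5, 13, -1, -1, -1, -1};
  int Srcs[2] = {-1, -1};
  std::vector<int> InLaneMask(NumLaneElts, -1);
  for (int i = 0; i != NumLaneElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneSrc = M / NumLaneElts;          // which of the 4 source lanes M is in
    int Src;
    if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
      Src = 0;
    else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
      Src = 1;
    else {
      std::puts("more than two source lanes - bail");
      return 0;
    }
    Srcs[Src] = LaneSrc;
    InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
  }
  std::printf("Srcs = {%d, %d}\n", Srcs[0], Srcs[1]);  // {1, 3}: V1 high, V2 high
  for (int M : InLaneMask)
    std::printf("%d ", M);                  // 0 8 1 9 - an unpcklps-style repeat mask
  std::printf("\n");
  return 0;
}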
16994
16995/// If the input shuffle mask results in a vector that is undefined in all upper
16996/// or lower half elements and that mask accesses only 2 halves of the
16997/// shuffle's operands, return true. A mask of half the width with mask indexes
16998/// adjusted to access the extracted halves of the original shuffle operands is
16999/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17000/// lower half of each input operand is accessed.
17001static bool
17002getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17003 int &HalfIdx1, int &HalfIdx2) {
17004 assert((Mask.size() == HalfMask.size() * 2) &&
17005        "Expected input mask to be twice as long as output");
17006
17007 // Exactly one half of the result must be undef to allow narrowing.
17008 bool UndefLower = isUndefLowerHalf(Mask);
17009 bool UndefUpper = isUndefUpperHalf(Mask);
17010 if (UndefLower == UndefUpper)
17011 return false;
17012
17013 unsigned HalfNumElts = HalfMask.size();
17014 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17015 HalfIdx1 = -1;
17016 HalfIdx2 = -1;
17017 for (unsigned i = 0; i != HalfNumElts; ++i) {
17018 int M = Mask[i + MaskIndexOffset];
17019 if (M < 0) {
17020 HalfMask[i] = M;
17021 continue;
17022 }
17023
17024 // Determine which of the 4 half vectors this element is from.
17025 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17026 int HalfIdx = M / HalfNumElts;
17027
17028 // Determine the element index into its half vector source.
17029 int HalfElt = M % HalfNumElts;
17030
17031 // We can shuffle with up to 2 half vectors, set the new 'half'
17032 // shuffle mask accordingly.
17033 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17034 HalfMask[i] = HalfElt;
17035 HalfIdx1 = HalfIdx;
17036 continue;
17037 }
17038 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17039 HalfMask[i] = HalfElt + HalfNumElts;
17040 HalfIdx2 = HalfIdx;
17041 continue;
17042 }
17043
17044 // Too many half vectors referenced.
17045 return false;
17046 }
17047
17048 return true;
17049}
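A standalone sketch of the index arithmetic in getHalfShuffleMask, assuming a v8f32-style mask with an undef upper half (the mask and values are example data only):

#include <cstdio>
#include <vector>

int main() {
  const unsigned HalfNumElts = 4;             // v8f32 -> 4-element halves
  // Upper half of the result is undef; lower half reads lower V1 and upper V2.
  std::vector<int> Mask = {0, 2, 13, 15, -1, -1, -1, -1};
  int HalfIdx1 = -1, HalfIdx2 = -1;
  std::vector<int> HalfMask(HalfNumElts, -1);
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts;            // 0 = lower V1, 1 = upper V1,
    int HalfElt = M % HalfNumElts;            // 2 = lower V2, 3 = upper V2
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    } else {
      std::puts("more than two halves referenced - not narrowable");
      return 0;
    }
  }
  // HalfIdx1 = 0 (lower V1), HalfIdx2 = 3 (upper V2), HalfMask = {0, 2, 5, 7}.
  std::printf("HalfIdx1=%d HalfIdx2=%d\n", HalfIdx1, HalfIdx2);
  for (int M : HalfMask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}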
17050
17051/// Given the output values from getHalfShuffleMask(), create a half width
17052/// shuffle of extracted vectors followed by an insert back to full width.
17053static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17054 ArrayRef<int> HalfMask, int HalfIdx1,
17055 int HalfIdx2, bool UndefLower,
17056 SelectionDAG &DAG, bool UseConcat = false) {
17057 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17058 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17059
17060 MVT VT = V1.getSimpleValueType();
17061 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17062 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17063
17064 auto getHalfVector = [&](int HalfIdx) {
17065 if (HalfIdx < 0)
17066 return DAG.getUNDEF(HalfVT);
17067 SDValue V = (HalfIdx < 2 ? V1 : V2);
17068 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17069 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17070 DAG.getIntPtrConstant(HalfIdx, DL));
17071 };
17072
17073 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17074 SDValue Half1 = getHalfVector(HalfIdx1);
17075 SDValue Half2 = getHalfVector(HalfIdx2);
17076 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17077 if (UseConcat) {
17078 SDValue Op0 = V;
17079 SDValue Op1 = DAG.getUNDEF(HalfVT);
17080 if (UndefLower)
17081 std::swap(Op0, Op1);
17082 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17083 }
17084
17085 unsigned Offset = UndefLower ? HalfNumElts : 0;
17086 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17087 DAG.getIntPtrConstant(Offset, DL));
17088}
17089
17090/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17091/// This allows for fast cases such as subvector extraction/insertion
17092/// or shuffling smaller vector types which can lower more efficiently.
17093static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17094 SDValue V2, ArrayRef<int> Mask,
17095 const X86Subtarget &Subtarget,
17096 SelectionDAG &DAG) {
17097 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17098        "Expected 256-bit or 512-bit vector");
17099
17100 bool UndefLower = isUndefLowerHalf(Mask);
17101 if (!UndefLower && !isUndefUpperHalf(Mask))
17102 return SDValue();
17103
17104 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17105        "Completely undef shuffle mask should have been simplified already");
17106
17107 // Upper half is undef and lower half is whole upper subvector.
17108 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17109 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17110 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17111 if (!UndefLower &&
17112 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17113 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17114 DAG.getIntPtrConstant(HalfNumElts, DL));
17115 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17116 DAG.getIntPtrConstant(0, DL));
17117 }
17118
17119 // Lower half is undef and upper half is whole lower subvector.
17120 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17121 if (UndefLower &&
17122 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17123 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17124 DAG.getIntPtrConstant(0, DL));
17125 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17126 DAG.getIntPtrConstant(HalfNumElts, DL));
17127 }
17128
17129 int HalfIdx1, HalfIdx2;
17130 SmallVector<int, 8> HalfMask(HalfNumElts);
17131 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17132 return SDValue();
17133
17134 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17135
17136 // Only shuffle the halves of the inputs when useful.
17137 unsigned NumLowerHalves =
17138 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17139 unsigned NumUpperHalves =
17140 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17141 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17142
17143 // Determine the larger pattern of undef/halves, then decide if it's worth
17144 // splitting the shuffle based on subtarget capabilities and types.
17145 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17146 if (!UndefLower) {
17147 // XXXXuuuu: no insert is needed.
17148 // Always extract lowers when setting lower - these are all free subreg ops.
17149 if (NumUpperHalves == 0)
17150 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17151 UndefLower, DAG);
17152
17153 if (NumUpperHalves == 1) {
17154 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17155 if (Subtarget.hasAVX2()) {
17156 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17157 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17158 !is128BitUnpackShuffleMask(HalfMask) &&
17159 (!isSingleSHUFPSMask(HalfMask) ||
17160 Subtarget.hasFastVariableCrossLaneShuffle()))
17161 return SDValue();
17162 // If this is a unary shuffle (assume that the 2nd operand is
17163 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17164 // are better off extracting the upper half of 1 operand and using a
17165 // narrow shuffle.
17166 if (EltWidth == 64 && V2.isUndef())
17167 return SDValue();
17168 }
17169 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17170 if (Subtarget.hasAVX512() && VT.is512BitVector())
17171 return SDValue();
17172 // Extract + narrow shuffle is better than the wide alternative.
17173 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17174 UndefLower, DAG);
17175 }
17176
17177 // Don't extract both uppers, instead shuffle and then extract.
17178 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17179 return SDValue();
17180 }
17181
17182 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17183 if (NumUpperHalves == 0) {
17184 // AVX2 has efficient 64-bit element cross-lane shuffles.
17185 // TODO: Refine to account for unary shuffle, splat, and other masks?
17186 if (Subtarget.hasAVX2() && EltWidth == 64)
17187 return SDValue();
17188 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17189 if (Subtarget.hasAVX512() && VT.is512BitVector())
17190 return SDValue();
17191 // Narrow shuffle + insert is better than the wide alternative.
17192 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17193 UndefLower, DAG);
17194 }
17195
17196 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17197 return SDValue();
17198}
17199
17200/// Test whether the specified input (0 or 1) is in-place blended by the
17201/// given mask.
17202///
17203/// This returns true if the elements from a particular input are already in the
17204/// slot required by the given mask and require no permutation.
17205static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
17206 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
17207 int Size = Mask.size();
17208 for (int i = 0; i < Size; ++i)
17209 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
17210 return false;
17211
17212 return true;
17213}
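A tiny standalone restatement of the in-place test above with example masks (the free-function name is hypothetical): input 0 or 1 is "in place" when every mask element that refers to it already sits at its own index modulo the vector width.

#include <cstdio>
#include <vector>

static bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  // v4i64 mask <0, 5, 2, 7>: V1 elements stay in slots 0 and 2,
  // V2 elements also land on their own slots 1 and 3 - both in place.
  std::vector<int> Mask = {0, 5, 2, 7};
  std::printf("V1 in place: %d\n", inputInPlace(0, Mask)); // 1
  std::printf("V2 in place: %d\n", inputInPlace(1, Mask)); // 1
  // <1, 5, 2, 7>: V1 element 1 is requested in slot 0 -> V1 not in place.
  Mask[0] = 1;
  std::printf("V1 in place: %d\n", inputInPlace(0, Mask)); // 0
  return 0;
}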
17214
17215/// Handle case where shuffle sources are coming from the same 128-bit lane and
17216/// every lane can be represented as the same repeating mask - allowing us to
17217/// shuffle the sources with the repeating shuffle and then permute the result
17218/// to the destination lanes.
17219static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17220 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17221 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17222 int NumElts = VT.getVectorNumElements();
17223 int NumLanes = VT.getSizeInBits() / 128;
17224 int NumLaneElts = NumElts / NumLanes;
17225
17226 // On AVX2 we may be able to just shuffle the lowest elements and then
17227 // broadcast the result.
17228 if (Subtarget.hasAVX2()) {
17229 for (unsigned BroadcastSize : {16, 32, 64}) {
17230 if (BroadcastSize <= VT.getScalarSizeInBits())
17231 continue;
17232 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17233
17234 // Attempt to match a repeating pattern every NumBroadcastElts,
17235 // accounting for UNDEFs, but only referencing the lowest 128-bit
17236 // lane of the inputs.
17237 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17238 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17239 for (int j = 0; j != NumBroadcastElts; ++j) {
17240 int M = Mask[i + j];
17241 if (M < 0)
17242 continue;
17243 int &R = RepeatMask[j];
17244 if (0 != ((M % NumElts) / NumLaneElts))
17245 return false;
17246 if (0 <= R && R != M)
17247 return false;
17248 R = M;
17249 }
17250 return true;
17251 };
17252
17253 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17254 if (!FindRepeatingBroadcastMask(RepeatMask))
17255 continue;
17256
17257 // Shuffle the (lowest) repeated elements in place for broadcast.
17258 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17259
17260 // Shuffle the actual broadcast.
17261 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17262 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17263 for (int j = 0; j != NumBroadcastElts; ++j)
17264 BroadcastMask[i + j] = j;
17265 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17266 BroadcastMask);
17267 }
17268 }
17269
17270 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17271 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17272 return SDValue();
17273
17274 // Bail if we already have a repeated lane shuffle mask.
17275 SmallVector<int, 8> RepeatedShuffleMask;
17276 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
17277 return SDValue();
17278
17279 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17280 // (with PERMQ/PERMPD). On AVX512BW targets, permuting 32-bit sub-lanes, even
17281 // with a variable shuffle, is worth it for 64xi8 vectors. Otherwise we can
17282 // only permute whole 128-bit lanes.
17283 int SubLaneScale = 1;
17284 if (Subtarget.hasAVX2() && VT.is256BitVector())
17285 SubLaneScale = 2;
17286 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17287 SubLaneScale = 4;
17288 int NumSubLanes = NumLanes * SubLaneScale;
17289 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17290
17291 // Check that all the sources are coming from the same lane and see if we can
17292 // form a repeating shuffle mask (local to each sub-lane). At the same time,
17293 // determine the source sub-lane for each destination sub-lane.
17294 int TopSrcSubLane = -1;
17295 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17296 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17297 SubLaneScale,
17298 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17299
17300 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17301 // Extract the sub-lane mask, check that it all comes from the same lane
17302 // and normalize the mask entries to come from the first lane.
17303 int SrcLane = -1;
17304 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17305 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17306 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17307 if (M < 0)
17308 continue;
17309 int Lane = (M % NumElts) / NumLaneElts;
17310 if ((0 <= SrcLane) && (SrcLane != Lane))
17311 return SDValue();
17312 SrcLane = Lane;
17313 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17314 SubLaneMask[Elt] = LocalM;
17315 }
17316
17317 // Whole sub-lane is UNDEF.
17318 if (SrcLane < 0)
17319 continue;
17320
17321 // Attempt to match against the candidate repeated sub-lane masks.
17322 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17323 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17324 for (int i = 0; i != NumSubLaneElts; ++i) {
17325 if (M1[i] < 0 || M2[i] < 0)
17326 continue;
17327 if (M1[i] != M2[i])
17328 return false;
17329 }
17330 return true;
17331 };
17332
17333 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17334 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17335 continue;
17336
17337 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17338 for (int i = 0; i != NumSubLaneElts; ++i) {
17339 int M = SubLaneMask[i];
17340 if (M < 0)
17341 continue;
17342 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17343        "Unexpected mask element");
17344 RepeatedSubLaneMask[i] = M;
17345 }
17346
17347 // Track the topmost source sub-lane - by setting the remaining to UNDEF
17348 // we can greatly simplify shuffle matching.
17349 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17350 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17351 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17352 break;
17353 }
17354
17355 // Bail if we failed to find a matching repeated sub-lane mask.
17356 if (Dst2SrcSubLanes[DstSubLane] < 0)
17357 return SDValue();
17358 }
17359 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17360        "Unexpected source lane");
17361
17362 // Create a repeating shuffle mask for the entire vector.
17363 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17364 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17365 int Lane = SubLane / SubLaneScale;
17366 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17367 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17368 int M = RepeatedSubLaneMask[Elt];
17369 if (M < 0)
17370 continue;
17371 int Idx = (SubLane * NumSubLaneElts) + Elt;
17372 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17373 }
17374 }
17375 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17376
17377 // Shuffle each source sub-lane to its destination.
17378 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17379 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17380 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17381 if (SrcSubLane < 0)
17382 continue;
17383 for (int j = 0; j != NumSubLaneElts; ++j)
17384 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17385 }
17386
17387 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17388 SubLaneMask);
17389}
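A standalone sketch of the FindRepeatingBroadcastMask check at the top of this function (lines 17237-17251), assuming a v8f32 mask and a 64-bit broadcast width (example values only): the mask must repeat every NumBroadcastElts elements and reference only the lowest 128-bit lane.

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8, NumLaneElts = 4;  // v8f32
  const int NumBroadcastElts = 2;          // 64-bit broadcast = pairs of f32
  // Pairs {1, 2} repeat across the vector and reference only the low lane.
  std::vector<int> Mask = {1, 2, 1, 2, 1, 2, -1, 2};
  std::vector<int> RepeatMask(NumElts, -1);
  bool OK = true;
  for (int i = 0; i != NumElts && OK; i += NumBroadcastElts)
    for (int j = 0; j != NumBroadcastElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      int &R = RepeatMask[j];
      if (((M % NumElts) / NumLaneElts) != 0) {  // must come from the low lane
        OK = false;
        break;
      }
      if (R >= 0 && R != M) {                    // must repeat every pair
        OK = false;
        break;
      }
      R = M;
    }
  // With OK == true the lowering shuffles {1, 2} into the low slots and then
  // broadcasts them with BroadcastMask = {0, 1, 0, 1, 0, 1, 0, 1}.
  std::printf("repeating low-lane broadcast pattern: %s\n", OK ? "yes" : "no");
  return 0;
}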
17390
17391static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17392 bool &ForceV1Zero, bool &ForceV2Zero,
17393 unsigned &ShuffleImm, ArrayRef<int> Mask,
17394 const APInt &Zeroable) {
17395 int NumElts = VT.getVectorNumElements();
17396 assert(VT.getScalarSizeInBits() == 64 &&
17397        (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17398        "Unexpected data type for VSHUFPD");
17399 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17400        "Illegal shuffle mask");
17401
17402 bool ZeroLane[2] = { true, true };
17403 for (int i = 0; i < NumElts; ++i)
17404 ZeroLane[i & 1] &= Zeroable[i];
17405
17406 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
17407 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
17408 ShuffleImm = 0;
17409 bool ShufpdMask = true;
17410 bool CommutableMask = true;
17411 for (int i = 0; i < NumElts; ++i) {
17412 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17413 continue;
17414 if (Mask[i] < 0)
17415 return false;
17416 int Val = (i & 6) + NumElts * (i & 1);
17417 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17418 if (Mask[i] < Val || Mask[i] > Val + 1)
17419 ShufpdMask = false;
17420 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17421 CommutableMask = false;
17422 ShuffleImm |= (Mask[i] % 2) << i;
17423 }
17424
17425 if (!ShufpdMask && !CommutableMask)
17426 return false;
17427
17428 if (!ShufpdMask && CommutableMask)
17429 std::swap(V1, V2);
17430
17431 ForceV1Zero = ZeroLane[0];
17432 ForceV2Zero = ZeroLane[1];
17433 return true;
17434}
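A standalone sketch of the SHUFPD immediate computation above, ignoring the zeroable and commute handling (example mask only, not the LLVM code itself): result element i reads lane (i & 6) of V1 when i is even and of V2 when i is odd, and immediate bit i picks the low or high double of that lane.

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;                   // v4f64
  std::vector<int> Mask = {0, 5, 2, 7};    // classic VSHUFPD pattern
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int Val = (i & 6) + NumElts * (i & 1); // expected base index for slot i
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    ShuffleImm |= (unsigned)(Mask[i] % 2) << i;
  }
  // Mask <0, 5, 2, 7> fits: immediate 0b1010 = 0xa.
  std::printf("matches SHUFPD: %d, imm = 0x%x\n", ShufpdMask, ShuffleImm);
  return 0;
}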
17435
17436static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17437 SDValue V2, ArrayRef<int> Mask,
17438 const APInt &Zeroable,
17439 const X86Subtarget &Subtarget,
17440 SelectionDAG &DAG) {
17441 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17442        "Unexpected data type for VSHUFPD");
17443
17444 unsigned Immediate = 0;
17445 bool ForceV1Zero = false, ForceV2Zero = false;
17446 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17447 Mask, Zeroable))
17448 return SDValue();
17449
17450 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17451 if (ForceV1Zero)
17452 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17453 if (ForceV2Zero)
17454 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17455
17456 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17457 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17458}
17459
17460 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17461// by zeroable elements in the remaining 24 elements. Turn this into two
17462// vmovqb instructions shuffled together.
17463static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17464 SDValue V1, SDValue V2,
17465 ArrayRef<int> Mask,
17466 const APInt &Zeroable,
17467 SelectionDAG &DAG) {
17468 assert(VT == MVT::v32i8 && "Unexpected type!");
17469
17470 // The first 8 indices should be every 8th element.
17471 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17472 return SDValue();
17473
17474 // Remaining elements need to be zeroable.
17475 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17476 return SDValue();
17477
17478 V1 = DAG.getBitcast(MVT::v4i64, V1);
17479 V2 = DAG.getBitcast(MVT::v4i64, V2);
17480
17481 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17482 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17483
17484 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17485 // the upper bits of the result using an unpckldq.
17486 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17487 { 0, 1, 2, 3, 16, 17, 18, 19,
17488 4, 5, 6, 7, 20, 21, 22, 23 });
17489 // Insert the unpckldq into a zero vector to widen to v32i8.
17490 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17491 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17492 DAG.getIntPtrConstant(0, DL));
17493}
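A standalone sketch of the two conditions this lowering checks, assuming a fully zeroable upper 24 elements (the loops are simplified stand-ins for isSequentialOrUndefInRange and the Zeroable APInt):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask(32, -1);
  for (int i = 0; i != 8; ++i)
    Mask[i] = i * 8;                       // 0, 8, 16, 24, 32, 40, 48, 56
  std::vector<bool> Zeroable(32, true);    // stand-in for the Zeroable APInt

  // The first 8 indices must step by 8 (undef entries are tolerated).
  bool StridedLow8 = true;
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * 8)
      StridedLow8 = false;

  // Every remaining element must be zeroable.
  bool UpperZeroable = true;
  for (int i = 8; i != 32; ++i)
    if (!Zeroable[i])
      UpperZeroable = false;

  // When both hold, the shuffle becomes two VTRUNCs (v4i64 -> v16i8) whose low
  // quarters are interleaved with the unpckldq-style mask
  // {0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23} and widened with zeros.
  std::printf("strided: %d, upper zeroable: %d\n", StridedLow8, UpperZeroable);
  return 0;
}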
17494
17495
17496/// Handle lowering of 4-lane 64-bit floating point shuffles.
17497///
17498/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17499/// isn't available.
17500static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17501 const APInt &Zeroable, SDValue V1, SDValue V2,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17505 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17506 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17507
17508 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17509 Subtarget, DAG))
17510 return V;
17511
17512 if (V2.isUndef()) {
17513 // Check for being able to broadcast a single element.
17514 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17515 Mask, Subtarget, DAG))
17516 return Broadcast;
17517
17518 // Use low duplicate instructions for masks that match their pattern.
17519 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17520 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17521
17522 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17523 // Non-half-crossing single input shuffles can be lowered with an
17524 // interleaved permutation.
17525 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17526 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17527 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17528 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17529 }
17530
17531 // With AVX2 we have direct support for this permutation.
17532 if (Subtarget.hasAVX2())
17533 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17534 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17535
17536 // Try to create an in-lane repeating shuffle mask and then shuffle the
17537 // results into the target lanes.
17538 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17539 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17540 return V;
17541
17542 // Try to permute the lanes and then use a per-lane permute.
17543 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17544 Mask, DAG, Subtarget))
17545 return V;
17546
17547 // Otherwise, fall back.
17548 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17549 DAG, Subtarget);
17550 }
17551
17552 // Use dedicated unpack instructions for masks that match their pattern.
17553 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
17554 return V;
17555
17556 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17557 Zeroable, Subtarget, DAG))
17558 return Blend;
17559
17560 // Check if the blend happens to exactly fit that of SHUFPD.
17561 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17562 Zeroable, Subtarget, DAG))
17563 return Op;
17564
17565 // If we have lane crossing shuffles AND they don't all come from the lower
17566 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17567 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
17568 // canonicalizes to a blend of splat, which isn't necessary for this combine.
17569 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17570 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17571 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17572 (V2.getOpcode() != ISD::BUILD_VECTOR))
17573 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
17574 Mask, DAG))
17575 return Op;
17576
17577 // If we have one input in place, then we can permute the other input and
17578 // blend the result.
17579 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17580 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17581 Subtarget, DAG);
17582
17583 // Try to create an in-lane repeating shuffle mask and then shuffle the
17584 // results into the target lanes.
17585 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17586 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17587 return V;
17588
17589 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17590 // shuffle. However, if we have AVX2 and either input is already in place,
17591 // we will be able to shuffle the other input even across lanes in a single
17592 // instruction, so skip this pattern.
17593 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
17594 isShuffleMaskInputInPlace(1, Mask))))
17595 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
17596 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17597 return V;
17598
17599 // If we have VLX support, we can use VEXPAND.
17600 if (Subtarget.hasVLX())
17601 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
17602 DAG, Subtarget))
17603 return V;
17604
17605 // If we have AVX2 then we always want to lower with a blend because at v4 we
17606 // can fully permute the elements.
17607 if (Subtarget.hasAVX2())
17608 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17609 Subtarget, DAG);
17610
17611 // Otherwise fall back on generic lowering.
17612 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
17613 Subtarget, DAG);
17614}
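A standalone sketch of the VPERMILPI immediate built for non-lane-crossing unary v4f64 shuffles above (lines 17525-17526), with an illustrative in-lane mask: bit i selects the high double of element i's 128-bit lane.

#include <cstdio>

int main() {
  // In-lane mask <1, 0, 3, 2>: swap the doubles within each 128-bit lane.
  int Mask[4] = {1, 0, 3, 2};
  unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                          ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
  std::printf("vpermilpd imm = 0x%x\n", VPERMILPMask); // 0x5
  return 0;
}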
17615
17616/// Handle lowering of 4-lane 64-bit integer shuffles.
17617///
17618/// This routine is only called when we have AVX2 and thus a reasonable
17619 /// instruction set for v4i64 shuffling.
17620static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17621 const APInt &Zeroable, SDValue V1, SDValue V2,
17622 const X86Subtarget &Subtarget,
17623 SelectionDAG &DAG) {
17624 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17625 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17626 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17627 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17628
17629 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17630 Subtarget, DAG))
17631 return V;
17632
17633 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17634 Zeroable, Subtarget, DAG))
17635 return Blend;
17636
17637 // Check for being able to broadcast a single element.
17638 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17639 Subtarget, DAG))
17640 return Broadcast;
17641
17642 if (V2.isUndef()) {
17643 // When the shuffle is mirrored between the 128-bit lanes of the input, we
17644 // can use lower-latency instructions that will operate on both lanes.
17645 SmallVector<int, 2> RepeatedMask;
17646 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17647 SmallVector<int, 4> PSHUFDMask;
17648 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17649 return DAG.getBitcast(
17650 MVT::v4i64,
17651 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17652 DAG.getBitcast(MVT::v8i32, V1),
17653 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17654 }
17655
17656 // AVX2 provides a direct instruction for permuting a single input across
17657 // lanes.
17658 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17659 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17660 }
17661
17662 // Try to use shift instructions.
17663 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17664 Zeroable, Subtarget, DAG))
17665 return Shift;
17666
17667 // If we have VLX support, we can use VALIGN or VEXPAND.
17668 if (Subtarget.hasVLX()) {
17669 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17670 Subtarget, DAG))
17671 return Rotate;
17672
17673 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17674 DAG, Subtarget))
17675 return V;
17676 }
17677
17678 // Try to use PALIGNR.
17679 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17680 Subtarget, DAG))
17681 return Rotate;
17682
17683 // Use dedicated unpack instructions for masks that match their pattern.
17684 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17685 return V;
17686
17687 // If we have one input in place, then we can permute the other input and
17688 // blend the result.
17689 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17690 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17691 Subtarget, DAG);
17692
17693 // Try to create an in-lane repeating shuffle mask and then shuffle the
17694 // results into the target lanes.
17695 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17696 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17697 return V;
17698
17699 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17700 // shuffle. However, if we have AVX2 and either input is already in place,
17701 // we will be able to shuffle the other input even across lanes in a single
17702 // instruction, so skip this pattern.
17703 if (!isShuffleMaskInputInPlace(0, Mask) &&
17704 !isShuffleMaskInputInPlace(1, Mask))
17705 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17706 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17707 return Result;
17708
17709 // Otherwise fall back on generic blend lowering.
17710 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17711 Subtarget, DAG);
17712}
17713
17714/// Handle lowering of 8-lane 32-bit floating point shuffles.
17715///
17716/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17717/// isn't available.
17718static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17719 const APInt &Zeroable, SDValue V1, SDValue V2,
17720 const X86Subtarget &Subtarget,
17721 SelectionDAG &DAG) {
17722 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17723 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17724 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17725
17726 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17727 Zeroable, Subtarget, DAG))
17728 return Blend;
17729
17730 // Check for being able to broadcast a single element.
17731 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17732 Subtarget, DAG))
17733 return Broadcast;
17734
17735 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17736 // options to efficiently lower the shuffle.
17737 SmallVector<int, 4> RepeatedMask;
17738 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17739 assert(RepeatedMask.size() == 4 &&
17740        "Repeated masks must be half the mask width!");
17741
17742 // Use even/odd duplicate instructions for masks that match their pattern.
17743 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17744 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17745 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17746 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17747
17748 if (V2.isUndef())
17749 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17750 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17751
17752 // Use dedicated unpack instructions for masks that match their pattern.
17753 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17754 return V;
17755
17756 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17757 // have already handled any direct blends.
17758 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17759 }
17760
17761 // Try to create an in-lane repeating shuffle mask and then shuffle the
17762 // results into the target lanes.
17763 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17764 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17765 return V;
17766
17767 // If we have a single input shuffle with different shuffle patterns in the
17768 // two 128-bit lanes use the variable mask to VPERMILPS.
17769 if (V2.isUndef()) {
17770 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17771 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17772 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17773 }
17774 if (Subtarget.hasAVX2()) {
17775 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17776 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17777 }
17778 // Otherwise, fall back.
17779 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17780 DAG, Subtarget);
17781 }
17782
17783 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17784 // shuffle.
17785 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17786 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17787 return Result;
17788
17789 // If we have VLX support, we can use VEXPAND.
17790 if (Subtarget.hasVLX())
17791 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17792 DAG, Subtarget))
17793 return V;
17794
17795 // For non-AVX512, if the mask is of 16-bit elements within a lane then try to
17796 // split, since after splitting we get more efficient code using vpunpcklwd and
17797 // vpunpckhwd instructions than with vblend.
17798 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17799 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17800 DAG);
17801
17802 // If we have AVX2 then we always want to lower with a blend because at v8 we
17803 // can fully permute the elements.
17804 if (Subtarget.hasAVX2())
17805 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17806 Subtarget, DAG);
17807
17808 // Otherwise fall back on generic lowering.
17809 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17810 Subtarget, DAG);
17811}
17812
17813/// Handle lowering of 8-lane 32-bit integer shuffles.
17814///
17815/// This routine is only called when we have AVX2 and thus a reasonable
17816 /// instruction set for v8i32 shuffling.
17817static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17818 const APInt &Zeroable, SDValue V1, SDValue V2,
17819 const X86Subtarget &Subtarget,
17820 SelectionDAG &DAG) {
17821 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17822 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v8i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v8i32 && \"Bad operand type!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17822, __extension__
__PRETTY_FUNCTION__))
;
17823 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast <bool> (Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17823, __extension__
__PRETTY_FUNCTION__))
;
17824 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v8i32 with AVX2!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17824, __extension__
__PRETTY_FUNCTION__))
;
17825
17826 // Whenever we can lower this as a zext, that instruction is strictly faster
17827 // than any alternative. It also allows us to fold memory operands into the
17828 // shuffle in many cases.
17829 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17830 Zeroable, Subtarget, DAG))
17831 return ZExt;
17832
17833   // For non-AVX512, if the mask consists of 16-bit elements within each lane,
17834   // then try to split, since after the split we get more efficient code than
17835   // with vblend by using vpunpcklwd and vpunpckhwd.
17836 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17837 !Subtarget.hasAVX512())
17838 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17839 DAG);
17840
17841 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17842 Zeroable, Subtarget, DAG))
17843 return Blend;
17844
17845 // Check for being able to broadcast a single element.
17846 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17847 Subtarget, DAG))
17848 return Broadcast;
17849
17850 // If the shuffle mask is repeated in each 128-bit lane we can use more
17851 // efficient instructions that mirror the shuffles across the two 128-bit
17852 // lanes.
17853 SmallVector<int, 4> RepeatedMask;
17854 bool Is128BitLaneRepeatedShuffle =
17855 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17856 if (Is128BitLaneRepeatedShuffle) {
17857     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17858 if (V2.isUndef())
17859 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17860 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17861
17862 // Use dedicated unpack instructions for masks that match their pattern.
17863 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17864 return V;
17865 }
17866
17867 // Try to use shift instructions.
17868 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17869 Zeroable, Subtarget, DAG))
17870 return Shift;
17871
17872 // If we have VLX support, we can use VALIGN or EXPAND.
17873 if (Subtarget.hasVLX()) {
17874 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17875 Subtarget, DAG))
17876 return Rotate;
17877
17878 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17879 DAG, Subtarget))
17880 return V;
17881 }
17882
17883 // Try to use byte rotation instructions.
17884 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17885 Subtarget, DAG))
17886 return Rotate;
17887
17888 // Try to create an in-lane repeating shuffle mask and then shuffle the
17889 // results into the target lanes.
17890 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17891 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17892 return V;
17893
17894 if (V2.isUndef()) {
17895 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17896 // because that should be faster than the variable permute alternatives.
17897 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17898 return V;
17899
17900 // If the shuffle patterns aren't repeated but it's a single input, directly
17901 // generate a cross-lane VPERMD instruction.
17902 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17903 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17904 }
17905
17906 // Assume that a single SHUFPS is faster than an alternative sequence of
17907 // multiple instructions (even if the CPU has a domain penalty).
17908 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17909 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17910 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17911 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17912 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17913 CastV1, CastV2, DAG);
17914 return DAG.getBitcast(MVT::v8i32, ShufPS);
17915 }
17916
17917 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17918 // shuffle.
17919 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17920 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17921 return Result;
17922
17923 // Otherwise fall back on generic blend lowering.
17924 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17925 Subtarget, DAG);
17926}
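// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// a simplified model of the "repeated in each 128-bit lane" test used above
// for v8i32/v8f32. Every element must stay inside its own 128-bit lane and
// all lanes must use the same in-lane pattern; entries are normalised to
// [0,4) for V1 and [4,8) for V2, like the RepeatedMask the real helper fills.
static bool isLaneRepeatedV8(const int Mask[8], int RepeatedMask[4]) {
  const int NumElts = 8, LaneSize = 4;
  for (int i = 0; i != LaneSize; ++i)
    RepeatedMask[i] = -1;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                    // undef matches anything
    if ((M % NumElts) / LaneSize != i / LaneSize)
      return false;                                // crosses a 128-bit lane
    int Local = (M % LaneSize) + (M < NumElts ? 0 : LaneSize);
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = Local;
    else if (RepeatedMask[i % LaneSize] != Local)
      return false;                                // lanes disagree
  }
  return true; // e.g. {0, 8, 1, 9, 4, 12, 5, 13} -> RepeatedMask {0, 4, 1, 5}
}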
17927
17928/// Handle lowering of 16-lane 16-bit integer shuffles.
17929///
17930/// This routine is only called when we have AVX2 and thus a reasonable
17931 /// instruction set for v16i16 shuffling.
17932static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17933 const APInt &Zeroable, SDValue V1, SDValue V2,
17934 const X86Subtarget &Subtarget,
17935 SelectionDAG &DAG) {
17936   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17937   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17938   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17939   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17940
17941 // Whenever we can lower this as a zext, that instruction is strictly faster
17942 // than any alternative. It also allows us to fold memory operands into the
17943 // shuffle in many cases.
17944 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17945 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17946 return ZExt;
17947
17948 // Check for being able to broadcast a single element.
17949 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17950 Subtarget, DAG))
17951 return Broadcast;
17952
17953 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17954 Zeroable, Subtarget, DAG))
17955 return Blend;
17956
17957 // Use dedicated unpack instructions for masks that match their pattern.
17958 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17959 return V;
17960
17961 // Use dedicated pack instructions for masks that match their pattern.
17962 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17963 Subtarget))
17964 return V;
17965
17966   // Try to lower using a truncation.
17967 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17968 Subtarget, DAG))
17969 return V;
17970
17971 // Try to use shift instructions.
17972 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17973 Zeroable, Subtarget, DAG))
17974 return Shift;
17975
17976 // Try to use byte rotation instructions.
17977 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17978 Subtarget, DAG))
17979 return Rotate;
17980
17981 // Try to create an in-lane repeating shuffle mask and then shuffle the
17982 // results into the target lanes.
17983 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17984 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17985 return V;
17986
17987 if (V2.isUndef()) {
17988 // Try to use bit rotation instructions.
17989 if (SDValue Rotate =
17990 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17991 return Rotate;
17992
17993 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17994 // because that should be faster than the variable permute alternatives.
17995 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17996 return V;
17997
17998 // There are no generalized cross-lane shuffle operations available on i16
17999 // element types.
18000 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18001 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18002 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18003 return V;
18004
18005 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18006 DAG, Subtarget);
18007 }
18008
18009 SmallVector<int, 8> RepeatedMask;
18010 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18011 // As this is a single-input shuffle, the repeated mask should be
18012 // a strictly valid v8i16 mask that we can pass through to the v8i16
18013 // lowering to handle even the v16 case.
18014 return lowerV8I16GeneralSingleInputShuffle(
18015 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18016 }
18017 }
18018
18019 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18020 Zeroable, Subtarget, DAG))
18021 return PSHUFB;
18022
18023 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18024 if (Subtarget.hasBWI())
18025 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18026
18027 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18028 // shuffle.
18029 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18030 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18031 return Result;
18032
18033 // Try to permute the lanes and then use a per-lane permute.
18034 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18035 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18036 return V;
18037
18038 // Otherwise fall back on generic lowering.
18039 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18040 Subtarget, DAG);
18041}
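// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// "crossing" a 128-bit lane, as tested by is128BitLaneCrossingShuffleMask in
// the code above, means some result element is sourced from a different
// 128-bit lane than the one it lands in (within either operand).
static bool crossesLanes(const int *Mask, int NumElts, int EltSizeInBits) {
  int LaneSize = 128 / EltSizeInBits;              // elements per 128-bit lane
  for (int i = 0; i != NumElts; ++i)
    if (Mask[i] >= 0 && (Mask[i] % NumElts) / LaneSize != i / LaneSize)
      return true;                                 // pulled across a lane
  return false;
}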
18042
18043/// Handle lowering of 32-lane 8-bit integer shuffles.
18044///
18045/// This routine is only called when we have AVX2 and thus a reasonable
18046 /// instruction set for v32i8 shuffling.
18047static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18048 const APInt &Zeroable, SDValue V1, SDValue V2,
18049 const X86Subtarget &Subtarget,
18050 SelectionDAG &DAG) {
18051   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18052   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18053   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18054   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18055
18056 // Whenever we can lower this as a zext, that instruction is strictly faster
18057 // than any alternative. It also allows us to fold memory operands into the
18058 // shuffle in many cases.
18059 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18060 Zeroable, Subtarget, DAG))
18061 return ZExt;
18062
18063 // Check for being able to broadcast a single element.
18064 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18065 Subtarget, DAG))
18066 return Broadcast;
18067
18068 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18069 Zeroable, Subtarget, DAG))
18070 return Blend;
18071
18072 // Use dedicated unpack instructions for masks that match their pattern.
18073 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18074 return V;
18075
18076 // Use dedicated pack instructions for masks that match their pattern.
18077 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18078 Subtarget))
18079 return V;
18080
18081   // Try to lower using a truncation.
18082 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18083 Subtarget, DAG))
18084 return V;
18085
18086 // Try to use shift instructions.
18087 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
18088 Zeroable, Subtarget, DAG))
18089 return Shift;
18090
18091 // Try to use byte rotation instructions.
18092 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18093 Subtarget, DAG))
18094 return Rotate;
18095
18096 // Try to use bit rotation instructions.
18097 if (V2.isUndef())
18098 if (SDValue Rotate =
18099 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18100 return Rotate;
18101
18102 // Try to create an in-lane repeating shuffle mask and then shuffle the
18103 // results into the target lanes.
18104 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18105 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18106 return V;
18107
18108 // There are no generalized cross-lane shuffle operations available on i8
18109 // element types.
18110 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18111 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18112 // because that should be faster than the variable permute alternatives.
18113 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18114 return V;
18115
18116 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18117 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18118 return V;
18119
18120 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18121 DAG, Subtarget);
18122 }
18123
18124 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18125 Zeroable, Subtarget, DAG))
18126 return PSHUFB;
18127
18128 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18129 if (Subtarget.hasVBMI())
18130 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18131
18132 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18133 // shuffle.
18134 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18135 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18136 return Result;
18137
18138 // Try to permute the lanes and then use a per-lane permute.
18139 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18140 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18141 return V;
18142
18143   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18144   // by zeroable elements in the remaining 24 elements. Turn this into two
18145   // vmovqb instructions shuffled together.
18146 if (Subtarget.hasVLX())
18147 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18148 Mask, Zeroable, DAG))
18149 return V;
18150
18151 // Otherwise fall back on generic lowering.
18152 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18153 Subtarget, DAG);
18154}
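// Illustrative sketch (scalar model, not part of X86ISelLowering.cpp): the
// PSHUFB lowering used above depends on the instruction's control-byte
// semantics -- each output byte selects a byte from within its own 128-bit
// lane, and a set high bit in the control byte produces zero.
static void vpshufb256Model(const unsigned char Src[32],
                            const unsigned char Ctrl[32],
                            unsigned char Dst[32]) {
  for (int i = 0; i != 32; ++i) {
    int LaneBase = (i / 16) * 16;                  // stay inside the lane
    unsigned char C = Ctrl[i];
    Dst[i] = (C & 0x80) ? 0 : Src[LaneBase + (C & 0x0F)];
  }
}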
18155
18156/// High-level routine to lower various 256-bit x86 vector shuffles.
18157///
18158/// This routine either breaks down the specific type of a 256-bit x86 vector
18159/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18160/// together based on the available instructions.
18161static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18162 SDValue V1, SDValue V2, const APInt &Zeroable,
18163 const X86Subtarget &Subtarget,
18164 SelectionDAG &DAG) {
18165 // If we have a single input to the zero element, insert that into V1 if we
18166 // can do so cheaply.
18167 int NumElts = VT.getVectorNumElements();
18168 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18169
18170 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18171 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18172 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18173 return Insertion;
18174
18175 // Handle special cases where the lower or upper half is UNDEF.
18176 if (SDValue V =
18177 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18178 return V;
18179
18180 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18181 // can check for those subtargets here and avoid much of the subtarget
18182 // querying in the per-vector-type lowering routines. With AVX1 we have
18183 // essentially *zero* ability to manipulate a 256-bit vector with integer
18184 // types. Since we'll use floating point types there eventually, just
18185 // immediately cast everything to a float and operate entirely in that domain.
18186 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18187 int ElementBits = VT.getScalarSizeInBits();
18188 if (ElementBits < 32) {
18189 // No floating point type available, if we can't use the bit operations
18190 // for masking/blending then decompose into 128-bit vectors.
18191 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18192 Subtarget, DAG))
18193 return V;
18194 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18195 return V;
18196 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18197 }
18198
18199 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18200 VT.getVectorNumElements());
18201 V1 = DAG.getBitcast(FpVT, V1);
18202 V2 = DAG.getBitcast(FpVT, V2);
18203 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18204 }
18205
18206 if (VT == MVT::v16f16) {
18207 V1 = DAG.getBitcast(MVT::v16i16, V1);
18208 V2 = DAG.getBitcast(MVT::v16i16, V2);
18209 return DAG.getBitcast(MVT::v16f16,
18210 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18211 }
18212
18213 switch (VT.SimpleTy) {
18214 case MVT::v4f64:
18215 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18216 case MVT::v4i64:
18217 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18218 case MVT::v8f32:
18219 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18220 case MVT::v8i32:
18221 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18222 case MVT::v16i16:
18223 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18224 case MVT::v32i8:
18225 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18226
18227 default:
18228 llvm_unreachable("Not a valid 256-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 256-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 18228)
;
18229 }
18230}
18231
18232 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
18233static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18234 const APInt &Zeroable, SDValue V1, SDValue V2,
18235 const X86Subtarget &Subtarget,
18236 SelectionDAG &DAG) {
18237   assert(VT.getScalarSizeInBits() == 64 &&
18238          "Unexpected element type size for 128bit shuffle.");
18239
18240   // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
18241   // most probably the better solution for that case.
18242   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
18243
18244 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
18245 SmallVector<int, 4> Widened128Mask;
18246 if (!canWidenShuffleElements(Mask, Widened128Mask))
18247 return SDValue();
18248   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
18249
18250 // Try to use an insert into a zero vector.
18251 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
18252 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
18253 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
18254 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
18255 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
18256 DAG.getIntPtrConstant(0, DL));
18257 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18258 getZeroVector(VT, Subtarget, DAG, DL), LoV,
18259 DAG.getIntPtrConstant(0, DL));
18260 }
18261
18262 // Check for patterns which can be matched with a single insert of a 256-bit
18263 // subvector.
18264 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
18265 if (OnlyUsesV1 ||
18266 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
18267 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
18268 SDValue SubVec =
18269 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
18270 DAG.getIntPtrConstant(0, DL));
18271 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
18272 DAG.getIntPtrConstant(4, DL));
18273 }
18274
18275 // See if this is an insertion of the lower 128-bits of V2 into V1.
18276 bool IsInsert = true;
18277 int V2Index = -1;
18278 for (int i = 0; i < 4; ++i) {
18279     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18280 if (Widened128Mask[i] < 0)
18281 continue;
18282
18283 // Make sure all V1 subvectors are in place.
18284 if (Widened128Mask[i] < 4) {
18285 if (Widened128Mask[i] != i) {
18286 IsInsert = false;
18287 break;
18288 }
18289 } else {
18290       // Make sure we only have a single V2 index and it's the lowest 128 bits.
18291 if (V2Index >= 0 || Widened128Mask[i] != 4) {
18292 IsInsert = false;
18293 break;
18294 }
18295 V2Index = i;
18296 }
18297 }
18298 if (IsInsert && V2Index >= 0) {
18299 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
18300 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
18301 DAG.getIntPtrConstant(0, DL));
18302 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
18303 }
18304
18305   // See if we can widen to a 256-bit lane shuffle; we're going to lose the
18306   // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
18307   // widening where possible we at least ensure the lanes stay sequential to
18308   // help later combines.
18309 SmallVector<int, 2> Widened256Mask;
18310 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
18311 Widened128Mask.clear();
18312 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
18313 }
18314
18315 // Try to lower to vshuf64x2/vshuf32x4.
18316 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
18317 unsigned PermMask = 0;
18318   // Ensure elements came from the same Op.
18319 for (int i = 0; i < 4; ++i) {
18320     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18321 if (Widened128Mask[i] < 0)
18322 continue;
18323
18324 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
18325 unsigned OpIndex = i / 2;
18326 if (Ops[OpIndex].isUndef())
18327 Ops[OpIndex] = Op;
18328 else if (Ops[OpIndex] != Op)
18329 return SDValue();
18330
18331 // Convert the 128-bit shuffle mask selection values into 128-bit selection
18332 // bits defined by a vshuf64x2 instruction's immediate control byte.
18333 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
18334 }
18335
18336 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
18337 DAG.getTargetConstant(PermMask, DL, MVT::i8));
18338}
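// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// the vshuf64x2/vshuf32x4 immediate built in the loop above gives each of the
// four 128-bit result lanes a 2-bit selector taken modulo 4; which operand
// feeds each 256-bit half is chosen separately through the Ops[] array.
static unsigned shuf128Imm(const int Widened128Mask[4]) {
  unsigned PermMask = 0;
  for (int i = 0; i != 4; ++i)
    if (Widened128Mask[i] >= 0)
      PermMask |= (unsigned)(Widened128Mask[i] % 4) << (i * 2);
  return PermMask; // e.g. {0, 1, 4, 5} -> 0x44 (lanes 0,1 of V1, then of V2)
}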
18339
18340/// Handle lowering of 8-lane 64-bit floating point shuffles.
18341static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18342 const APInt &Zeroable, SDValue V1, SDValue V2,
18343 const X86Subtarget &Subtarget,
18344 SelectionDAG &DAG) {
18345   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18346   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18347   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18348
18349 if (V2.isUndef()) {
18350 // Use low duplicate instructions for masks that match their pattern.
18351 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18352 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
18353
18354 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
18355 // Non-half-crossing single input shuffles can be lowered with an
18356 // interleaved permutation.
18357 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18358 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18359 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18360 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18361 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18362 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18363 }
18364
18365 SmallVector<int, 4> RepeatedMask;
18366 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18367 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18368 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18369 }
18370
18371 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18372 V2, Subtarget, DAG))
18373 return Shuf128;
18374
18375 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18376 return Unpck;
18377
18378 // Check if the blend happens to exactly fit that of SHUFPD.
18379 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18380 Zeroable, Subtarget, DAG))
18381 return Op;
18382
18383 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18384 DAG, Subtarget))
18385 return V;
18386
18387 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18388 Zeroable, Subtarget, DAG))
18389 return Blend;
18390
18391 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18392}
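// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// for v8f64 VPERMILPI each immediate bit selects the low (0) or high (1)
// double of the 128-bit lane its result element lives in, which is what the
// chain of (Mask[i] == odd-index) comparisons above computes.
static unsigned vpermilpdImm(const int Mask[8]) {
  unsigned Imm = 0;
  for (int i = 0; i != 8; ++i)
    Imm |= (unsigned)(Mask[i] == (i | 1)) << i;    // picked the high element?
  return Imm; // e.g. {1, 0, 3, 2, 5, 4, 7, 6} (swap every pair) -> 0x55
}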
18393
18394/// Handle lowering of 16-lane 32-bit floating point shuffles.
18395static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18396 const APInt &Zeroable, SDValue V1, SDValue V2,
18397 const X86Subtarget &Subtarget,
18398 SelectionDAG &DAG) {
18399   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18400   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18401   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18402
18403 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18404 // options to efficiently lower the shuffle.
18405 SmallVector<int, 4> RepeatedMask;
18406 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18407     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18408
18409 // Use even/odd duplicate instructions for masks that match their pattern.
18410 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18411 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18412 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18413 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18414
18415 if (V2.isUndef())
18416 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18417 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18418
18419 // Use dedicated unpack instructions for masks that match their pattern.
18420 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
18421 return V;
18422
18423 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18424 Zeroable, Subtarget, DAG))
18425 return Blend;
18426
18427 // Otherwise, fall back to a SHUFPS sequence.
18428 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18429 }
18430
18431 // Try to create an in-lane repeating shuffle mask and then shuffle the
18432 // results into the target lanes.
18433 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18434 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18435 return V;
18436
18437   // If we have a single-input shuffle with different shuffle patterns in the
18438   // 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
18439 if (V2.isUndef() &&
18440 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18441 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18442 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18443 }
18444
18445 // If we have AVX512F support, we can use VEXPAND.
18446 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
18447 V1, V2, DAG, Subtarget))
18448 return V;
18449
18450 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18451}
18452
18453/// Handle lowering of 8-lane 64-bit integer shuffles.
18454static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18455 const APInt &Zeroable, SDValue V1, SDValue V2,
18456 const X86Subtarget &Subtarget,
18457 SelectionDAG &DAG) {
18458   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18459   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18460   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18461
18462 if (V2.isUndef()) {
18463 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18464 // can use lower latency instructions that will operate on all four
18465 // 128-bit lanes.
18466 SmallVector<int, 2> Repeated128Mask;
18467 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18468 SmallVector<int, 4> PSHUFDMask;
18469 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18470 return DAG.getBitcast(
18471 MVT::v8i64,
18472 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18473 DAG.getBitcast(MVT::v16i32, V1),
18474 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18475 }
18476
18477 SmallVector<int, 4> Repeated256Mask;
18478 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18479 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18480 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18481 }
18482
18483 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18484 V2, Subtarget, DAG))
18485 return Shuf128;
18486
18487 // Try to use shift instructions.
18488 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
18489 Zeroable, Subtarget, DAG))
18490 return Shift;
18491
18492 // Try to use VALIGN.
18493 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18494 Subtarget, DAG))
18495 return Rotate;
18496
18497 // Try to use PALIGNR.
18498 if (Subtarget.hasBWI())
18499 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18500 Subtarget, DAG))
18501 return Rotate;
18502
18503 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
18504 return Unpck;
18505
18506 // If we have AVX512F support, we can use VEXPAND.
18507 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
18508 DAG, Subtarget))
18509 return V;
18510
18511 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18512 Zeroable, Subtarget, DAG))
18513 return Blend;
18514
18515 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18516}
18517
18518/// Handle lowering of 16-lane 32-bit integer shuffles.
18519static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18520 const APInt &Zeroable, SDValue V1, SDValue V2,
18521 const X86Subtarget &Subtarget,
18522 SelectionDAG &DAG) {
18523   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18524   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18525   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18526
18527 // Whenever we can lower this as a zext, that instruction is strictly faster
18528 // than any alternative. It also allows us to fold memory operands into the
18529 // shuffle in many cases.
18530 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18531 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18532 return ZExt;
18533
18534 // If the shuffle mask is repeated in each 128-bit lane we can use more
18535 // efficient instructions that mirror the shuffles across the four 128-bit
18536 // lanes.
18537 SmallVector<int, 4> RepeatedMask;
18538 bool Is128BitLaneRepeatedShuffle =
18539 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18540 if (Is128BitLaneRepeatedShuffle) {
18541     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18542 if (V2.isUndef())
18543 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18544 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18545
18546 // Use dedicated unpack instructions for masks that match their pattern.
18547 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
18548 return V;
18549 }
18550
18551 // Try to use shift instructions.
18552 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
18553 Zeroable, Subtarget, DAG))
18554 return Shift;
18555
18556 // Try to use VALIGN.
18557 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18558 Subtarget, DAG))
18559 return Rotate;
18560
18561 // Try to use byte rotation instructions.
18562 if (Subtarget.hasBWI())
18563 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18564 Subtarget, DAG))
18565 return Rotate;
18566
18567 // Assume that a single SHUFPS is faster than using a permv shuffle.
18568 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18569 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18570 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18571 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18572 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18573 CastV1, CastV2, DAG);
18574 return DAG.getBitcast(MVT::v16i32, ShufPS);
18575 }
18576
18577 // Try to create an in-lane repeating shuffle mask and then shuffle the
18578 // results into the target lanes.
18579 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18580 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18581 return V;
18582
18583 // If we have AVX512F support, we can use VEXPAND.
18584 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
18585 DAG, Subtarget))
18586 return V;
18587
18588 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18589 Zeroable, Subtarget, DAG))
18590 return Blend;
18591
18592 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18593}
18594
18595/// Handle lowering of 32-lane 16-bit integer shuffles.
18596static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18597 const APInt &Zeroable, SDValue V1, SDValue V2,
18598 const X86Subtarget &Subtarget,
18599 SelectionDAG &DAG) {
18600   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18601   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18602   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18603   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18604
18605 // Whenever we can lower this as a zext, that instruction is strictly faster
18606 // than any alternative. It also allows us to fold memory operands into the
18607 // shuffle in many cases.
18608 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18609 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18610 return ZExt;
18611
18612 // Use dedicated unpack instructions for masks that match their pattern.
18613 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
18614 return V;
18615
18616 // Use dedicated pack instructions for masks that match their pattern.
18617 if (SDValue V =
18618 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
18619 return V;
18620
18621 // Try to use shift instructions.
18622 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
18623 Zeroable, Subtarget, DAG))
18624 return Shift;
18625
18626 // Try to use byte rotation instructions.
18627 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18628 Subtarget, DAG))
18629 return Rotate;
18630
18631 if (V2.isUndef()) {
18632 // Try to use bit rotation instructions.
18633 if (SDValue Rotate =
18634 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18635 return Rotate;
18636
18637 SmallVector<int, 8> RepeatedMask;
18638 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18639 // As this is a single-input shuffle, the repeated mask should be
18640 // a strictly valid v8i16 mask that we can pass through to the v8i16
18641 // lowering to handle even the v32 case.
18642 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18643 RepeatedMask, Subtarget, DAG);
18644 }
18645 }
18646
18647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18648 Zeroable, Subtarget, DAG))
18649 return Blend;
18650
18651 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18652 Zeroable, Subtarget, DAG))
18653 return PSHUFB;
18654
18655 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18656}
18657
18658/// Handle lowering of 64-lane 8-bit integer shuffles.
18659static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18660 const APInt &Zeroable, SDValue V1, SDValue V2,
18661 const X86Subtarget &Subtarget,
18662 SelectionDAG &DAG) {
18663   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18664   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18665   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18666   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18667
18668 // Whenever we can lower this as a zext, that instruction is strictly faster
18669 // than any alternative. It also allows us to fold memory operands into the
18670 // shuffle in many cases.
18671 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18672 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18673 return ZExt;
18674
18675 // Use dedicated unpack instructions for masks that match their pattern.
18676 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18677 return V;
18678
18679 // Use dedicated pack instructions for masks that match their pattern.
18680 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18681 Subtarget))
18682 return V;
18683
18684 // Try to use shift instructions.
18685 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18686 Zeroable, Subtarget, DAG))
18687 return Shift;
18688
18689 // Try to use byte rotation instructions.
18690 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18691 Subtarget, DAG))
18692 return Rotate;
18693
18694 // Try to use bit rotation instructions.
18695 if (V2.isUndef())
18696 if (SDValue Rotate =
18697 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18698 return Rotate;
18699
18700 // Lower as AND if possible.
18701 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18702 Zeroable, Subtarget, DAG))
18703 return Masked;
18704
18705 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18706 Zeroable, Subtarget, DAG))
18707 return PSHUFB;
18708
18709 // Try to create an in-lane repeating shuffle mask and then shuffle the
18710 // results into the target lanes.
18711 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18712 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18713 return V;
18714
18715 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
18716 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18717 return Result;
18718
18719 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18720 Zeroable, Subtarget, DAG))
18721 return Blend;
18722
18723 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18724 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18725 // PALIGNR will be cheaper than the second PSHUFB+OR.
18726 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18727 Mask, Subtarget, DAG))
18728 return V;
18729
18730 // If we can't directly blend but can use PSHUFB, that will be better as it
18731 // can both shuffle and set up the inefficient blend.
18732 bool V1InUse, V2InUse;
18733 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18734 DAG, V1InUse, V2InUse);
18735 }
18736
18737 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18738 // shuffle.
18739 if (!V2.isUndef())
18740 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18741 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18742 return Result;
18743
18744 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18745 if (Subtarget.hasVBMI())
18746 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18747
18748 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18749}
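// Illustrative sketch (scalar model, not part of X86ISelLowering.cpp): the
// byte-rotation lowerings above ultimately target PALIGNR/VPALIGNR, which per
// 128-bit lane concatenates the two sources (first source in the high half)
// and extracts a byte-shifted 16-byte window.
static void palignrLaneModel(const unsigned char Hi[16],
                             const unsigned char Lo[16], int Imm,
                             unsigned char Dst[16]) {
  // Conceptually: Dst = low 16 bytes of ((Hi:Lo) >> (8 * Imm)).
  for (int i = 0; i != 16; ++i) {
    int Idx = i + Imm;
    Dst[i] = Idx < 16 ? Lo[Idx] : (Idx < 32 ? Hi[Idx - 16] : 0);
  }
}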
18750
18751/// High-level routine to lower various 512-bit x86 vector shuffles.
18752///
18753/// This routine either breaks down the specific type of a 512-bit x86 vector
18754/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18755/// together based on the available instructions.
18756static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18757 MVT VT, SDValue V1, SDValue V2,
18758 const APInt &Zeroable,
18759 const X86Subtarget &Subtarget,
18760 SelectionDAG &DAG) {
18761   assert(Subtarget.hasAVX512() &&
18762          "Cannot lower 512-bit vectors w/ basic ISA!");
18763
18764 // If we have a single input to the zero element, insert that into V1 if we
18765 // can do so cheaply.
18766 int NumElts = Mask.size();
18767 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18768
18769 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18770 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18771 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18772 return Insertion;
18773
18774 // Handle special cases where the lower or upper half is UNDEF.
18775 if (SDValue V =
18776 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18777 return V;
18778
18779 // Check for being able to broadcast a single element.
18780 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18781 Subtarget, DAG))
18782 return Broadcast;
18783
18784 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18785 // Try using bit ops for masking and blending before falling back to
18786 // splitting.
18787 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18788 Subtarget, DAG))
18789 return V;
18790 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18791 return V;
18792
18793 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18794 }
18795
18796 if (VT == MVT::v32f16) {
18797 V1 = DAG.getBitcast(MVT::v32i16, V1);
18798 V2 = DAG.getBitcast(MVT::v32i16, V2);
18799 return DAG.getBitcast(MVT::v32f16,
18800 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18801 }
18802
18803 // Dispatch to each element type for lowering. If we don't have support for
18804 // specific element type shuffles at 512 bits, immediately split them and
18805 // lower them. Each lowering routine of a given type is allowed to assume that
18806 // the requisite ISA extensions for that element type are available.
18807 switch (VT.SimpleTy) {
18808 case MVT::v8f64:
18809 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18810 case MVT::v16f32:
18811 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18812 case MVT::v8i64:
18813 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18814 case MVT::v16i32:
18815 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18816 case MVT::v32i16:
18817 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18818 case MVT::v64i8:
18819 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18820
18821 default:
18822 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 18822)
;
18823 }
18824}
18825
18826static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18827 MVT VT, SDValue V1, SDValue V2,
18828 const X86Subtarget &Subtarget,
18829 SelectionDAG &DAG) {
18830 // Shuffle should be unary.
18831 if (!V2.isUndef())
18832 return SDValue();
18833
18834 int ShiftAmt = -1;
18835 int NumElts = Mask.size();
18836 for (int i = 0; i != NumElts; ++i) {
18837 int M = Mask[i];
18838     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18839            "Unexpected mask index.");
18840 if (M < 0)
18841 continue;
18842
18843 // The first non-undef element determines our shift amount.
18844 if (ShiftAmt < 0) {
18845 ShiftAmt = M - i;
18846 // Need to be shifting right.
18847 if (ShiftAmt <= 0)
18848 return SDValue();
18849 }
18850 // All non-undef elements must shift by the same amount.
18851 if (ShiftAmt != M - i)
18852 return SDValue();
18853 }
18854   assert(ShiftAmt >= 0 && "All undef?");
18855
18856   // Great, we found a right shift.
18857 MVT WideVT = VT;
18858 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18859 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18860 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18861 DAG.getUNDEF(WideVT), V1,
18862 DAG.getIntPtrConstant(0, DL));
18863 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18864 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18865 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18866 DAG.getIntPtrConstant(0, DL));
18867}
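// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// the matcher above only fires when every defined mask element implies the
// same positive shift, i.e. the whole k-register is read shifted right by a
// constant amount.
static int matchKShiftRight(const int *Mask, int NumElts) {
  int ShiftAmt = -1;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;                    // undef elements match any shift
    int Amt = Mask[i] - i;         // shift implied by this element
    if (ShiftAmt < 0)
      ShiftAmt = Amt;
    if (Amt != ShiftAmt || Amt <= 0)
      return -1;                   // not a uniform right shift
  }
  return ShiftAmt; // e.g. {2, 3, 4, 5, 6, 7, -1, -1} -> 2 (KSHIFTR by 2)
}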
18868
18869// Determine if this shuffle can be implemented with a KSHIFT instruction.
18870// Returns the shift amount if possible or -1 if not. This is a simplified
18871// version of matchShuffleAsShift.
18872static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18873 int MaskOffset, const APInt &Zeroable) {
18874 int Size = Mask.size();
18875
18876 auto CheckZeros = [&](int Shift, bool Left) {
18877 for (int j = 0; j < Shift; ++j)
18878 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18879 return false;
18880
18881 return true;
18882 };
18883
18884 auto MatchShift = [&](int Shift, bool Left) {
18885 unsigned Pos = Left ? Shift : 0;
18886 unsigned Low = Left ? 0 : Shift;
18887 unsigned Len = Size - Shift;
18888 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18889 };
18890
18891 for (int Shift = 1; Shift != Size; ++Shift)
18892 for (bool Left : {true, false})
18893 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18894 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18895 return Shift;
18896 }
18897
18898 return -1;
18899}
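
match1BitShuffleAsKSHIFT tries every shift amount in both directions, requiring the vacated lanes to be zeroable and the remaining lanes to count up sequentially. A scalar re-statement under simplified types (Zeroable modelled as std::vector<bool>, -1 used for undef/zeroed lanes; an illustration, not the LLVM API):

  #include <cstdio>
  #include <vector>

  static bool seqOrUndef(const std::vector<int> &Mask, unsigned Pos,
                         unsigned Len, int Low) {
    for (unsigned i = 0; i != Len; ++i) {
      int M = Mask[Pos + i];
      if (M >= 0 && M != Low + (int)i)
        return false;             // Defined elements must count up from Low.
    }
    return true;
  }

  static int matchKSHIFT(bool &Left, const std::vector<int> &Mask,
                         int MaskOffset, const std::vector<bool> &Zeroable) {
    int Size = (int)Mask.size();
    for (int Shift = 1; Shift != Size; ++Shift)
      for (bool L : {true, false}) {
        bool ZerosOk = true;
        for (int j = 0; j < Shift; ++j)       // Vacated lanes must be zeroable.
          ZerosOk &= Zeroable[j + (L ? 0 : Size - Shift)];
        unsigned Pos = L ? Shift : 0, Low = L ? 0 : Shift;
        if (ZerosOk && seqOrUndef(Mask, Pos, Size - Shift, Low + MaskOffset)) {
          Left = L;
          return Shift;
        }
      }
    return -1;
  }

  int main() {
    // <z,z,0,1,2,3,4,5> with the two low lanes zeroable matches KSHIFTL by 2.
    bool Left = false;
    std::vector<int> Mask = {-1, -1, 0, 1, 2, 3, 4, 5};
    std::vector<bool> Z = {true, true, false, false, false, false, false, false};
    int Shift = matchKSHIFT(Left, Mask, 0, Z);
    std::printf("shift=%d left=%d\n", Shift, (int)Left);
  }

For the mask <z,z,0,1,2,3,4,5> with the two low lanes zeroable this reports a left shift by 2, matching what KSHIFTL would produce.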
18900
18901
18902// Lower vXi1 vector shuffles.
18903 // There is no dedicated instruction on AVX-512 that shuffles the masks.
18904 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18905 // vector, shuffle it, and then truncate it back.
18906static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18907 MVT VT, SDValue V1, SDValue V2,
18908 const APInt &Zeroable,
18909 const X86Subtarget &Subtarget,
18910 SelectionDAG &DAG) {
18911 assert(Subtarget.hasAVX512() &&
18912 "Cannot lower 512-bit vectors w/o basic ISA!");
18913
18914 int NumElts = Mask.size();
18915
18916 // Try to recognize shuffles that are just padding a subvector with zeros.
18917 int SubvecElts = 0;
18918 int Src = -1;
18919 for (int i = 0; i != NumElts; ++i) {
18920 if (Mask[i] >= 0) {
18921 // Grab the source from the first valid mask element. All subsequent elements
18922 // must use this same source.
18923 if (Src < 0)
18924 Src = Mask[i] / NumElts;
18925 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18926 break;
18927 }
18928
18929 ++SubvecElts;
18930 }
18931 assert(SubvecElts != NumElts && "Identity shuffle?");
18932
18933 // Clip to a power of 2.
18934 SubvecElts = PowerOf2Floor(SubvecElts);
18935
18936 // Make sure the number of zeroable bits in the top at least covers the bits
18937 // not covered by the subvector.
18938 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18939 assert(Src >= 0 && "Expected a source!");
18940 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18941 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18942 Src == 0 ? V1 : V2,
18943 DAG.getIntPtrConstant(0, DL));
18944 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18945 DAG.getConstant(0, DL, VT),
18946 Extract, DAG.getIntPtrConstant(0, DL));
18947 }
18948
18949 // Try a simple shift right with undef elements. Later we'll try with zeros.
18950 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18951 DAG))
18952 return Shift;
18953
18954 // Try to match KSHIFTs.
18955 unsigned Offset = 0;
18956 for (SDValue V : { V1, V2 }) {
18957 unsigned Opcode;
18958 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18959 if (ShiftAmt >= 0) {
18960 MVT WideVT = VT;
18961 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18962 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18963 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18964 DAG.getUNDEF(WideVT), V,
18965 DAG.getIntPtrConstant(0, DL));
18966 // Widened right shifts need two shifts to ensure we shift in zeroes.
18967 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18968 int WideElts = WideVT.getVectorNumElements();
18969 // Shift left to put the original vector in the MSBs of the new size.
18970 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18971 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18972 // Increase the shift amount to account for the left shift.
18973 ShiftAmt += WideElts - NumElts;
18974 }
18975
18976 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18977 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18978 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18979 DAG.getIntPtrConstant(0, DL));
18980 }
18981 Offset += NumElts; // Increment for next iteration.
18982 }
18983
18984 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
18985 // TODO: What other unary shuffles would benefit from this?
18986 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
18987 V1->hasOneUse()) {
18988 SDValue Op0 = V1.getOperand(0);
18989 SDValue Op1 = V1.getOperand(1);
18990 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18991 EVT OpVT = Op0.getValueType();
18992 return DAG.getSetCC(
18993 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18994 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18995 }
18996
18997 MVT ExtVT;
18998 switch (VT.SimpleTy) {
18999 default:
19000 llvm_unreachable("Expected a vector of i1 elements");
19001 case MVT::v2i1:
19002 ExtVT = MVT::v2i64;
19003 break;
19004 case MVT::v4i1:
19005 ExtVT = MVT::v4i32;
19006 break;
19007 case MVT::v8i1:
19008 // Take a 512-bit type; there are more shuffles on KNL. If we have VLX, use a
19009 // 256-bit shuffle.
19010 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19011 break;
19012 case MVT::v16i1:
19013 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19014 // 256-bit operation available.
19015 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19016 break;
19017 case MVT::v32i1:
19018 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19019 // 256-bit operation available.
19020 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19021 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19022 break;
19023 case MVT::v64i1:
19024 // Fall back to scalarization. FIXME: We can do better if the shuffle
19025 // can be partitioned cleanly.
19026 if (!Subtarget.useBWIRegs())
19027 return SDValue();
19028 ExtVT = MVT::v64i8;
19029 break;
19030 }
19031
19032 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19033 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19034
19035 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19036 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
19037 int NumElems = VT.getVectorNumElements();
19038 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19039 (Subtarget.hasDQI() && (NumElems < 32)))
19040 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19041 Shuffle, ISD::SETGT);
19042
19043 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19044}
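
The subvector-padding check at the top of lower1BitShuffle counts how many leading lanes are an identity copy from a single source and then clips that count with PowerOf2Floor. A compact model of just that counting step (illustrative; leadingIdentityElts is a hypothetical name, not an LLVM helper):

  #include <cstdio>
  #include <vector>

  static int leadingIdentityElts(const std::vector<int> &Mask, int &Src) {
    int NumElts = (int)Mask.size();
    int Count = 0;
    Src = -1;
    for (int i = 0; i != NumElts; ++i) {
      if (Mask[i] >= 0) {
        if (Src < 0)
          Src = Mask[i] / NumElts;            // 0 selects V1, 1 selects V2.
        if (Src != Mask[i] / NumElts || Mask[i] % NumElts != i)
          break;                              // Not an identity lane anymore.
      }
      ++Count;
    }
    while (Count & (Count - 1))               // PowerOf2Floor, spelled out.
      Count &= Count - 1;
    return Count;
  }

  int main() {
    int Src;
    // <0,1,2,u,4,9,u,u>: five leading identity-or-undef lanes clip down to 4.
    std::printf("%d\n", leadingIdentityElts({0, 1, 2, -1, 4, 9, -1, -1}, Src));
  }

For <0,1,2,u,4,9,...> the five leading identity-or-undef lanes clip down to 4, which is the widest power-of-two subvector the real code would extract.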
19045
19046/// Helper function that returns true if the shuffle mask should be
19047/// commuted to improve canonicalization.
19048static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19049 int NumElements = Mask.size();
19050
19051 int NumV1Elements = 0, NumV2Elements = 0;
19052 for (int M : Mask)
19053 if (M < 0)
19054 continue;
19055 else if (M < NumElements)
19056 ++NumV1Elements;
19057 else
19058 ++NumV2Elements;
19059
19060 // Commute the shuffle as needed such that more elements come from V1 than
19061 // V2. This allows us to match the shuffle pattern strictly on how many
19062 // elements come from V1 without handling the symmetric cases.
19063 if (NumV2Elements > NumV1Elements)
19064 return true;
19065
19066 assert(NumV1Elements > 0 && "No V1 indices");
19067
19068 if (NumV2Elements == 0)
19069 return false;
19070
19071 // When the number of V1 and V2 elements is the same, try to minimize the
19072 // number of uses of V2 in the low half of the vector. When that is tied,
19073 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19074 // indices for V2. When those are equal, try to ensure that the number of odd
19075 // indices for V1 is lower than the number of odd indices for V2.
19076 if (NumV1Elements == NumV2Elements) {
19077 int LowV1Elements = 0, LowV2Elements = 0;
19078 for (int M : Mask.slice(0, NumElements / 2))
19079 if (M >= NumElements)
19080 ++LowV2Elements;
19081 else if (M >= 0)
19082 ++LowV1Elements;
19083 if (LowV2Elements > LowV1Elements)
19084 return true;
19085 if (LowV2Elements == LowV1Elements) {
19086 int SumV1Indices = 0, SumV2Indices = 0;
19087 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19088 if (Mask[i] >= NumElements)
19089 SumV2Indices += i;
19090 else if (Mask[i] >= 0)
19091 SumV1Indices += i;
19092 if (SumV2Indices < SumV1Indices)
19093 return true;
19094 if (SumV2Indices == SumV1Indices) {
19095 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19096 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19097 if (Mask[i] >= NumElements)
19098 NumV2OddIndices += i % 2;
19099 else if (Mask[i] >= 0)
19100 NumV1OddIndices += i % 2;
19101 if (NumV2OddIndices < NumV1OddIndices)
19102 return true;
19103 }
19104 }
19105 }
19106
19107 return false;
19108}
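
canonicalizeShuffleMaskWithCommute is a pure function of the mask, so its decision chain can be restated compactly. A condensed model (shouldCommute is a hypothetical stand-in; it skips the all-undef assertion the real helper makes):

  #include <cstdio>
  #include <vector>

  static bool shouldCommute(const std::vector<int> &Mask) {
    int N = (int)Mask.size();
    int V1 = 0, V2 = 0;
    for (int M : Mask)
      if (M >= N) ++V2;
      else if (M >= 0) ++V1;
    if (V2 != V1)
      return V2 > V1;                 // Prefer more elements from V1.
    int LowV1 = 0, LowV2 = 0;
    for (int i = 0; i != N / 2; ++i)  // Tie-break on the low half...
      if (Mask[i] >= N) ++LowV2;
      else if (Mask[i] >= 0) ++LowV1;
    if (LowV2 != LowV1)
      return LowV2 > LowV1;
    int Sum1 = 0, Sum2 = 0;           // ...then on the index sums...
    for (int i = 0; i != N; ++i)
      if (Mask[i] >= N) Sum2 += i;
      else if (Mask[i] >= 0) Sum1 += i;
    if (Sum2 != Sum1)
      return Sum2 < Sum1;
    int Odd1 = 0, Odd2 = 0;           // ...then on the odd-position counts.
    for (int i = 0; i != N; ++i)
      if (Mask[i] >= N) Odd2 += i % 2;
      else if (Mask[i] >= 0) Odd1 += i % 2;
    return Odd2 < Odd1;
  }

  int main() {
    // <4,5,0,1> pulls its low half from V2, so commuting is preferred.
    std::printf("%d\n", (int)shouldCommute({4, 5, 0, 1}));
  }

For the v4 mask <4,5,0,1> the element counts tie but the low half comes entirely from V2, so the model, like the helper above, asks for a commute.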
19109
19110// Forward declaration.
19111static SDValue canonicalizeShuffleMaskWithHorizOp(
19112 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19113 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19114 const X86Subtarget &Subtarget);
19115
19116 /// Top-level lowering for x86 vector shuffles.
19117///
19118/// This handles decomposition, canonicalization, and lowering of all x86
19119/// vector shuffles. Most of the specific lowering strategies are encapsulated
19120/// above in helper routines. The canonicalization attempts to widen shuffles
19121/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19122/// s.t. only one of the two inputs needs to be tested, etc.
19123static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19124 SelectionDAG &DAG) {
19125 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19126 ArrayRef<int> OrigMask = SVOp->getMask();
19127 SDValue V1 = Op.getOperand(0);
19128 SDValue V2 = Op.getOperand(1);
19129 MVT VT = Op.getSimpleValueType();
19130 int NumElements = VT.getVectorNumElements();
19131 SDLoc DL(Op);
19132 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19133
19134 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19135 "Can't lower MMX shuffles");
19136
19137 bool V1IsUndef = V1.isUndef();
19138 bool V2IsUndef = V2.isUndef();
19139 if (V1IsUndef && V2IsUndef)
19140 return DAG.getUNDEF(VT);
19141
19142 // When we create a shuffle node, we put the UNDEF node as the second operand,
19143 // but in some cases the first operand may be transformed to UNDEF.
19144 // In this case we should just commute the node.
19145 if (V1IsUndef)
19146 return DAG.getCommutedVectorShuffle(*SVOp);
19147
19148 // Check for non-undef masks pointing at an undef vector and make the masks
19149 // undef as well. This makes it easier to match the shuffle based solely on
19150 // the mask.
19151 if (V2IsUndef &&
19152 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
19153 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
19154 for (int &M : NewMask)
19155 if (M >= NumElements)
19156 M = -1;
19157 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
19158 }
19159
19160 // Check for illegal shuffle mask element index values.
19161 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
19162 (void)MaskUpperLimit;
19163 assert(llvm::all_of(OrigMask,
19164 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
19165 "Out of bounds shuffle index");
19166
19167 // We actually see shuffles that are entirely re-arrangements of a set of
19168 // zero inputs. This mostly happens while decomposing complex shuffles into
19169 // simple ones. Directly lower these as a buildvector of zeros.
19170 APInt KnownUndef, KnownZero;
19171 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
19172
19173 APInt Zeroable = KnownUndef | KnownZero;
19174 if (Zeroable.isAllOnes())
19175 return getZeroVector(VT, Subtarget, DAG, DL);
19176
19177 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
19178
19179 // Try to collapse shuffles into using a vector type with fewer elements but
19180 // wider element types. We cap this to not form integers or floating point
19181 // elements wider than 64 bits. It does not seem beneficial to form i128
19182 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
19183 SmallVector<int, 16> WidenedMask;
19184 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
19185 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
19186 // Shuffle mask widening should not interfere with a broadcast opportunity
19187 // by obfuscating the operands with bitcasts.
19188 // TODO: Avoid lowering directly from this top-level function: make this
19189 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
19190 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
19191 Subtarget, DAG))
19192 return Broadcast;
19193
19194 MVT NewEltVT = VT.isFloatingPoint()
19195 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
19196 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
19197 int NewNumElts = NumElements / 2;
19198 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
19199 // Make sure that the new vector type is legal. For example, v2f64 isn't
19200 // legal on SSE1.
19201 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
19202 if (V2IsZero) {
19203 // Modify the new Mask to take all zeros from the all-zero vector.
19204 // Choose indices that are blend-friendly.
19205 bool UsedZeroVector = false;
19206 assert(is_contained(WidenedMask, SM_SentinelZero) &&
19207 "V2's non-undef elements are used?!");
19208 for (int i = 0; i != NewNumElts; ++i)
19209 if (WidenedMask[i] == SM_SentinelZero) {
19210 WidenedMask[i] = i + NewNumElts;
19211 UsedZeroVector = true;
19212 }
19213 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
19214 // some elements to be undef.
19215 if (UsedZeroVector)
19216 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
19217 }
19218 V1 = DAG.getBitcast(NewVT, V1);
19219 V2 = DAG.getBitcast(NewVT, V2);
19220 return DAG.getBitcast(
19221 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
19222 }
19223 }
19224
19225 SmallVector<SDValue> Ops = {V1, V2};
19226 SmallVector<int> Mask(OrigMask.begin(), OrigMask.end());
19227
19228 // Canonicalize the shuffle with any horizontal ops inputs.
19229 // NOTE: This may update Ops and Mask.
19230 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
19231 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19232 return DAG.getBitcast(VT, HOp);
19233
19234 V1 = DAG.getBitcast(VT, Ops[0]);
19235 V2 = DAG.getBitcast(VT, Ops[1]);
19236 assert(NumElements == (int)Mask.size() &&
19237 "canonicalizeShuffleMaskWithHorizOp "
19238 "shouldn't alter the shuffle mask size");
19239
19240 // Commute the shuffle if it will improve canonicalization.
19241 if (canonicalizeShuffleMaskWithCommute(Mask)) {
19242 ShuffleVectorSDNode::commuteMask(Mask);
19243 std::swap(V1, V2);
19244 }
19245
19246 // For each vector width, delegate to a specialized lowering routine.
19247 if (VT.is128BitVector())
19248 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19249
19250 if (VT.is256BitVector())
19251 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19252
19253 if (VT.is512BitVector())
19254 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19255
19256 if (Is1BitVector)
19257 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19258
19259 llvm_unreachable("Unimplemented!");
19260}
19261
19262/// Try to lower a VSELECT instruction to a vector shuffle.
19263static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19264 const X86Subtarget &Subtarget,
19265 SelectionDAG &DAG) {
19266 SDValue Cond = Op.getOperand(0);
19267 SDValue LHS = Op.getOperand(1);
19268 SDValue RHS = Op.getOperand(2);
19269 MVT VT = Op.getSimpleValueType();
19270
19271 // Only non-legal VSELECTs reach this lowering, convert those into generic
19272 // shuffles and re-use the shuffle lowering path for blends.
19273 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19274 SmallVector<int, 32> Mask;
19275 if (createShuffleMaskFromVSELECT(Mask, Cond))
19276 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19277 }
19278
19279 return SDValue();
19280}
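
For the constant-condition path above, the blend mask is conceptually simple: a true lane keeps its LHS index i, a false lane takes i + NumElts from RHS. A hedged sketch of that mapping (maskFromCondition is illustrative and is not createShuffleMaskFromVSELECT, which also has to cope with undef and non-boolean constants):

  #include <cstdio>
  #include <vector>

  static std::vector<int> maskFromCondition(const std::vector<bool> &Cond) {
    int NumElts = (int)Cond.size();
    std::vector<int> Mask(NumElts);
    for (int i = 0; i != NumElts; ++i)
      Mask[i] = Cond[i] ? i : i + NumElts;   // LHS lane vs. RHS lane.
    return Mask;
  }

  int main() {
    for (int M : maskFromCondition({true, false, false, true}))
      std::printf("%d ", M);                 // Prints: 0 5 6 3
    std::printf("\n");
  }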
19281
19282SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19283 SDValue Cond = Op.getOperand(0);
19284 SDValue LHS = Op.getOperand(1);
19285 SDValue RHS = Op.getOperand(2);
19286
19287 // A vselect where all conditions and data are constants can be optimized into
19288 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19289 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19290 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19291 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19292 return SDValue();
19293
19294 // Try to lower this to a blend-style vector shuffle. This can handle all
19295 // constant condition cases.
19296 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19297 return BlendOp;
19298
19299 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19300 // with patterns on the mask registers on AVX-512.
19301 MVT CondVT = Cond.getSimpleValueType();
19302 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19303 if (CondEltSize == 1)
19304 return Op;
19305
19306 // Variable blends are only legal from SSE4.1 onward.
19307 if (!Subtarget.hasSSE41())
19308 return SDValue();
19309
19310 SDLoc dl(Op);
19311 MVT VT = Op.getSimpleValueType();
19312 unsigned EltSize = VT.getScalarSizeInBits();
19313 unsigned NumElts = VT.getVectorNumElements();
19314
19315 // Expand v32i16/v64i8 without BWI.
19316 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19317 return SDValue();
19318
19319 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19320 // into an i1 condition so that we can use the mask-based 512-bit blend
19321 // instructions.
19322 if (VT.getSizeInBits() == 512) {
19323 // Build a mask by testing the condition against zero.
19324 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19325 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19326 DAG.getConstant(0, dl, CondVT),
19327 ISD::SETNE);
19328 // Now return a new VSELECT using the mask.
19329 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19330 }
19331
19332 // SEXT/TRUNC cases where the mask doesn't match the destination size.
19333 if (CondEltSize != EltSize) {
19334 // If we don't have a sign splat, rely on the expansion.
19335 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19336 return SDValue();
19337
19338 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19339 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19340 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19341 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19342 }
19343
19344 // Only some types will be legal on some subtargets. If we can emit a legal
19345 // VSELECT-matching blend, return Op, but if we need to expand, return
19346 // a null value.
19347 switch (VT.SimpleTy) {
19348 default:
19349 // Most of the vector types have blends past SSE4.1.
19350 return Op;
19351
19352 case MVT::v32i8:
19353 // The byte blends for AVX vectors were introduced only in AVX2.
19354 if (Subtarget.hasAVX2())
19355 return Op;
19356
19357 return SDValue();
19358
19359 case MVT::v8i16:
19360 case MVT::v16i16: {
19361 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
19362 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19363 Cond = DAG.getBitcast(CastVT, Cond);
19364 LHS = DAG.getBitcast(CastVT, LHS);
19365 RHS = DAG.getBitcast(CastVT, RHS);
19366 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19367 return DAG.getBitcast(VT, Select);
19368 }
19369 }
19370}
19371
19372static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
19373 MVT VT = Op.getSimpleValueType();
19374 SDValue Vec = Op.getOperand(0);
19375 SDValue Idx = Op.getOperand(1);
19376 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
19377 SDLoc dl(Op);
19378
19379 if (!Vec.getSimpleValueType().is128BitVector())
19380 return SDValue();
19381
19382 if (VT.getSizeInBits() == 8) {
19383 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
19384 // we're going to zero extend the register or fold the store.
19385 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
19386 !X86::mayFoldIntoStore(Op))
19387 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19388 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19389 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19390
19391 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
19392 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19393 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19394 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19395 }
19396
19397 if (VT == MVT::f32) {
19398 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19399 // the result back to FR32 register. It's only worth matching if the
19400 // result has a single use which is a store or a bitcast to i32. And in
19401 // the case of a store, it's not worth it if the index is a constant 0,
19402 // because a MOVSSmr can be used instead, which is smaller and faster.
19403 if (!Op.hasOneUse())
19404 return SDValue();
19405 SDNode *User = *Op.getNode()->use_begin();
19406 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19407 (User->getOpcode() != ISD::BITCAST ||
19408 User->getValueType(0) != MVT::i32))
19409 return SDValue();
19410 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19411 DAG.getBitcast(MVT::v4i32, Vec), Idx);
19412 return DAG.getBitcast(MVT::f32, Extract);
19413 }
19414
19415 if (VT == MVT::i32 || VT == MVT::i64)
19416 return Op;
19417
19418 return SDValue();
19419}
19420
19421/// Extract one bit from mask vector, like v16i1 or v8i1.
19422/// AVX-512 feature.
19423static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
19424 const X86Subtarget &Subtarget) {
19425 SDValue Vec = Op.getOperand(0);
19426 SDLoc dl(Vec);
19427 MVT VecVT = Vec.getSimpleValueType();
19428 SDValue Idx = Op.getOperand(1);
19429 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19430 MVT EltVT = Op.getSimpleValueType();
19431
19432 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19433 "Unexpected vector type in ExtractBitFromMaskVector");
19434
19435 // A variable index can't be handled in mask registers,
19436 // so extend the vector to VR512/128.
19437 if (!IdxC) {
19438 unsigned NumElts = VecVT.getVectorNumElements();
19439 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
19440 // than extending to 128/256-bit.
19441 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19442 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19443 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19444 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19445 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19446 }
19447
19448 unsigned IdxVal = IdxC->getZExtValue();
19449 if (IdxVal == 0) // the operation is legal
19450 return Op;
19451
19452 // Extend to natively supported kshift.
19453 unsigned NumElems = VecVT.getVectorNumElements();
19454 MVT WideVecVT = VecVT;
19455 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19456 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19457 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19458 DAG.getUNDEF(WideVecVT), Vec,
19459 DAG.getIntPtrConstant(0, dl));
19460 }
19461
19462 // Use kshiftr instruction to move to the lower element.
19463 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19464 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19465
19466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19467 DAG.getIntPtrConstant(0, dl));
19468}
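
At the bit level, the KSHIFTR sequence in ExtractBitFromMaskVector amounts to shifting the k-register right by the index and reading bit 0. A scalar model, assuming a v16i1 mask held in the low bits of a uint16_t (purely illustrative):

  #include <cstdint>
  #include <cstdio>

  static unsigned extractMaskBit(uint16_t KReg, unsigned Idx) {
    return (KReg >> Idx) & 1u;        // kshiftr + extract element 0.
  }

  int main() {
    uint16_t K = 0b0000000000100100;  // Bits 2 and 5 set.
    std::printf("%u %u %u\n", extractMaskBit(K, 2), extractMaskBit(K, 3),
                extractMaskBit(K, 5));  // Prints: 1 0 1
  }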
19469
19470SDValue
19471X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19472 SelectionDAG &DAG) const {
19473 SDLoc dl(Op);
19474 SDValue Vec = Op.getOperand(0);
19475 MVT VecVT = Vec.getSimpleValueType();
19476 SDValue Idx = Op.getOperand(1);
19477 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19478
19479 if (VecVT.getVectorElementType() == MVT::i1)
19480 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19481
19482 if (!IdxC) {
19483 // It's more profitable to go through memory (1 cycle throughput)
19484 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
19485 // The IACA tool was used to get the performance estimate
19486 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19487 //
19488 // example : extractelement <16 x i8> %a, i32 %i
19489 //
19490 // Block Throughput: 3.00 Cycles
19491 // Throughput Bottleneck: Port5
19492 //
19493 // | Num Of | Ports pressure in cycles | |
19494 // | Uops | 0 - DV | 5 | 6 | 7 | |
19495 // ---------------------------------------------
19496 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19497 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19498 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19499 // Total Num Of Uops: 4
19500 //
19501 //
19502 // Block Throughput: 1.00 Cycles
19503 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19504 //
19505 // | | Ports pressure in cycles | |
19506 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19507 // ---------------------------------------------------------
19508 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19509 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19510 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19511 // Total Num Of Uops: 4
19512
19513 return SDValue();
19514 }
19515
19516 unsigned IdxVal = IdxC->getZExtValue();
19517
19518 // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
19519 // vector and then extract the element from that 128-bit vector.
19520 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19521 // Get the 128-bit vector.
19522 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19523 MVT EltVT = VecVT.getVectorElementType();
19524
19525 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19526 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19527
19528 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19529 // this can be done with a mask.
19530 IdxVal &= ElemsPerChunk - 1;
19531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19532 DAG.getIntPtrConstant(IdxVal, dl));
19533 }
19534
19535 assert(VecVT.is128BitVector() && "Unexpected vector length");
19536
19537 MVT VT = Op.getSimpleValueType();
19538
19539 if (VT == MVT::i16) {
19540 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19541 // we're going to zero extend the register or fold the store (SSE41 only).
19542 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19543 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19544 if (Subtarget.hasFP16())
19545 return Op;
19546
19547 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19548 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19549 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19550 }
19551
19552 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19553 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19554 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19555 }
19556
19557 if (Subtarget.hasSSE41())
19558 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19559 return Res;
19560
19561 // TODO: We only extract a single element from v16i8, so we can probably afford
19562 // to be more aggressive here before using the default approach of spilling to
19563 // stack.
19564 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
19565 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19566 int DWordIdx = IdxVal / 4;
19567 if (DWordIdx == 0) {
19568 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19569 DAG.getBitcast(MVT::v4i32, Vec),
19570 DAG.getIntPtrConstant(DWordIdx, dl));
19571 int ShiftVal = (IdxVal % 4) * 8;
19572 if (ShiftVal != 0)
19573 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19574 DAG.getConstant(ShiftVal, dl, MVT::i8));
19575 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19576 }
19577
19578 int WordIdx = IdxVal / 2;
19579 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19580 DAG.getBitcast(MVT::v8i16, Vec),
19581 DAG.getIntPtrConstant(WordIdx, dl));
19582 int ShiftVal = (IdxVal % 2) * 8;
19583 if (ShiftVal != 0)
19584 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19585 DAG.getConstant(ShiftVal, dl, MVT::i8));
19586 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19587 }
19588
19589 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19590 if (IdxVal == 0)
19591 return Op;
19592
19593 // Shuffle the element to the lowest element, then movss or movsh.
19594 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19595 Mask[0] = static_cast<int>(IdxVal);
19596 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19597 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19598 DAG.getIntPtrConstant(0, dl));
19599 }
19600
19601 if (VT.getSizeInBits() == 64) {
19602 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19603 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19604 // to match extract_elt for f64.
19605 if (IdxVal == 0)
19606 return Op;
19607
19608 // UNPCKHPD the element to the lowest double word, then movsd.
19609 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19610 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19611 int Mask[2] = { 1, -1 };
19612 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19613 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19614 DAG.getIntPtrConstant(0, dl));
19615 }
19616
19617 return SDValue();
19618}
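
The sub-byte path in LowerEXTRACT_VECTOR_ELT relies on simple index arithmetic: the containing 32-bit (or 16-bit) lane is IdxVal / 4 (or / 2) and the byte's offset inside it is (IdxVal % 4) * 8. A scalar model of the dword case (the DAG code switches to a word extract for higher lanes, but the arithmetic is the same idea; the v16i8 vector is modelled here as four uint32_t lanes):

  #include <cstdint>
  #include <cstdio>

  static uint8_t extractByte(const uint32_t Lanes[4], unsigned IdxVal) {
    unsigned DWordIdx = IdxVal / 4;            // Which 32-bit lane holds the byte.
    unsigned ShiftVal = (IdxVal % 4) * 8;      // Bit offset of the byte within it.
    return (uint8_t)(Lanes[DWordIdx] >> ShiftVal);
  }

  int main() {
    // Little-endian lanes holding bytes 0x00..0x0f.
    uint32_t Lanes[4] = {0x03020100u, 0x07060504u, 0x0b0a0908u, 0x0f0e0d0cu};
    std::printf("0x%02x\n", extractByte(Lanes, 6));   // Prints 0x06.
  }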
19619
19620/// Insert one bit to mask vector, like v16i1 or v8i1.
19621/// AVX-512 feature.
19622static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
19623 const X86Subtarget &Subtarget) {
19624 SDLoc dl(Op);
19625 SDValue Vec = Op.getOperand(0);
19626 SDValue Elt = Op.getOperand(1);
19627 SDValue Idx = Op.getOperand(2);
19628 MVT VecVT = Vec.getSimpleValueType();
19629
19630 if (!isa<ConstantSDNode>(Idx)) {
19631 // Non-constant index. Extend source and destination,
19632 // insert the element, and then truncate the result.
19633 unsigned NumElts = VecVT.getVectorNumElements();
19634 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19635 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19636 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19637 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19638 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19639 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19640 }
19641
19642 // Copy into a k-register, extract to v1i1 and insert_subvector.
19643 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19644 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19645}
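
For a constant index, inserting one element of a vXi1 vector is, in k-register terms, just replacing a single bit. A scalar model of that effect, assuming a v16i1 mask in a uint16_t (the lowering itself goes through SCALAR_TO_VECTOR and INSERT_SUBVECTOR rather than explicit bit twiddling):

  #include <cstdint>
  #include <cstdio>

  static uint16_t insertMaskBit(uint16_t KReg, unsigned Idx, bool Bit) {
    uint16_t Cleared = KReg & (uint16_t)~(1u << Idx);    // Clear the target bit.
    return Cleared | (uint16_t)((Bit ? 1u : 0u) << Idx); // Then set it if needed.
  }

  int main() {
    uint16_t K = 0b0000000000000101;
    std::printf("0x%04x\n", insertMaskBit(K, 3, true));  // Prints 0x000d.
  }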
19646
19647SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19648 SelectionDAG &DAG) const {
19649 MVT VT = Op.getSimpleValueType();
19650 MVT EltVT = VT.getVectorElementType();
19651 unsigned NumElts = VT.getVectorNumElements();
19652 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19653
19654 if (EltVT == MVT::i1)
19655 return InsertBitToMaskVector(Op, DAG, Subtarget);
19656
19657 SDLoc dl(Op);
19658 SDValue N0 = Op.getOperand(0);
19659 SDValue N1 = Op.getOperand(1);
19660 SDValue N2 = Op.getOperand(2);
19661 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19662
19663 if (!N2C) {
19664 // For variable insertion indices we're usually better off spilling to stack,
19665 // but AVX512 can use a variable compare+select by comparing against all
19666 // possible vector indices, and FP insertion has less gpr->simd traffic.
19667 if (!(Subtarget.hasBWI() ||
19668 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19669 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
19670 return SDValue();
19671
19672 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19673 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19674 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19675 return SDValue();
19676
19677 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19678 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19679 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19680
19681 SmallVector<SDValue, 16> RawIndices;
19682 for (unsigned I = 0; I != NumElts; ++I)
19683 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19684 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19685
19686 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19687 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19688 ISD::CondCode::SETEQ);
19689 }
19690
19691 if (N2C->getAPIntValue().uge(NumElts))
19692 return SDValue();
19693 uint64_t IdxVal = N2C->getZExtValue();
19694
19695 bool IsZeroElt = X86::isZeroNode(N1);
19696 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19697
19698 if (IsZeroElt || IsAllOnesElt) {
19699 // Lower insertion of i8 -1 as an 'OR' blend.
19700 // We don't deal with i8 0 since it appears to be handled elsewhere.
19701 if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
19702 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
19703 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19704 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19705 CstVectorElts[IdxVal] = OnesCst;
19706 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19707 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19708 }
19709 // See if we can do this more efficiently with a blend shuffle with a
19710 // rematerializable vector.
19711 if (Subtarget.hasSSE41() &&
19712 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19713 SmallVector<int, 8> BlendMask;
19714 for (unsigned i = 0; i != NumElts; ++i)
19715 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19716 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19717 : getOnesVector(VT, DAG, dl);
19718 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19719 }
19720 }
19721
19722 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19723 // into that, and then insert the subvector back into the result.
19724 if (VT.is256BitVector() || VT.is512BitVector()) {
19725 // With a 256-bit vector, we can insert into the zero element efficiently
19726 // using a blend if we have AVX or AVX2 and the right data type.
19727 if (VT.is256BitVector() && IdxVal == 0) {
19728 // TODO: It is worthwhile to cast integer to floating point and back
19729 // and incur a domain crossing penalty if that's what we'll end up
19730 // doing anyway after extracting to a 128-bit vector.
19731 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19732 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19733 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19734 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19735 DAG.getTargetConstant(1, dl, MVT::i8));
19736 }
19737 }
19738
19739 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19740 assert(isPowerOf2_32(NumEltsIn128) &&
19741 "Vectors will always have power-of-two number of elements.");
19742
19743 // If we are not inserting into the low 128-bit vector chunk,
19744 // then prefer the broadcast+blend sequence.
19745 // FIXME: relax the profitability check iff all N1 uses are insertions.
19746 if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
19747 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19748 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19749 X86::mayFoldLoad(N1, Subtarget)))) {
19750 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19751 SmallVector<int, 8> BlendMask;
19752 for (unsigned i = 0; i != NumElts; ++i)
19753 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19754 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19755 }
19756
19757 // Get the desired 128-bit vector chunk.
19758 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19759
19760 // Insert the element into the desired chunk.
19761 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19762 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19763
19764 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19765 DAG.getIntPtrConstant(IdxIn128, dl));
19766
19767 // Insert the changed part back into the bigger vector
19768 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19769 }
19770 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19771
19772 // This will be just movw/movd/movq/movsh/movss/movsd.
19773 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19774 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19775 EltVT == MVT::f16 || EltVT == MVT::i64) {
19776 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19777 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19778 }
19779
19780 // We can't directly insert an i8 or i16 into a vector, so zero extend
19781 // it to i32 first.
19782 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19783 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19784 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19786 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19787 return DAG.getBitcast(VT, N1);
19788 }
19789 }
19790
19791 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19792 // argument. SSE41 required for pinsrb.
19793 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19794 unsigned Opc;
19795 if (VT == MVT::v8i16) {
19796 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19797 Opc = X86ISD::PINSRW;
19798 } else {
19799 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19800 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19801 Opc = X86ISD::PINSRB;
19802 }
19803
19804 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19805 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19806 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19807 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19808 }
19809
19810 if (Subtarget.hasSSE41()) {
19811 if (EltVT == MVT::f32) {
19812 // Bits [7:6] of the constant are the source select. This will always be
19813 // zero here. The DAG Combiner may combine an extract_elt index into
19814 // these bits. For example (insert (extract, 3), 2) could be matched by
19815 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19816 // Bits [5:4] of the constant are the destination select. This is the
19817 // value of the incoming immediate.
19818 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19819 // combine either bitwise AND or insert of float 0.0 to set these bits.
19820
19821 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19822 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19823 // If this is an insertion of 32-bits into the low 32-bits of
19824 // a vector, we prefer to generate a blend with immediate rather
19825 // than an insertps. Blends are simpler operations in hardware and so
19826 // will always have equal or better performance than insertps.
19827 // But if optimizing for size and there's a load folding opportunity,
19828 // generate insertps because blendps does not have a 32-bit memory
19829 // operand form.
19830 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19831 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19832 DAG.getTargetConstant(1, dl, MVT::i8));
19833 }
19834 // Create this as a scalar to vector.
19835 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19836 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19837 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19838 }
19839
19840 // PINSR* works with constant index.
19841 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19842 return Op;
19843 }
19844
19845 return SDValue();
19846}
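
The wide-vector branch of LowerINSERT_VECTOR_ELT reduces to two index computations: which 128-bit chunk holds the element (IdxVal / NumEltsIn128) and where it sits inside that chunk (IdxVal & (NumEltsIn128 - 1)). A tiny worked example with assumed values (a v16i32 insert at index 11):

  #include <cstdio>

  int main() {
    unsigned EltSizeInBits = 32;
    unsigned NumEltsIn128 = 128 / EltSizeInBits;           // 4 elements per chunk.
    unsigned IdxVal = 11;                                   // Insert position in v16i32.
    unsigned Chunk = IdxVal / NumEltsIn128;                 // 128-bit chunk to extract.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);        // Index inside that chunk.
    std::printf("chunk=%u idx=%u\n", Chunk, IdxIn128);      // Prints: chunk=2 idx=3
  }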
19847
19848static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19849 SelectionDAG &DAG) {
19850 SDLoc dl(Op);
19851 MVT OpVT = Op.getSimpleValueType();
19852
19853 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19854 // further combines.
19855 if (X86::isZeroNode(Op.getOperand(0)))
19856 return getZeroVector(OpVT, Subtarget, DAG, dl);
19857
19858 // If this is a 256-bit (or wider) vector result, first insert into a 128-bit
19859 // vector and then insert into the wider vector.
19860 if (!OpVT.is128BitVector()) {
19861 // Insert into a 128-bit vector.
19862 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19863 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19864 OpVT.getVectorNumElements() / SizeFactor);
19865
19866 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19867
19868 // Insert the 128-bit vector.
19869 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19870 }
19871 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19872 "Expected an SSE type!");
19873
19874 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19875 // tblgen.
19876 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19877 return Op;
19878
19879 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19880 return DAG.getBitcast(
19881 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19882}
19883
19884// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19885// simple superregister reference or explicit instructions to insert
19886// the upper bits of a vector.
19887static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19888 SelectionDAG &DAG) {
19889 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19890
19891 return insert1BitVector(Op, DAG, Subtarget);
19892}
19893
19894static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19895 SelectionDAG &DAG) {
19896 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19897 "Only vXi1 extract_subvectors need custom lowering");
19898
19899 SDLoc dl(Op);
19900 SDValue Vec = Op.getOperand(0);
19901 uint64_t IdxVal = Op.getConstantOperandVal(1);
19902
19903 if (IdxVal == 0) // the operation is legal
19904 return Op;
19905
19906 MVT VecVT = Vec.getSimpleValueType();
19907 unsigned NumElems = VecVT.getVectorNumElements();
19908
19909 // Extend to natively supported kshift.
19910 MVT WideVecVT = VecVT;
19911 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19912 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19913 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19914 DAG.getUNDEF(WideVecVT), Vec,
19915 DAG.getIntPtrConstant(0, dl));
19916 }
19917
19918 // Shift to the LSB.
19919 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19920 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19921
19922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19923 DAG.getIntPtrConstant(0, dl));
19924}
19925
19926// Returns the appropriate wrapper opcode for a global reference.
19927unsigned X86TargetLowering::getGlobalWrapperKind(
19928 const GlobalValue *GV, const unsigned char OpFlags) const {
19929 // References to absolute symbols are never PC-relative.
19930 if (GV && GV->isAbsoluteSymbolRef())
19931 return X86ISD::Wrapper;
19932
19933 CodeModel::Model M = getTargetMachine().getCodeModel();
19934 if (Subtarget.isPICStyleRIPRel() &&
19935 (M == CodeModel::Small || M == CodeModel::Kernel))
19936 return X86ISD::WrapperRIP;
19937
19938 // GOTPCREL references must always use RIP.
19939 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19940 return X86ISD::WrapperRIP;
19941
19942 return X86ISD::Wrapper;
19943}
19944
19945 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19946 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19947 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19948 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19949 // be used to form an addressing mode. These wrapped nodes will be selected
19950 // into MOV32ri.
19951SDValue
19952X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19953 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19954
19955 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19956 // global base reg.
19957 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19958
19959 auto PtrVT = getPointerTy(DAG.getDataLayout());
19960 SDValue Result = DAG.getTargetConstantPool(
19961 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19962 SDLoc DL(CP);
19963 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19964 // With PIC, the address is actually $g + Offset.
19965 if (OpFlag) {
19966 Result =
19967 DAG.getNode(ISD::ADD, DL, PtrVT,
19968 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19969 }
19970
19971 return Result;
19972}
19973
19974SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19975 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19976
19977 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19978 // global base reg.
19979 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19980
19981 auto PtrVT = getPointerTy(DAG.getDataLayout());
19982 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19983 SDLoc DL(JT);
19984 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19985
19986 // With PIC, the address is actually $g + Offset.
19987 if (OpFlag)
19988 Result =
19989 DAG.getNode(ISD::ADD, DL, PtrVT,
19990 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19991
19992 return Result;
19993}
19994
19995SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19996 SelectionDAG &DAG) const {
19997 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19998}
19999
20000SDValue
20001X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20002 // Create the TargetBlockAddress node.
20003 unsigned char OpFlags =
20004 Subtarget.classifyBlockAddressReference();
20005 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20006 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20007 SDLoc dl(Op);
20008 auto PtrVT = getPointerTy(DAG.getDataLayout());
20009 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20010 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20011
20012 // With PIC, the address is actually $g + Offset.
20013 if (isGlobalRelativeToPICBase(OpFlags)) {
20014 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20015 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20016 }
20017
20018 return Result;
20019}
20020
20021/// Creates target global address or external symbol nodes for calls or
20022/// other uses.
20023SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20024 bool ForCall) const {
20025 // Unpack the global address or external symbol.
20026 const SDLoc &dl = SDLoc(Op);
20027 const GlobalValue *GV = nullptr;
20028 int64_t Offset = 0;
20029 const char *ExternalSym = nullptr;
20030 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20031 GV = G->getGlobal();
20032 Offset = G->getOffset();
20033 } else {
20034 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20035 ExternalSym = ES->getSymbol();
20036 }
20037
20038 // Calculate some flags for address lowering.
20039 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20040 unsigned char OpFlags;
20041 if (ForCall)
20042 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20043 else
20044 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20045 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20046 bool NeedsLoad = isGlobalStubReference(OpFlags);
20047
20048 CodeModel::Model M = DAG.getTarget().getCodeModel();
20049 auto PtrVT = getPointerTy(DAG.getDataLayout());
20050 SDValue Result;
20051
20052 if (GV) {
20053 // Create a target global address if this is a global. If possible, fold the
20054 // offset into the global address reference. Otherwise, ADD it on later.
20055 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20056 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20057 // relocation will compute to a negative value, which is invalid.
20058 int64_t GlobalOffset = 0;
20059 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20060 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20061 std::swap(GlobalOffset, Offset);
20062 }
20063 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20064 } else {
20065 // If this is not a global address, this must be an external symbol.
20066 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20067 }
20068
20069 // If this is a direct call, avoid the wrapper if we don't need to do any
20070 // loads or adds. This allows SDAG ISel to match direct calls.
20071 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20072 return Result;
20073
20074 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20075
20076 // With PIC, the address is actually $g + Offset.
20077 if (HasPICReg) {
20078 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20079 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20080 }
20081
20082 // For globals that require a load from a stub to get the address, emit the
20083 // load.
20084 if (NeedsLoad)
20085 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20086 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20087
20088 // If there was a non-zero offset that we didn't fold, create an explicit
20089 // addition for it.
20090 if (Offset != 0)
20091 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20092 DAG.getConstant(Offset, dl, PtrVT));
20093
20094 return Result;
20095}
20096
20097SDValue
20098X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20099 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20100}
20101
20102static SDValue
20103GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20104 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
20105 unsigned char OperandFlags, bool LocalDynamic = false) {
20106 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20107 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20108 SDLoc dl(GA);
20109 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20110 GA->getValueType(0),
20111 GA->getOffset(),
20112 OperandFlags);
20113
20114 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20115 : X86ISD::TLSADDR;
20116
20117 if (InFlag) {
20118 SDValue Ops[] = { Chain, TGA, *InFlag };
20119 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20120 } else {
20121 SDValue Ops[] = { Chain, TGA };
20122 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20123 }
20124
20125 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
20126 MFI.setAdjustsStack(true);
20127 MFI.setHasCalls(true);
20128
20129 SDValue Flag = Chain.getValue(1);
20130 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
20131}
20132
20133// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20134static SDValue
20135LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20136 const EVT PtrVT) {
20137 SDValue InFlag;
20138 SDLoc dl(GA); // ? function entry point might be better
20139 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20140 DAG.getNode(X86ISD::GlobalBaseReg,
20141 SDLoc(), PtrVT), InFlag);
20142 InFlag = Chain.getValue(1);
20143
20144 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
20145}
20146
20147// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
20148static SDValue
20149LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20150 const EVT PtrVT) {
20151 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20152 X86::RAX, X86II::MO_TLSGD);
20153}
20154
20155// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20156static SDValue
20157LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20158 const EVT PtrVT) {
20159 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20160 X86::EAX, X86II::MO_TLSGD);
20161}
20162
20163static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
20164 SelectionDAG &DAG, const EVT PtrVT,
20165 bool Is64Bit, bool Is64BitLP64) {
20166 SDLoc dl(GA);
20167
20168 // Get the start address of the TLS block for this module.
20169 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
20170 .getInfo<X86MachineFunctionInfo>();
20171 MFI->incNumLocalDynamicTLSAccesses();
20172
20173 SDValue Base;
20174 if (Is64Bit) {
20175 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20176 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
20177 X86II::MO_TLSLD, /*LocalDynamic=*/true);
20178 } else {
20179 SDValue InFlag;
20180 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20181 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
20182 InFlag = Chain.getValue(1);
20183 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
20184 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
20185 }
20186
20187 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20188 // of Base.
20189
20190 // Build x@dtpoff.
20191 unsigned char OperandFlags = X86II::MO_DTPOFF;
20192 unsigned WrapperKind = X86ISD::Wrapper;
20193 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20194 GA->getValueType(0),
20195 GA->getOffset(), OperandFlags);
20196 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20197
20198 // Add x@dtpoff with the base.
20199 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20200}
20201
20202// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20203static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20204 const EVT PtrVT, TLSModel::Model model,
20205 bool is64Bit, bool isPIC) {
20206 SDLoc dl(GA);
20207
20208 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20209 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
20210 is64Bit ? 257 : 256));
20211
20212 SDValue ThreadPointer =
20213 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20214 MachinePointerInfo(Ptr));
20215
20216 unsigned char OperandFlags = 0;
20217 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20218 // initialexec.
20219 unsigned WrapperKind = X86ISD::Wrapper;
20220 if (model == TLSModel::LocalExec) {
20221 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20222 } else if (model == TLSModel::InitialExec) {
20223 if (is64Bit) {
20224 OperandFlags = X86II::MO_GOTTPOFF;
20225 WrapperKind = X86ISD::WrapperRIP;
20226 } else {
20227 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20228 }
20229 } else {
20230 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20230)
;
20231 }
20232
20233 // emit "addl x@ntpoff,%eax" (local exec)
20234 // or "addl x@indntpoff,%eax" (initial exec)
20235 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
20236 SDValue TGA =
20237 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20238 GA->getOffset(), OperandFlags);
20239 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20240
20241 if (model == TLSModel::InitialExec) {
20242 if (isPIC && !is64Bit) {
20243 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20244 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20245 Offset);
20246 }
20247
20248 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20249 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20250 }
20251
20252 // The address of the thread local variable is the add of the thread
20253 // pointer with the offset of the variable.
20254 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20255}
20256
20257SDValue
20258X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20259
20260 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20261
20262 if (DAG.getTarget().useEmulatedTLS())
20263 return LowerToTLSEmulatedModel(GA, DAG);
20264
20265 const GlobalValue *GV = GA->getGlobal();
20266 auto PtrVT = getPointerTy(DAG.getDataLayout());
20267 bool PositionIndependent = isPositionIndependent();
20268
20269 if (Subtarget.isTargetELF()) {
20270 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20271 switch (model) {
20272 case TLSModel::GeneralDynamic:
20273 if (Subtarget.is64Bit()) {
20274 if (Subtarget.isTarget64BitLP64())
20275 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20276 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20277 }
20278 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20279 case TLSModel::LocalDynamic:
20280 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20281 Subtarget.isTarget64BitLP64());
20282 case TLSModel::InitialExec:
20283 case TLSModel::LocalExec:
20284 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20285 PositionIndependent);
20286 }
20287 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20287)
;
20288 }
20289
20290 if (Subtarget.isTargetDarwin()) {
20291 // Darwin only has one model of TLS. Lower to that.
20292 unsigned char OpFlag = 0;
20293 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
20294 X86ISD::WrapperRIP : X86ISD::Wrapper;
20295
20296 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20297 // global base reg.
20298 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20299 if (PIC32)
20300 OpFlag = X86II::MO_TLVP_PIC_BASE;
20301 else
20302 OpFlag = X86II::MO_TLVP;
20303 SDLoc DL(Op);
20304 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
20305 GA->getValueType(0),
20306 GA->getOffset(), OpFlag);
20307 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20308
20309 // With PIC32, the address is actually $g + Offset.
20310 if (PIC32)
20311 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20312 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20313 Offset);
20314
20315 // Lowering the machine isd will make sure everything is in the right
20316 // location.
20317 SDValue Chain = DAG.getEntryNode();
20318 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20319 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20320 SDValue Args[] = { Chain, Offset };
20321 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20322 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
20323 DAG.getIntPtrConstant(0, DL, true),
20324 Chain.getValue(1), DL);
20325
20326 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
20327 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20328 MFI.setAdjustsStack(true);
20329
20330 // And our return value (tls address) is in the standard call return value
20331 // location.
20332 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20333 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20334 }
20335
20336 if (Subtarget.isOSWindows()) {
20337 // Just use the implicit TLS architecture
20338 // Need to generate something similar to:
20339 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20340 // ; from TEB
20341 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
20342 // mov rcx, qword [rdx+rcx*8]
20343 // mov eax, .tls$:tlsvar
20344 // [rax+rcx] contains the address
20345 // Windows 64bit: gs:0x58
20346 // Windows 32bit: fs:__tls_array
20347
20348 SDLoc dl(GA);
20349 SDValue Chain = DAG.getEntryNode();
20350
20351 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20352 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20353 // use its literal value of 0x2C.
20354 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
20355 ? Type::getInt8PtrTy(*DAG.getContext(),
20356 256)
20357 : Type::getInt32PtrTy(*DAG.getContext(),
20358 257));
20359
20360 SDValue TlsArray = Subtarget.is64Bit()
20361 ? DAG.getIntPtrConstant(0x58, dl)
20362 : (Subtarget.isTargetWindowsGNU()
20363 ? DAG.getIntPtrConstant(0x2C, dl)
20364 : DAG.getExternalSymbol("_tls_array", PtrVT));
20365
20366 SDValue ThreadPointer =
20367 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20368
20369 SDValue res;
20370 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
20371 res = ThreadPointer;
20372 } else {
20373 // Load the _tls_index variable
20374 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
20375 if (Subtarget.is64Bit())
20376 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20377 MachinePointerInfo(), MVT::i32);
20378 else
20379 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20380
20381 const DataLayout &DL = DAG.getDataLayout();
20382 SDValue Scale =
20383 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20384 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20385
20386 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20387 }
20388
20389 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20390
20391 // Get the offset of start of .tls section
20392 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20393 GA->getValueType(0),
20394 GA->getOffset(), X86II::MO_SECREL);
20395 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20396
20397 // The address of the thread local variable is the add of the thread
20398 // pointer with the offset of the variable.
20399 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20400 }
20401
20402 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 20402)
;
20403}
20404
20405/// Lower SRA_PARTS and friends, which return two i32 values
20406/// and take a 2 x i32 value to shift plus a shift amount.
20407/// TODO: Can this be moved to general expansion code?
20408static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
20409 SDValue Lo, Hi;
20410 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20411 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20412}
20413
20414// Try to use a packed vector operation to handle i64 on 32-bit targets when
20415// AVX512DQ is enabled.
20416static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
20417 const X86Subtarget &Subtarget) {
20418 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20419 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20420 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20421 Op.getOpcode() == ISD::UINT_TO_FP) &&
20422 "Unexpected opcode!");
20423 bool IsStrict = Op->isStrictFPOpcode();
20424 unsigned OpNo = IsStrict ? 1 : 0;
20425 SDValue Src = Op.getOperand(OpNo);
20426 MVT SrcVT = Src.getSimpleValueType();
20427 MVT VT = Op.getSimpleValueType();
20428
20429 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20430 (VT != MVT::f32 && VT != MVT::f64))
20431 return SDValue();
20432
20433 // Pack the i64 into a vector, do the operation and extract.
20434
20435 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
20436 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20437 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20438 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20439
20440 SDLoc dl(Op);
20441 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20442 if (IsStrict) {
20443 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20444 {Op.getOperand(0), InVec});
20445 SDValue Chain = CvtVec.getValue(1);
20446 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20447 DAG.getIntPtrConstant(0, dl));
20448 return DAG.getMergeValues({Value, Chain}, dl);
20449 }
20450
20451 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20452
20453 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20454 DAG.getIntPtrConstant(0, dl));
20455}
20456
20457// Try to use a packed vector operation to handle i64 on 32-bit targets.
20458static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
20459 const X86Subtarget &Subtarget) {
20460 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20461 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20462 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20463 Op.getOpcode() == ISD::UINT_TO_FP) &&
20464 "Unexpected opcode!");
20465 bool IsStrict = Op->isStrictFPOpcode();
20466 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20467 MVT SrcVT = Src.getSimpleValueType();
20468 MVT VT = Op.getSimpleValueType();
20469
20470 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20471 return SDValue();
20472
20473 // Pack the i64 into a vector, do the operation and extract.
20474
20475 assert(Subtarget.hasFP16() && "Expected FP16");
20476
20477 SDLoc dl(Op);
20478 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20479 if (IsStrict) {
20480 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20481 {Op.getOperand(0), InVec});
20482 SDValue Chain = CvtVec.getValue(1);
20483 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20484 DAG.getIntPtrConstant(0, dl));
20485 return DAG.getMergeValues({Value, Chain}, dl);
20486 }
20487
20488 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20489
20490 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20491 DAG.getIntPtrConstant(0, dl));
20492}
20493
20494static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20495 const X86Subtarget &Subtarget) {
20496 switch (Opcode) {
20497 case ISD::SINT_TO_FP:
20498 // TODO: Handle wider types with AVX/AVX512.
20499 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20500 return false;
20501 // CVTDQ2PS or (V)CVTDQ2PD
20502 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20503
20504 case ISD::UINT_TO_FP:
20505 // TODO: Handle wider types and i64 elements.
20506 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20507 return false;
20508 // VCVTUDQ2PS or VCVTUDQ2PD
20509 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20510
20511 default:
20512 return false;
20513 }
20514}
20515
20516/// Given a scalar cast operation that is extracted from a vector, try to
20517/// vectorize the cast op followed by extraction. This will avoid an expensive
20518/// round-trip between XMM and GPR.
20519static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
20520 const X86Subtarget &Subtarget) {
20521 // TODO: This could be enhanced to handle smaller integer types by peeking
20522 // through an extend.
20523 SDValue Extract = Cast.getOperand(0);
20524 MVT DestVT = Cast.getSimpleValueType();
20525 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20526 !isa<ConstantSDNode>(Extract.getOperand(1)))
20527 return SDValue();
20528
20529 // See if we have a 128-bit vector cast op for this type of cast.
20530 SDValue VecOp = Extract.getOperand(0);
20531 MVT FromVT = VecOp.getSimpleValueType();
20532 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20533 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
20534 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20535 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20536 return SDValue();
20537
20538 // If we are extracting from a non-zero element, first shuffle the source
20539 // vector to allow extracting from element zero.
20540 SDLoc DL(Cast);
20541 if (!isNullConstant(Extract.getOperand(1))) {
20542 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20543 Mask[0] = Extract.getConstantOperandVal(1);
20544 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20545 }
20546 // If the source vector is wider than 128-bits, extract the low part. Do not
20547 // create an unnecessarily wide vector cast op.
20548 if (FromVT != Vec128VT)
20549 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20550
20551 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20552 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20553 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20555 DAG.getIntPtrConstant(0, DL));
20556}
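
For orientation, the scalar shape of the pattern this combine targets can be written out directly. The snippet below is a hypothetical illustration, not code from this file (it assumes SSE4.1 for _mm_extract_epi32): converting one extracted lane of an integer vector to FP. Lowered naively, the lane is moved to a GPR and converted there; the combine above instead casts the whole (possibly shuffled) 128-bit vector and extracts element 0 of the result.

// Hypothetical illustration only; assumes SSE4.1 is available.
#include <immintrin.h>

static float laneToFloat(__m128i V) {
  // extractelement + sint_to_fp -- the scalar round-trip the combine avoids.
  return static_cast<float>(_mm_extract_epi32(V, 2));
}
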
20557
20558/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20559/// try to vectorize the cast ops. This will avoid an expensive round-trip
20560/// between XMM and GPR.
20561static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
20562 const X86Subtarget &Subtarget) {
20563 // TODO: Allow FP_TO_UINT.
20564 SDValue CastToInt = CastToFP.getOperand(0);
20565 MVT VT = CastToFP.getSimpleValueType();
20566 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
20567 return SDValue();
20568
20569 MVT IntVT = CastToInt.getSimpleValueType();
20570 SDValue X = CastToInt.getOperand(0);
20571 MVT SrcVT = X.getSimpleValueType();
20572 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20573 return SDValue();
20574
20575 // See if we have 128-bit vector cast instructions for this type of cast.
20576 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20577 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20578 IntVT != MVT::i32)
20579 return SDValue();
20580
20581 unsigned SrcSize = SrcVT.getSizeInBits();
20582 unsigned IntSize = IntVT.getSizeInBits();
20583 unsigned VTSize = VT.getSizeInBits();
20584 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
20585 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
20586 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
20587
20588 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
20589 unsigned ToIntOpcode =
20590 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20591 unsigned ToFPOpcode =
20592 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20593
20594 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20595 //
20596 // We are not defining the high elements (for example, by zeroing them) because
20597 // that could nullify any performance advantage that we hoped to gain from
20598 // this vector op hack. We do not expect any adverse effects (like denorm
20599 // penalties) with cast ops.
20600 SDLoc DL(CastToFP);
20601 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
20602 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20603 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20604 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20605 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20606}
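
Likewise, the ftrunc-like scalar pattern handled here is simply a float/double truncated to i32 and converted straight back. The snippet below is a hypothetical illustration, not code from this file; a naive lowering of it round-trips through a GPR (cvttsd2si then cvtsi2sd), whereas the vector sequence built above stays in XMM registers (roughly cvttpd2dq followed by cvtdq2pd in the f64 case).

// Hypothetical illustration only.
#include <cstdint>

static double truncTowardZero(double X) {
  // fp_to_sint followed by sint_to_fp on a scalar value
  // (assumes X is within i32 range).
  return static_cast<double>(static_cast<int32_t>(X));
}
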
20607
20608static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
20609 const X86Subtarget &Subtarget) {
20610 SDLoc DL(Op);
20611 bool IsStrict = Op->isStrictFPOpcode();
20612 MVT VT = Op->getSimpleValueType(0);
20613 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20614
20615 if (Subtarget.hasDQI()) {
20616 assert(!Subtarget.hasVLX() && "Unexpected features");
20617
20618 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20619 Src.getSimpleValueType() == MVT::v4i64) &&
20620 "Unsupported custom type");
20621
20622 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20623 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20624 "Unexpected VT!");
20625 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20626
20627 // Need to concat with zero vector for strict fp to avoid spurious
20628 // exceptions.
20629 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20630 : DAG.getUNDEF(MVT::v8i64);
20631 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20632 DAG.getIntPtrConstant(0, DL));
20633 SDValue Res, Chain;
20634 if (IsStrict) {
20635 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20636 {Op->getOperand(0), Src});
20637 Chain = Res.getValue(1);
20638 } else {
20639 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20640 }
20641
20642 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20643 DAG.getIntPtrConstant(0, DL));
20644
20645 if (IsStrict)
20646 return DAG.getMergeValues({Res, Chain}, DL);
20647 return Res;
20648 }
20649
20650 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20651 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20652 if (VT != MVT::v4f32 || IsSigned)
20653 return SDValue();
20654
20655 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20656 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20657 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20658 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20659 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20660 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20661 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20662 SmallVector<SDValue, 4> SignCvts(4);
20663 SmallVector<SDValue, 4> Chains(4);
20664 for (int i = 0; i != 4; ++i) {
20665 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20666 DAG.getIntPtrConstant(i, DL));
20667 if (IsStrict) {
20668 SignCvts[i] =
20669 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20670 {Op.getOperand(0), Elt});
20671 Chains[i] = SignCvts[i].getValue(1);
20672 } else {
20673 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20674 }
20675 }
20676 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20677
20678 SDValue Slow, Chain;
20679 if (IsStrict) {
20680 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20681 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20682 {Chain, SignCvt, SignCvt});
20683 Chain = Slow.getValue(1);
20684 } else {
20685 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20686 }
20687
20688 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20689 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20690
20691 if (IsStrict)
20692 return DAG.getMergeValues({Cvt, Chain}, DL);
20693
20694 return Cvt;
20695}
20696
20697SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20698 SelectionDAG &DAG) const {
20699 bool IsStrict = Op->isStrictFPOpcode();
20700 unsigned OpNo = IsStrict ? 1 : 0;
20701 SDValue Src = Op.getOperand(OpNo);
20702 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20703 MVT SrcVT = Src.getSimpleValueType();
20704 MVT VT = Op.getSimpleValueType();
20705 SDLoc dl(Op);
20706
20707 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20708 return LowerWin64_INT128_TO_FP(Op, DAG);
20709
20710 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20711 return Extract;
20712
20713 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20714 return R;
20715
20716 if (SrcVT.isVector()) {
20717 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20718 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20719 // source for strict FP.
20720 if (IsStrict)
20721 return DAG.getNode(
20722 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20723 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20724 DAG.getUNDEF(SrcVT))});
20725 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20726 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20727 DAG.getUNDEF(SrcVT)));
20728 }
20729 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20730 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20731
20732 return SDValue();
20733 }
20734
20735 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20736 "Unknown SINT_TO_FP to lower!");
20737
20738 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20739
20740 // These are really Legal; return the operand so the caller accepts it as
20741 // Legal.
20742 if (SrcVT == MVT::i32 && UseSSEReg)
20743 return Op;
20744 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20745 return Op;
20746
20747 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20748 return V;
20749 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20750 return V;
20751
20752 // SSE doesn't have an i16 conversion so we need to promote.
20753 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20754 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20755 if (IsStrict)
20756 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20757 {Chain, Ext});
20758
20759 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20760 }
20761
20762 if (VT == MVT::f128)
20763 return SDValue();
20764
20765 SDValue ValueToStore = Src;
20766 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20767 // Bitcasting to f64 here allows us to do a single 64-bit store from
20768 // an SSE register, avoiding the store forwarding penalty that would come
20769 // with two 32-bit stores.
20770 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20771
20772 unsigned Size = SrcVT.getStoreSize();
20773 Align Alignment(Size);
20774 MachineFunction &MF = DAG.getMachineFunction();
20775 auto PtrVT = getPointerTy(MF.getDataLayout());
20776 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20777 MachinePointerInfo MPI =
20778 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20779 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20780 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20781 std::pair<SDValue, SDValue> Tmp =
20782 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20783
20784 if (IsStrict)
20785 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20786
20787 return Tmp.first;
20788}
20789
20790std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20791 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20792 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20793 // Build the FILD
20794 SDVTList Tys;
20795 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20796 if (useSSE)
20797 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20798 else
20799 Tys = DAG.getVTList(DstVT, MVT::Other);
20800
20801 SDValue FILDOps[] = {Chain, Pointer};
20802 SDValue Result =
20803 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20804 Alignment, MachineMemOperand::MOLoad);
20805 Chain = Result.getValue(1);
20806
20807 if (useSSE) {
20808 MachineFunction &MF = DAG.getMachineFunction();
20809 unsigned SSFISize = DstVT.getStoreSize();
20810 int SSFI =
20811 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20812 auto PtrVT = getPointerTy(MF.getDataLayout());
20813 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20814 Tys = DAG.getVTList(MVT::Other);
20815 SDValue FSTOps[] = {Chain, Result, StackSlot};
20816 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20817 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20818 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20819
20820 Chain =
20821 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20822 Result = DAG.getLoad(
20823 DstVT, DL, Chain, StackSlot,
20824 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20825 Chain = Result.getValue(1);
20826 }
20827
20828 return { Result, Chain };
20829}
20830
20831/// Horizontal vector math instructions may be slower than normal math with
20832/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20833/// implementation, and likely shuffle complexity of the alternate sequence.
20834static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20835 const X86Subtarget &Subtarget) {
20836 bool IsOptimizingSize = DAG.shouldOptForSize();
20837 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20838 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20839}
20840
20841/// 64-bit unsigned integer to double expansion.
20842static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20843 const X86Subtarget &Subtarget) {
20844 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20845 // when converting 0 while rounding toward negative infinity. The caller will
20846 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
20847 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20848 // This algorithm is not obvious. Here is what we're trying to output:
20849 /*
20850 movq %rax, %xmm0
20851 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20852 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20853 #ifdef __SSE3__
20854 haddpd %xmm0, %xmm0
20855 #else
20856 pshufd $0x4e, %xmm0, %xmm1
20857 addpd %xmm1, %xmm0
20858 #endif
20859 */
20860
20861 SDLoc dl(Op);
20862 LLVMContext *Context = DAG.getContext();
20863
20864 // Build some magic constants.
20865 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20866 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20867 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20868 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20869
20870 SmallVector<Constant*,2> CV1;
20871 CV1.push_back(
20872 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20873 APInt(64, 0x4330000000000000ULL))));
20874 CV1.push_back(
20875 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20876 APInt(64, 0x4530000000000000ULL))));
20877 Constant *C1 = ConstantVector::get(CV1);
20878 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20879
20880 // Load the 64-bit value into an XMM register.
20881 SDValue XR1 =
20882 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20883 SDValue CLod0 = DAG.getLoad(
20884 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20885 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20886 SDValue Unpck1 =
20887 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20888
20889 SDValue CLod1 = DAG.getLoad(
20890 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20891 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20892 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20893 // TODO: Are there any fast-math-flags to propagate here?
20894 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20895 SDValue Result;
20896
20897 if (Subtarget.hasSSE3() &&
20898 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20899 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20900 } else {
20901 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20902 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20903 }
20904 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20905 DAG.getIntPtrConstant(0, dl));
20906 return Result;
20907}
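
The constant-pool trick above can be restated in scalar C++ to see why it works. The sketch below is an illustration only (it assumes a little-endian host with IEEE-754 doubles and is not code used by this lowering): splicing 0x43300000 and 0x45300000 onto the two 32-bit halves builds the exact doubles 2^52 + lo and 2^84 + hi * 2^32; subtracting the biases (the subpd) and summing the halves (the haddpd/addpd) then yields the unsigned value with a single final rounding. The 32-bit expansion that follows uses just the low-half step of the same idea.

// Scalar sketch of the expansion above (illustration only; assumes a
// little-endian host with IEEE-754 doubles).
#include <cmath>
#include <cstdint>
#include <cstring>

static double uint64ToDouble(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL); // 2^52 + lo
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // 2^84 + hi * 2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  // Both bias subtractions are exact; only the final add rounds.
  return (Hi - std::ldexp(1.0, 84)) + (Lo - std::ldexp(1.0, 52));
}
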
20908
20909/// 32-bit unsigned integer to float expansion.
20910static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20911 const X86Subtarget &Subtarget) {
20912 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20913 SDLoc dl(Op);
20914 // FP constant to bias correct the final result.
20915 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20916 MVT::f64);
20917
20918 // Load the 32-bit value into an XMM register.
20919 SDValue Load =
20920 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20921
20922 // Zero out the upper parts of the register.
20923 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20924
20925 // Or the load with the bias.
20926 SDValue Or = DAG.getNode(
20927 ISD::OR, dl, MVT::v2i64,
20928 DAG.getBitcast(MVT::v2i64, Load),
20929 DAG.getBitcast(MVT::v2i64,
20930 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20931 Or =
20932 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20933 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20934
20935 if (Op.getNode()->isStrictFPOpcode()) {
20936 // Subtract the bias.
20937 // TODO: Are there any fast-math-flags to propagate here?
20938 SDValue Chain = Op.getOperand(0);
20939 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20940 {Chain, Or, Bias});
20941
20942 if (Op.getValueType() == Sub.getValueType())
20943 return Sub;
20944
20945 // Handle final rounding.
20946 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20947 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20948
20949 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20950 }
20951
20952 // Subtract the bias.
20953 // TODO: Are there any fast-math-flags to propagate here?
20954 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20955
20956 // Handle final rounding.
20957 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20958}
20959
20960static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20961 const X86Subtarget &Subtarget,
20962 const SDLoc &DL) {
20963 if (Op.getSimpleValueType() != MVT::v2f64)
20964 return SDValue();
20965
20966 bool IsStrict = Op->isStrictFPOpcode();
20967
20968 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20969 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20970
20971 if (Subtarget.hasAVX512()) {
20972 if (!Subtarget.hasVLX()) {
20973 // Let generic type legalization widen this.
20974 if (!IsStrict)
20975 return SDValue();
20976 // Otherwise pad the integer input with 0s and widen the operation.
20977 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20978 DAG.getConstant(0, DL, MVT::v2i32));
20979 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20980 {Op.getOperand(0), N0});
20981 SDValue Chain = Res.getValue(1);
20982 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20983 DAG.getIntPtrConstant(0, DL));
20984 return DAG.getMergeValues({Res, Chain}, DL);
20985 }
20986
20987 // Legalize to v4i32 type.
20988 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20989 DAG.getUNDEF(MVT::v2i32));
20990 if (IsStrict)
20991 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20992 {Op.getOperand(0), N0});
20993 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20994 }
20995
20996 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20997 // This gives us the floating point equivalent of 2^52 + the i32 integer
20998 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20999 // point leaving just our i32 integers in double format.
21000 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21001 SDValue VBias =
21002 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
21003 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21004 DAG.getBitcast(MVT::v2i64, VBias));
21005 Or = DAG.getBitcast(MVT::v2f64, Or);
21006
21007 if (IsStrict)
21008 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21009 {Op.getOperand(0), Or, VBias});
21010 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21011}
21012
21013static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21014 const X86Subtarget &Subtarget) {
21015 SDLoc DL(Op);
21016 bool IsStrict = Op->isStrictFPOpcode();
21017 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21018 MVT VecIntVT = V.getSimpleValueType();
21019 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21020 "Unsupported custom type");
21021
21022 if (Subtarget.hasAVX512()) {
21023 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21024 assert(!Subtarget.hasVLX() && "Unexpected features");
21025 MVT VT = Op->getSimpleValueType(0);
21026
21027 // v8i32->v8f64 is legal with AVX512 so just return it.
21028 if (VT == MVT::v8f64)
21029 return Op;
21030
21031 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21032 "Unexpected VT!");
21033 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21034 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21035 // Need to concat with zero vector for strict fp to avoid spurious
21036 // exceptions.
21037 SDValue Tmp =
21038 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21039 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21040 DAG.getIntPtrConstant(0, DL));
21041 SDValue Res, Chain;
21042 if (IsStrict) {
21043 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21044 {Op->getOperand(0), V});
21045 Chain = Res.getValue(1);
21046 } else {
21047 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21048 }
21049
21050 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21051 DAG.getIntPtrConstant(0, DL));
21052
21053 if (IsStrict)
21054 return DAG.getMergeValues({Res, Chain}, DL);
21055 return Res;
21056 }
21057
21058 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21059 Op->getSimpleValueType(0) == MVT::v4f64) {
21060 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21061 Constant *Bias = ConstantFP::get(
21062 *DAG.getContext(),
21063 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21064 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21065 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21066 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21067 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21068 SDValue VBias = DAG.getMemIntrinsicNode(
21069 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21070 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21071 MachineMemOperand::MOLoad);
21072
21073 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21074 DAG.getBitcast(MVT::v4i64, VBias));
21075 Or = DAG.getBitcast(MVT::v4f64, Or);
21076
21077 if (IsStrict)
21078 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21079 {Op.getOperand(0), Or, VBias});
21080 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21081 }
21082
21083 // The algorithm is the following:
21084 // #ifdef __SSE4_1__
21085 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21086 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21087 // (uint4) 0x53000000, 0xaa);
21088 // #else
21089 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21090 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21091 // #endif
21092 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21093 // return (float4) lo + fhi;
21094
21095 bool Is128 = VecIntVT == MVT::v4i32;
21096 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21097 // If we convert to something other than the supported type, e.g., to v4f64,
21098 // abort early.
21099 if (VecFloatVT != Op->getSimpleValueType(0))
21100 return SDValue();
21101
21102 // In the #ifdef/#else code, we have in common:
21103 // - The vector of constants:
21104 // -- 0x4b000000
21105 // -- 0x53000000
21106 // - A shift:
21107 // -- v >> 16
21108
21109 // Create the splat vector for 0x4b000000.
21110 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21111 // Create the splat vector for 0x53000000.
21112 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21113
21114 // Create the right shift.
21115 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21116 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21117
21118 SDValue Low, High;
21119 if (Subtarget.hasSSE41()) {
21120 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21121 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21122 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21123 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21124 // Low will be bitcasted right away, so do not bother bitcasting back to its
21125 // original type.
21126 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21127 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21128 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21129 // (uint4) 0x53000000, 0xaa);
21130 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21131 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21132 // High will be bitcasted right away, so do not bother bitcasting back to
21133 // its original type.
21134 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21135 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21136 } else {
21137 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21138 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21139 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21140 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21141
21142 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21143 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21144 }
21145
21146 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21147 SDValue VecCstFSub = DAG.getConstantFP(
21148 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21149
21150 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21151 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21152 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
21153 // enabled. See PR24512.
21154 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21155 // TODO: Are there any fast-math-flags to propagate here?
21156 // (float4) lo;
21157 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21158 // return (float4) lo + fhi;
21159 if (IsStrict) {
21160 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21161 {Op.getOperand(0), HighBitcast, VecCstFSub});
21162 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21163 {FHigh.getValue(1), LowBitcast, FHigh});
21164 }
21165
21166 SDValue FHigh =
21167 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21168 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21169}
21170
21171static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
21172 const X86Subtarget &Subtarget) {
21173 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21174 SDValue N0 = Op.getOperand(OpNo);
21175 MVT SrcVT = N0.getSimpleValueType();
21176 SDLoc dl(Op);
21177
21178 switch (SrcVT.SimpleTy) {
21179 default:
21180 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21181 case MVT::v2i32:
21182 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
21183 case MVT::v4i32:
21184 case MVT::v8i32:
21185 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
21186 case MVT::v2i64:
21187 case MVT::v4i64:
21188 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21189 }
21190}
21191
21192SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21193 SelectionDAG &DAG) const {
21194 bool IsStrict = Op->isStrictFPOpcode();
21195 unsigned OpNo = IsStrict ? 1 : 0;
21196 SDValue Src = Op.getOperand(OpNo);
21197 SDLoc dl(Op);
21198 auto PtrVT = getPointerTy(DAG.getDataLayout());
21199 MVT SrcVT = Src.getSimpleValueType();
21200 MVT DstVT = Op->getSimpleValueType(0);
21201 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21202
21203 if (DstVT == MVT::f128)
21204 return SDValue();
21205
21206 if (DstVT.isVector())
21207 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
21208
21209 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21210 return LowerWin64_INT128_TO_FP(Op, DAG);
21211
21212 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21213 return Extract;
21214
21215 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21216 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21217 // Conversions from unsigned i32 to f32/f64 are legal,
21218 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21219 return Op;
21220 }
21221
21222 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21223 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21224 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21225 if (IsStrict)
21226 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21227 {Chain, Src});
21228 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21229 }
21230
21231 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21232 return V;
21233 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21234 return V;
21235
21236 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21237 // infinity. It produces -0.0, so disable under strictfp.
21238 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21239 !IsStrict)
21240 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
21241 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21242 // negative infinity, so disable it under strictfp and use FILD instead.
21243 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21244 !IsStrict)
21245 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
21246 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21247 (DstVT == MVT::f32 || DstVT == MVT::f64))
21248 return SDValue();
21249
21250 // Make a 64-bit buffer, and use it to build an FILD.
21251 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21252 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21253 Align SlotAlign(8);
21254 MachinePointerInfo MPI =
21255 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21256 if (SrcVT == MVT::i32) {
21257 SDValue OffsetSlot =
21258 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
21259 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21260 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21261 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21262 std::pair<SDValue, SDValue> Tmp =
21263 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21264 if (IsStrict)
21265 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21266
21267 return Tmp.first;
21268 }
21269
21270 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21271 SDValue ValueToStore = Src;
21272 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21273 // Bitcasting to f64 here allows us to do a single 64-bit store from
21274 // an SSE register, avoiding the store forwarding penalty that would come
21275 // with two 32-bit stores.
21276 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21277 }
21278 SDValue Store =
21279 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21280 // For i64 source, we need to add the appropriate power of 2 if the input
21281 // was negative. We must be careful to do the computation in x87 extended
21282 // precision, not in SSE.
21283 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21284 SDValue Ops[] = { Store, StackSlot };
21285 SDValue Fild =
21286 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21287 SlotAlign, MachineMemOperand::MOLoad);
21288 Chain = Fild.getValue(1);
21289
21290
21291 // Check whether the sign bit is set.
21292 SDValue SignSet = DAG.getSetCC(
21293 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21294 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21295
21296 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21297 APInt FF(64, 0x5F80000000000000ULL);
21298 SDValue FudgePtr = DAG.getConstantPool(
21299 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21300 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21301
21302 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21303 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21304 SDValue Four = DAG.getIntPtrConstant(4, dl);
21305 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21306 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21307
21308 // Load the value out, extending it from f32 to f80.
21309 SDValue Fudge = DAG.getExtLoad(
21310 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21311 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
21312 CPAlignment);
21313 Chain = Fudge.getValue(1);
21314 // Extend everything to 80 bits to force it to be done on x87.
21315 // TODO: Are there any fast-math-flags to propagate here?
21316 if (IsStrict) {
21317 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
21318 {Chain, Fild, Fudge});
21319 // STRICT_FP_ROUND can't handle equal types.
21320 if (DstVT == MVT::f80)
21321 return Add;
21322 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21323 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
21324 }
21325 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
21326 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21327 DAG.getIntPtrConstant(0, dl));
21328}
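// [Editor's note, illustrative only, not part of the original source] A scalar
// model of the FILD-plus-fudge path above: FILD reads the u64 bit pattern as
// signed, so when the sign bit was set the loaded value is (u - 2^64) and the
// constant-pool "fudge" (0x5F800000 is the binary32 encoding of 2^64) restores
// it. Assumes long double has the x87 64-bit significand:
//
//   #include <cstdint>
//   static long double U64ToFPModel(uint64_t u) {
//     long double r = (long double)(int64_t)u;   // what FILD produces
//     if ((int64_t)u < 0)                        // sign bit was set
//       r += 18446744073709551616.0L;            // add 2^64 (the fudge)
//     return r;                                  // e.g. ~0ULL -> 18446744073709551615.0
//   }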
21329
21330// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21331// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21332// just return an SDValue().
21333// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21334// to i16, i32 or i64, and we lower it to a legal sequence and return the
21335// result.
21336SDValue
21337X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21338 bool IsSigned, SDValue &Chain) const {
21339 bool IsStrict = Op->isStrictFPOpcode();
21340 SDLoc DL(Op);
21341
21342 EVT DstTy = Op.getValueType();
21343 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21344 EVT TheVT = Value.getValueType();
21345 auto PtrVT = getPointerTy(DAG.getDataLayout());
21346
21347 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21348 // f16 must be promoted before using the lowering in this routine.
21349 // fp128 does not use this lowering.
21350 return SDValue();
21351 }
21352
21353 // If using FIST to compute an unsigned i64, we'll need some fixup
21354 // to handle values above the maximum signed i64. A FIST is always
21355 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21356 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21357
21358 // FIXME: This does not generate an invalid exception if the input does not
21359 // fit in i32. PR44019
21360 if (!IsSigned && DstTy != MVT::i64) {
21361 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21362 // The low 32 bits of the fist result will have the correct uint32 result.
21363 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21364 DstTy = MVT::i64;
21365 }
21366
21367 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21368 DstTy.getSimpleVT() >= MVT::i16 &&
21369 "Unknown FP_TO_INT to lower!");
21370
21371 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21372 // stack slot.
21373 MachineFunction &MF = DAG.getMachineFunction();
21374 unsigned MemSize = DstTy.getStoreSize();
21375 int SSFI =
21376 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21377 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21378
21379 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21380
21381 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21382
21383 if (UnsignedFixup) {
21384 //
21385 // Conversion to unsigned i64 is implemented with a select,
21386 // depending on whether the source value fits in the range
21387 // of a signed i64. Let Thresh be the FP equivalent of
21388 // 0x8000000000000000ULL.
21389 //
21390 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21391 // FltOfs = (Value >= Thresh) ? Thresh : 0;
21392 // FistSrc = (Value - FltOfs);
21393 // Fist-to-mem64 FistSrc
21394 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21395 // to XOR'ing the high 32 bits with Adjust.
21396 //
21397 // Being a power of 2, Thresh is exactly representable in all FP formats.
21398 // For X87 we'd like to use the smallest FP type for this constant, but
21399 // for DAG type consistency we have to match the FP operand type.
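// [Editor's worked example, not part of the original source] Converting
// Value = 2^63 + 2048.0 (f64) to unsigned i64: Value >= Thresh (= 2^63), so
// FltOfs = 2^63 and FistSrc = Value - FltOfs = 2048.0. The signed FIST stores
// 2048, Adjust = (1 << 63), and 2048 ^ 0x8000000000000000 = 0x8000000000000800,
// i.e. 2^63 + 2048, the expected unsigned result.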
21400
21401 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21402 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
21403 bool LosesInfo = false;
21404 if (TheVT == MVT::f64)
21405 // The rounding mode is irrelevant as the conversion should be exact.
21406 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21407 &LosesInfo);
21408 else if (TheVT == MVT::f80)
21409 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21410 APFloat::rmNearestTiesToEven, &LosesInfo);
21411
21412 assert(Status == APFloat::opOK && !LosesInfo &&
21413 "FP conversion should have been exact");
21414
21415 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21416
21417 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21418 *DAG.getContext(), TheVT);
21419 SDValue Cmp;
21420 if (IsStrict) {
21421 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21422 /*IsSignaling*/ true);
21423 Chain = Cmp.getValue(1);
21424 } else {
21425 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21426 }
21427
21428 // Our preferred lowering of
21429 //
21430 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21431 //
21432 // is
21433 //
21434 // (Value >= Thresh) << 63
21435 //
21436 // but since we can get here after LegalOperations, DAGCombine might do the
21437 // wrong thing if we create a select. So, directly create the preferred
21438 // version.
21439 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21440 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21441 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21442
21443 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21444 DAG.getConstantFP(0.0, DL, TheVT));
21445
21446 if (IsStrict) {
21447 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21448 { Chain, Value, FltOfs });
21449 Chain = Value.getValue(1);
21450 } else
21451 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21452 }
21453
21454 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21455
21456 // FIXME This causes a redundant load/store if the SSE-class value is already
21457 // in memory, such as if it is on the callstack.
21458 if (isScalarFPTypeInSSEReg(TheVT)) {
21459 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21460 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21461 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21462 SDValue Ops[] = { Chain, StackSlot };
21463
21464 unsigned FLDSize = TheVT.getStoreSize();
21465 assert(FLDSize <= MemSize && "Stack slot not big enough");
21466 MachineMemOperand *MMO = MF.getMachineMemOperand(
21467 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21468 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21469 Chain = Value.getValue(1);
21470 }
21471
21472 // Build the FP_TO_INT*_IN_MEM
21473 MachineMemOperand *MMO = MF.getMachineMemOperand(
21474 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21475 SDValue Ops[] = { Chain, Value, StackSlot };
21476 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21477 DAG.getVTList(MVT::Other),
21478 Ops, DstTy, MMO);
21479
21480 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
21481 Chain = Res.getValue(1);
21482
21483 // If we need an unsigned fixup, XOR the result with adjust.
21484 if (UnsignedFixup)
21485 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21486
21487 return Res;
21488}
21489
21490static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
21491 const X86Subtarget &Subtarget) {
21492 MVT VT = Op.getSimpleValueType();
21493 SDValue In = Op.getOperand(0);
21494 MVT InVT = In.getSimpleValueType();
21495 SDLoc dl(Op);
21496 unsigned Opc = Op.getOpcode();
21497
21498 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21499 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
21500 "Unexpected extension opcode");
21501 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21502 "Expected same number of elements");
21503 assert((VT.getVectorElementType() == MVT::i16 ||
21504 VT.getVectorElementType() == MVT::i32 ||
21505 VT.getVectorElementType() == MVT::i64) &&
21506 "Unexpected element type");
21507 assert((InVT.getVectorElementType() == MVT::i8 ||
21508 InVT.getVectorElementType() == MVT::i16 ||
21509 InVT.getVectorElementType() == MVT::i32) &&
21510 "Unexpected element type");
21511
21512 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
21513
21514 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21515 assert(InVT == MVT::v32i8 && "Unexpected VT!");
21516 return splitVectorIntUnary(Op, DAG);
21517 }
21518
21519 if (Subtarget.hasInt256())
21520 return Op;
21521
21522 // Optimize vectors in AVX mode:
21523 //
21524 // v8i16 -> v8i32
21525 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
21526 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
21527 // Concat upper and lower parts.
21528 //
21529 // v4i32 -> v4i64
21530 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
21531 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
21532 // Concat upper and lower parts.
21533 //
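// [Editor's worked example, not part of the original source] For the
// zero-extend case with In = v8i16 [a0..a7], vpunpckhwd(In, zero) produces
// [a4,0,a5,0,a6,0,a7,0]; bitcast to v4i32 that is exactly the zero extension
// of the upper four elements, which is then concatenated with OpLo below.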
21534 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21535 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21536
21537 // Short-circuit if we can determine that each 128-bit half is the same value.
21538 // Otherwise, this is difficult to match and optimize.
21539 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21540 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21541 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21542
21543 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21544 SDValue Undef = DAG.getUNDEF(InVT);
21545 bool NeedZero = Opc == ISD::ZERO_EXTEND;
21546 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21547 OpHi = DAG.getBitcast(HalfVT, OpHi);
21548
21549 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21550}
21551
21552// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21553static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21554 const SDLoc &dl, SelectionDAG &DAG) {
21555 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21556 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21557 DAG.getIntPtrConstant(0, dl));
21558 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21559 DAG.getIntPtrConstant(8, dl));
21560 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21561 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21562 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21563 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21564}
21565
21566static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
21567 const X86Subtarget &Subtarget,
21568 SelectionDAG &DAG) {
21569 MVT VT = Op->getSimpleValueType(0);
21570 SDValue In = Op->getOperand(0);
21571 MVT InVT = In.getSimpleValueType();
21572 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21573 SDLoc DL(Op);
21574 unsigned NumElts = VT.getVectorNumElements();
21575
21576 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
21577 // avoids a constant pool load.
21578 if (VT.getVectorElementType() != MVT::i8) {
21579 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21580 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21581 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21582 }
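// [Editor's worked example, not part of the original source] For a v4i1 ->
// v4i32 zero extend, a set lane sign-extends to 0xffffffff and the logical
// shift right by 31 leaves 1, while a clear lane stays 0; this matches
// selecting from {1, 0} without materializing a constant vector.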
21583
21584 // Extend VT if BWI is not supported.
21585 MVT ExtVT = VT;
21586 if (!Subtarget.hasBWI()) {
21587 // If v16i32 is to be avoided, we'll need to split and concatenate.
21588 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21589 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21590
21591 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21592 }
21593
21594 // Widen to 512-bits if VLX is not supported.
21595 MVT WideVT = ExtVT;
21596 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21597 NumElts *= 512 / ExtVT.getSizeInBits();
21598 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21599 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
21600 In, DAG.getIntPtrConstant(0, DL));
21601 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
21602 NumElts);
21603 }
21604
21605 SDValue One = DAG.getConstant(1, DL, WideVT);
21606 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21607
21608 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21609
21610 // Truncate if we had to extend above.
21611 if (VT != ExtVT) {
21612 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21613 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21614 }
21615
21616 // Extract back to 128/256-bit if we widened.
21617 if (WideVT != VT)
21618 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21619 DAG.getIntPtrConstant(0, DL));
21620
21621 return SelectedVal;
21622}
21623
21624static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21625 SelectionDAG &DAG) {
21626 SDValue In = Op.getOperand(0);
21627 MVT SVT = In.getSimpleValueType();
21628
21629 if (SVT.getVectorElementType() == MVT::i1)
21630 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
21631
21632 assert(Subtarget.hasAVX() && "Expected AVX support");
21633 return LowerAVXExtend(Op, DAG, Subtarget);
21634}
21635
21636/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21637/// It makes use of the fact that vectors with enough leading sign/zero bits
21638/// prevent the PACKSS/PACKUS from saturating the results.
21639/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21640/// within each 128-bit lane.
21641static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21642 const SDLoc &DL, SelectionDAG &DAG,
21643 const X86Subtarget &Subtarget) {
21644 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21645 "Unexpected PACK opcode");
21646 assert(DstVT.isVector() && "VT not a vector?");
21647
21648 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21649 if (!Subtarget.hasSSE2())
21650 return SDValue();
21651
21652 EVT SrcVT = In.getValueType();
21653
21654 // No truncation required, we might get here due to recursive calls.
21655 if (SrcVT == DstVT)
21656 return In;
21657
21658 // We only support vector truncation to 64bits or greater from a
21659 // 128bits or greater source.
21660 unsigned DstSizeInBits = DstVT.getSizeInBits();
21661 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21662 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
21663 return SDValue();
21664
21665 unsigned NumElems = SrcVT.getVectorNumElements();
21666 if (!isPowerOf2_32(NumElems))
21667 return SDValue();
21668
21669 LLVMContext &Ctx = *DAG.getContext();
21670 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21671 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21672
21673 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21674
21675 // Pack to the largest type possible:
21676 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21677 EVT InVT = MVT::i16, OutVT = MVT::i8;
21678 if (SrcVT.getScalarSizeInBits() > 16 &&
21679 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21680 InVT = MVT::i32;
21681 OutVT = MVT::i16;
21682 }
21683
21684 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
21685 if (SrcVT.is128BitVector()) {
21686 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21687 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21688 In = DAG.getBitcast(InVT, In);
21689 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21690 Res = extractSubVector(Res, 0, DAG, DL, 64);
21691 return DAG.getBitcast(DstVT, Res);
21692 }
21693
21694 // Split lower/upper subvectors.
21695 SDValue Lo, Hi;
21696 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21697
21698 unsigned SubSizeInBits = SrcSizeInBits / 2;
21699 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21700 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21701
21702 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21703 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21704 Lo = DAG.getBitcast(InVT, Lo);
21705 Hi = DAG.getBitcast(InVT, Hi);
21706 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21707 return DAG.getBitcast(DstVT, Res);
21708 }
21709
21710 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21711 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21712 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21713 Lo = DAG.getBitcast(InVT, Lo);
21714 Hi = DAG.getBitcast(InVT, Hi);
21715 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21716
21717 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21718 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21719 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21720 SmallVector<int, 64> Mask;
21721 int Scale = 64 / OutVT.getScalarSizeInBits();
21722 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21723 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21724
21725 if (DstVT.is256BitVector())
21726 return DAG.getBitcast(DstVT, Res);
21727
21728 // If 512bit -> 128bit truncate another stage.
21729 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21730 Res = DAG.getBitcast(PackedVT, Res);
21731 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21732 }
21733
21734 // Recursively pack lower/upper subvectors, concat result and pack again.
21735 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21736 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21737 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21738 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21739
21740 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21741 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21742 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21743}
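// [Editor's note, illustrative only, not part of the original source] The
// callers must guarantee enough leading sign/zero bits or the PACK node
// saturates instead of truncating: with PACKSSDW, a dword 0x00012345 would
// clamp to 0x7fff, while 0x00001234 (which sign-extends from 16 bits) packs
// to its exact low half 0x1234.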
21744
21745static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21746 const X86Subtarget &Subtarget) {
21747
21748 SDLoc DL(Op);
21749 MVT VT = Op.getSimpleValueType();
21750 SDValue In = Op.getOperand(0);
21751 MVT InVT = In.getSimpleValueType();
21752
21753 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21754
21755 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21756 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21757 if (InVT.getScalarSizeInBits() <= 16) {
21758 if (Subtarget.hasBWI()) {
21759 // legal, will go to VPMOVB2M, VPMOVW2M
21760 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21761 // We need to shift to get the lsb into sign position.
21762 // Shift packed bytes not supported natively, bitcast to word
21763 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21764 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21765 DAG.getBitcast(ExtVT, In),
21766 DAG.getConstant(ShiftInx, DL, ExtVT));
21767 In = DAG.getBitcast(InVT, In);
21768 }
21769 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21770 In, ISD::SETGT);
21771 }
21772 // Use TESTD/Q, extended vector to packed dword/qword.
21773 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21774 "Unexpected vector type.");
21775 unsigned NumElts = InVT.getVectorNumElements();
21776 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21777 // We need to change to a wider element type that we have support for.
21778 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21779 // For 16 element vectors we extend to v16i32 unless we are explicitly
21780 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21781 // we need to split into two 8 element vectors which we can extend to v8i32,
21782 // truncate and concat the results. There's an additional complication if
21783 // the original type is v16i8. In that case we can't split the v16i8
21784 // directly, so we need to shuffle high elements to low and use
21785 // sign_extend_vector_inreg.
21786 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21787 SDValue Lo, Hi;
21788 if (InVT == MVT::v16i8) {
21789 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21790 Hi = DAG.getVectorShuffle(
21791 InVT, DL, In, In,
21792 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21793 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21794 } else {
21795 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21796 Lo = extract128BitVector(In, 0, DAG, DL);
21797 Hi = extract128BitVector(In, 8, DAG, DL);
21798 }
21799 // We're split now, just emit two truncates and a concat. The two
21800 // truncates will trigger legalization to come back to this function.
21801 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21802 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21803 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21804 }
21805 // We either have 8 elements or we're allowed to use 512-bit vectors.
21806 // If we have VLX, we want to use the narrowest vector that can get the
21807 // job done so we use vXi32.
21808 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21809 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21810 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21811 InVT = ExtVT;
21812 ShiftInx = InVT.getScalarSizeInBits() - 1;
21813 }
21814
21815 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21816 // We need to shift to get the lsb into sign position.
21817 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21818 DAG.getConstant(ShiftInx, DL, InVT));
21819 }
21820 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21821 if (Subtarget.hasDQI())
21822 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21823 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21824}
21825
21826SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21827 SDLoc DL(Op);
21828 MVT VT = Op.getSimpleValueType();
21829 SDValue In = Op.getOperand(0);
21830 MVT InVT = In.getSimpleValueType();
21831 unsigned InNumEltBits = InVT.getScalarSizeInBits();
21832
21833 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21834 "Invalid TRUNCATE operation");
21835
21836 // If we're called by the type legalizer, handle a few cases.
21837 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21838 if (!TLI.isTypeLegal(InVT)) {
21839 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21840 VT.is128BitVector()) {
21841 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21842 "Unexpected subtarget!");
21843 // The default behavior is to truncate one step, concatenate, and then
21844 // truncate the remainder. We'd rather produce two 64-bit results and
21845 // concatenate those.
21846 SDValue Lo, Hi;
21847 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21848
21849 EVT LoVT, HiVT;
21850 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21851
21852 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21853 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21854 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21855 }
21856
21857 // Otherwise let default legalization handle it.
21858 return SDValue();
21859 }
21860
21861 if (VT.getVectorElementType() == MVT::i1)
21862 return LowerTruncateVecI1(Op, DAG, Subtarget);
21863
21864 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21865 if (Subtarget.hasAVX512()) {
21866 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21867 assert(VT == MVT::v32i8 && "Unexpected VT!");
21868 return splitVectorIntUnary(Op, DAG);
21869 }
21870
21871 // word to byte only under BWI. Otherwise we have to promote to v16i32
21872 // and then truncate that. But we should only do that if we haven't been
21873 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21874 // handled by isel patterns.
21875 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21876 Subtarget.canExtendTo512DQ())
21877 return Op;
21878 }
21879
21880 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21881 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21882
21883 // Truncate with PACKUS if we are truncating a vector with leading zero bits
21884 // that extend all the way to the packed/truncated value.
21885 // Pre-SSE41 we can only use PACKUSWB.
21886 KnownBits Known = DAG.computeKnownBits(In);
21887 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21888 if (SDValue V =
21889 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21890 return V;
21891
21892 // Truncate with PACKSS if we are truncating a vector with sign-bits that
21893 // extend all the way to the packed/truncated value.
21894 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21895 if (SDValue V =
21896 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21897 return V;
21898
21899 // Handle truncation of V256 to V128 using shuffles.
21900 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21901
21902 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21903 In = DAG.getBitcast(MVT::v8i32, In);
21904
21905 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21906 if (Subtarget.hasInt256()) {
21907 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21908 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21909 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21910 DAG.getIntPtrConstant(0, DL));
21911 }
21912
21913 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21914 DAG.getIntPtrConstant(0, DL));
21915 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21916 DAG.getIntPtrConstant(4, DL));
21917 static const int ShufMask[] = {0, 2, 4, 6};
21918 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21919 }
21920
21921 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21922 In = DAG.getBitcast(MVT::v32i8, In);
21923
21924 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21925 if (Subtarget.hasInt256()) {
21926 // The PSHUFB mask:
21927 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21928 -1, -1, -1, -1, -1, -1, -1, -1,
21929 16, 17, 20, 21, 24, 25, 28, 29,
21930 -1, -1, -1, -1, -1, -1, -1, -1 };
21931 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21932 In = DAG.getBitcast(MVT::v4i64, In);
21933
21934 static const int ShufMask2[] = {0, 2, -1, -1};
21935 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21936 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21937 DAG.getBitcast(MVT::v16i16, In),
21938 DAG.getIntPtrConstant(0, DL));
21939 }
21940
21941 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21942 DAG.getIntPtrConstant(0, DL));
21943 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21944 DAG.getIntPtrConstant(16, DL));
21945
21946 // The PSHUFB mask:
21947 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21948 -1, -1, -1, -1, -1, -1, -1, -1};
21949
21950 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21951 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21952
21953 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21954 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21955
21956 // The MOVLHPS Mask:
21957 static const int ShufMask2[] = {0, 1, 4, 5};
21958 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21959 return DAG.getBitcast(MVT::v8i16, res);
21960 }
21961
21962 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21963 // Use an AND to zero the upper bits for PACKUS.
21964 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21965
21966 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21967 DAG.getIntPtrConstant(0, DL));
21968 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21969 DAG.getIntPtrConstant(8, DL));
21970 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21971 }
21972
21973 llvm_unreachable("All 256->128 cases should have been handled above!");
21974}
21975
21976// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21977// behaves on out of range inputs to generate optimized conversions.
21978static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21979 SelectionDAG &DAG,
21980 const X86Subtarget &Subtarget) {
21981 MVT SrcVT = Src.getSimpleValueType();
21982 unsigned DstBits = VT.getScalarSizeInBits();
21983 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21984
21985 // Calculate the converted result for values in the range 0 to
21986 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21987 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21988 SDValue Big =
21989 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21990 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21991 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21992
21993 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21994 // and only if the value was out of range. So we can use that
21995 // as our indicator that we should use "Big" instead of "Small".
21996 //
21997 // Use "Small" if "IsOverflown" has all bits cleared
21998 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
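// [Editor's worked example, not part of the original source] For the input
// 3000000000.0f: Small = cvttps2dq(3.0e9f) = 0x80000000 (out of range), and
// Big = cvttps2dq(3.0e9f - 2^31) = 852516352 (0x32d05e00). IsOverflown is
// all ones, so Small | (Big & IsOverflown) = 0xb2d05e00 = 3000000000.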
21999
22000 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22001 // use the slightly slower blendv select instead.
22002 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22003 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22004 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22005 }
22006
22007 SDValue IsOverflown =
22008 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22009 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22010 return DAG.getNode(ISD::OR, dl, VT, Small,
22011 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22012}
22013
22014SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22015 bool IsStrict = Op->isStrictFPOpcode();
22016 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22017 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22018 MVT VT = Op->getSimpleValueType(0);
22019 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22020 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22021 MVT SrcVT = Src.getSimpleValueType();
22022 SDLoc dl(Op);
22023
22024 SDValue Res;
22025 if (VT.isVector()) {
22026 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22027 MVT ResVT = MVT::v4i32;
22028 MVT TruncVT = MVT::v4i1;
22029 unsigned Opc;
22030 if (IsStrict)
22031 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22032 else
22033 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22034
22035 if (!IsSigned && !Subtarget.hasVLX()) {
22036 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22037 // Widen to 512-bits.
22038 ResVT = MVT::v8i32;
22039 TruncVT = MVT::v8i1;
22040 Opc = Op.getOpcode();
22041 // Need to concat with zero vector for strict fp to avoid spurious
22042 // exceptions.
22043 // TODO: Should we just do this for non-strict as well?
22044 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22045 : DAG.getUNDEF(MVT::v8f64);
22046 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22047 DAG.getIntPtrConstant(0, dl));
22048 }
22049 if (IsStrict) {
22050 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22051 Chain = Res.getValue(1);
22052 } else {
22053 Res = DAG.getNode(Opc, dl, ResVT, Src);
22054 }
22055
22056 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22057 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22058 DAG.getIntPtrConstant(0, dl));
22059 if (IsStrict)
22060 return DAG.getMergeValues({Res, Chain}, dl);
22061 return Res;
22062 }
22063
22064 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
22065 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
22066 return Op;
22067
22068 MVT ResVT = VT;
22069 MVT EleVT = VT.getVectorElementType();
22070 if (EleVT != MVT::i64)
22071 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
22072
22073 if (SrcVT != MVT::v8f16) {
22074 SDValue Tmp =
22075 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
22076 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
22077 Ops[0] = Src;
22078 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
22079 }
22080
22081 if (IsStrict) {
22082 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
22083 : X86ISD::STRICT_CVTTP2UI,
22084 dl, {ResVT, MVT::Other}, {Chain, Src});
22085 Chain = Res.getValue(1);
22086 } else {
22087 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
22088 ResVT, Src);
22089 }
22090
22091 // TODO: Need to add exception check code for strict FP.
22092 if (EleVT.getSizeInBits() < 16) {
22093 ResVT = MVT::getVectorVT(EleVT, 8);
22094 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
22095 }
22096
22097 if (ResVT != VT)
22098 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22099 DAG.getIntPtrConstant(0, dl));
22100
22101 if (IsStrict)
22102 return DAG.getMergeValues({Res, Chain}, dl);
22103 return Res;
22104 }
22105
22106 if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
22107 if (IsStrict) {
22108 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
22109 : ISD::STRICT_FP_TO_UINT,
22110 dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
22111 Chain = Res.getValue(1);
22112 } else {
22113 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
22114 MVT::v8i32, Src);
22115 }
22116
22117 // TODO: Need to add exception check code for strict FP.
22118 Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
22119
22120 if (IsStrict)
22121 return DAG.getMergeValues({Res, Chain}, dl);
22122 return Res;
22123 }
22124
22125 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
22126 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
22127 assert(!IsSigned && "Expected unsigned conversion!");
22128 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
22129 return Op;
22130 }
22131
22132 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
22133 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
22134 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
22135 Subtarget.useAVX512Regs()) {
22136 assert(!IsSigned && "Expected unsigned conversion!");
22137 assert(!Subtarget.hasVLX() && "Unexpected features!");
22138 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22139 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22140 // Need to concat with zero vector for strict fp to avoid spurious
22141 // exceptions.
22142 // TODO: Should we just do this for non-strict as well?
22143 SDValue Tmp =
22144 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22145 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22146 DAG.getIntPtrConstant(0, dl));
22147
22148 if (IsStrict) {
22149 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
22150 {Chain, Src});
22151 Chain = Res.getValue(1);
22152 } else {
22153 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
22154 }
22155
22156 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22157 DAG.getIntPtrConstant(0, dl));
22158
22159 if (IsStrict)
22160 return DAG.getMergeValues({Res, Chain}, dl);
22161 return Res;
22162 }
22163
22164 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
22165 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
22166 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
22167 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
22168 assert(!Subtarget.hasVLX() && "Unexpected features!");
22169 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
22170 // Need to concat with zero vector for strict fp to avoid spurious
22171 // exceptions.
22172 // TODO: Should we just do this for non-strict as well?
22173 SDValue Tmp =
22174 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22175 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22176 DAG.getIntPtrConstant(0, dl));
22177
22178 if (IsStrict) {
22179 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22180 {Chain, Src});
22181 Chain = Res.getValue(1);
22182 } else {
22183 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
22184 }
22185
22186 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22187 DAG.getIntPtrConstant(0, dl));
22188
22189 if (IsStrict)
22190 return DAG.getMergeValues({Res, Chain}, dl);
22191 return Res;
22192 }
22193
22194 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
22195 if (!Subtarget.hasVLX()) {
22196 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
22197 // legalizer and then widened again by vector op legalization.
22198 if (!IsStrict)
22199 return SDValue();
22200
22201 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
22202 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
22203 {Src, Zero, Zero, Zero});
22204 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22205 {Chain, Tmp});
22206 SDValue Chain = Tmp.getValue(1);
22207 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
22208 DAG.getIntPtrConstant(0, dl));
22209 return DAG.getMergeValues({Tmp, Chain}, dl);
22210 }
22211
22212 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
22213 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
22214 DAG.getUNDEF(MVT::v2f32));
22215 if (IsStrict) {
22216 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
22217 : X86ISD::STRICT_CVTTP2UI;
22218 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
22219 }
22220 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22221 return DAG.getNode(Opc, dl, VT, Tmp);
22222 }
22223
22224 // Generate optimized instructions for pre-AVX512 unsigned conversions from
22225 // vXf32/vXf64 to vXi32.
22226 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
22227 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
22228 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
22229 assert(!IsSigned && "Expected unsigned conversion!");
22230 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
22231 }
22232
22233 return SDValue();
22234 }
22235
22236 assert(!VT.isVector());
22237
22238 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
22239
22240 if (!IsSigned && UseSSEReg) {
22241 // Conversions from f32/f64 with AVX512 should be legal.
22242 if (Subtarget.hasAVX512())
22243 return Op;
22244
22245 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
22246 // behaves on out of range inputs to generate optimized conversions.
22247 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
22248 (VT == MVT::i64 && Subtarget.is64Bit()))) {
22249 unsigned DstBits = VT.getScalarSizeInBits();
22250 APInt UIntLimit = APInt::getSignMask(DstBits);
22251 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
22252 DAG.getConstant(UIntLimit, dl, VT));
22253 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
22254
22255 // Calculate the converted result for values in the range:
22256 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22257 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
22258 SDValue Small =
22259 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
22260 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
22261 SDValue Big = DAG.getNode(
22262 X86ISD::CVTTS2SI, dl, VT,
22263 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
22264 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
22265
22266 // The "CVTTS2SI" instruction conveniently sets the sign bit if
22267 // and only if the value was out of range. So we can use that
22268 // as our indicator that we should use "Big" instead of "Small".
22269 //
22270 // Use "Small" if "IsOverflown" has all bits cleared
22271 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
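// Worked example (i32 result): for Src = 3e9f, "Small" = cvttss2si(3e9f) =
// 0x80000000 (out of range), so "IsOverflown" is all ones; "Big" =
// cvttss2si(3e9f - 2^31) = 852516352, and 0x80000000 | 852516352 =
// 3000000000, the expected unsigned result.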
22272 SDValue IsOverflown = DAG.getNode(
22273 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
22274 return DAG.getNode(ISD::OR, dl, VT, Small,
22275 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22276 }
22277
22278 // Use default expansion for i64.
22279 if (VT == MVT::i64)
22280 return SDValue();
22281
22282 assert(VT == MVT::i32 && "Unexpected VT!");
22283
22284 // Promote i32 to i64 and use a signed operation on 64-bit targets.
22285 // FIXME: This does not generate an invalid exception if the input does not
22286 // fit in i32. PR44019
22287 if (Subtarget.is64Bit()) {
22288 if (IsStrict) {
22289 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
22290 {Chain, Src});
22291 Chain = Res.getValue(1);
22292 } else
22293 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
22294
22295 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22296 if (IsStrict)
22297 return DAG.getMergeValues({Res, Chain}, dl);
22298 return Res;
22299 }
22300
22301 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
22302 // use fisttp which will be handled later.
22303 if (!Subtarget.hasSSE3())
22304 return SDValue();
22305 }
22306
22307 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
22308 // FIXME: This does not generate an invalid exception if the input does not
22309 // fit in i16. PR44019
22310 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
22311 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
22312 if (IsStrict) {
22313 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
22314 {Chain, Src});
22315 Chain = Res.getValue(1);
22316 } else
22317 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
22318
22319 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22320 if (IsStrict)
22321 return DAG.getMergeValues({Res, Chain}, dl);
22322 return Res;
22323 }
22324
22325 // If this is a FP_TO_SINT using SSEReg we're done.
22326 if (UseSSEReg && IsSigned)
22327 return Op;
22328
22329 // fp128 needs to use a libcall.
22330 if (SrcVT == MVT::f128) {
22331 RTLIB::Libcall LC;
22332 if (IsSigned)
22333 LC = RTLIB::getFPTOSINT(SrcVT, VT);
22334 else
22335 LC = RTLIB::getFPTOUINT(SrcVT, VT);
22336
22337 MakeLibCallOptions CallOptions;
22338 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
22339 SDLoc(Op), Chain);
22340
22341 if (IsStrict)
22342 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
22343
22344 return Tmp.first;
22345 }
22346
22347 // Fall back to X87.
22348 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
22349 if (IsStrict)
22350 return DAG.getMergeValues({V, Chain}, dl);
22351 return V;
22352 }
22353
22354 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
22355}
22356
22357SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
22358 SelectionDAG &DAG) const {
22359 SDValue Src = Op.getOperand(0);
22360 MVT SrcVT = Src.getSimpleValueType();
22361
22362 // If the source is in an SSE register, the node is Legal.
22363 if (isScalarFPTypeInSSEReg(SrcVT))
22364 return Op;
22365
22366 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22367}
22368
22369SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22370 SelectionDAG &DAG) const {
22371 EVT DstVT = N->getValueType(0);
22372 SDValue Src = N->getOperand(0);
22373 EVT SrcVT = Src.getValueType();
22374
22375 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22376 // f16 must be promoted before using the lowering in this routine.
22377 // fp128 does not use this lowering.
22378 return SDValue();
22379 }
22380
22381 SDLoc DL(N);
22382 SDValue Chain = DAG.getEntryNode();
22383
22384 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22385
22386 // If we're converting from SSE, the stack slot needs to hold both types.
22387 // Otherwise it only needs to hold the DstVT.
22388 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22389 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22390 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22391 MachinePointerInfo MPI =
22392 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22393
22394 if (UseSSE) {
22395 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22396 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22397 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22398 SDValue Ops[] = { Chain, StackPtr };
22399
22400 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22401 /*Align*/ None, MachineMemOperand::MOLoad);
22402 Chain = Src.getValue(1);
22403 }
22404
22405 SDValue StoreOps[] = { Chain, Src, StackPtr };
22406 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22407 StoreOps, DstVT, MPI, /*Align*/ None,
22408 MachineMemOperand::MOStore);
22409
22410 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22411}
22412
22413SDValue
22414X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22415 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22416 // but making use of X86 specifics to produce better instruction sequences.
22417 SDNode *Node = Op.getNode();
22418 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22419 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22420 SDLoc dl(SDValue(Node, 0));
22421 SDValue Src = Node->getOperand(0);
22422
22423 // There are three types involved here: SrcVT is the source floating point
22424 // type, DstVT is the type of the result, and TmpVT is the result of the
22425 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22426 // DstVT).
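// For example, a saturating f32->i8 conversion reaches here with
// SrcVT = f32 and DstVT = i8; TmpVT starts equal to DstVT and is promoted
// to i32 below.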
22427 EVT SrcVT = Src.getValueType();
22428 EVT DstVT = Node->getValueType(0);
22429 EVT TmpVT = DstVT;
22430
22431 // This code is only for floats and doubles. Fall back to generic code for
22432 // anything else.
22433 if (!isScalarFPTypeInSSEReg(SrcVT))
22434 return SDValue();
22435
22436 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22437 unsigned SatWidth = SatVT.getScalarSizeInBits();
22438 unsigned DstWidth = DstVT.getScalarSizeInBits();
22439 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22440 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22441 "Expected saturation width smaller than result width");
22442
22443 // Promote result of FP_TO_*INT to at least 32 bits.
22444 if (TmpWidth < 32) {
22445 TmpVT = MVT::i32;
22446 TmpWidth = 32;
22447 }
22448
22449 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22450 // us to use a native signed conversion instead.
22451 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22452 TmpVT = MVT::i64;
22453 TmpWidth = 64;
22454 }
22455
22456 // If the saturation width is smaller than the size of the temporary result,
22457 // we can always use signed conversion, which is native.
22458 if (SatWidth < TmpWidth)
22459 FpToIntOpcode = ISD::FP_TO_SINT;
22460
22461 // Determine minimum and maximum integer values and their corresponding
22462 // floating-point values.
22463 APInt MinInt, MaxInt;
22464 if (IsSigned) {
22465 MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
22466 MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
22467 } else {
22468 MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
22469 MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
22470 }
22471
22472 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22473 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22474
22475 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22476 MinInt, IsSigned, APFloat::rmTowardZero);
22477 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22478 MaxInt, IsSigned, APFloat::rmTowardZero);
22479 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22480 && !(MaxStatus & APFloat::opStatus::opInexact);
22481
22482 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22483 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22484
22485 // If the integer bounds are exactly representable as floats, emit a
22486 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22487 if (AreExactFloatBounds) {
22488 if (DstVT != TmpVT) {
22489 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22490 SDValue MinClamped = DAG.getNode(
22491 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22492 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22493 SDValue BothClamped = DAG.getNode(
22494 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22495 // Convert clamped value to integer.
22496 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22497
22498 // NaN will become INDVAL, with the top bit set and the rest zero.
22499 // Truncation will discard the top bit, resulting in zero.
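// ("INDVAL" is the x86 "integer indefinite" value the cvtt* instructions
// return for NaN or out-of-range inputs: only the most significant bit of
// the destination integer is set.)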
22500 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22501 }
22502
22503 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22504 SDValue MinClamped = DAG.getNode(
22505 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22506 // Clamp by MaxFloat from above. NaN cannot occur.
22507 SDValue BothClamped = DAG.getNode(
22508 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22509 // Convert clamped value to integer.
22510 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22511
22512 if (!IsSigned) {
22513 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22514 // which is zero.
22515 return FpToInt;
22516 }
22517
22518 // Otherwise, select zero if Src is NaN.
22519 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22520 return DAG.getSelectCC(
22521 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22522 }
22523
22524 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22525 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22526
22527 // Result of direct conversion, which may be selected away.
22528 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22529
22530 if (DstVT != TmpVT) {
22531 // NaN will become INDVAL, with the top bit set and the rest zero.
22532 // Truncation will discard the top bit, resulting in zero.
22533 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22534 }
22535
22536 SDValue Select = FpToInt;
22537 // For signed conversions where we saturate to the same size as the
22538 // result type of the fptoi instructions, INDVAL coincides with integer
22539 // minimum, so we don't need to explicitly check it.
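// E.g. for a signed f32->i32 saturation with TmpVT == i32, a too-negative
// input already produced 0x80000000 == INT32_MIN above, which is the
// correct saturated value, so the SETULT clamp can be skipped; NaN is
// still mapped to zero by the final SETUO select below.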
22540 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22541 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22542 // MinInt if Src is NaN.
22543 Select = DAG.getSelectCC(
22544 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22545 }
22546
22547 // If Src OGT MaxFloat, select MaxInt.
22548 Select = DAG.getSelectCC(
22549 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22550
22551 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22552 // is already zero. The promoted case was already handled above.
22553 if (!IsSigned || DstVT != TmpVT) {
22554 return Select;
22555 }
22556
22557 // Otherwise, select 0 if Src is NaN.
22558 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22559 return DAG.getSelectCC(
22560 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22561}
22562
22563SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22564 bool IsStrict = Op->isStrictFPOpcode();
22565
22566 SDLoc DL(Op);
22567 MVT VT = Op.getSimpleValueType();
22568 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22569 MVT SVT = In.getSimpleValueType();
22570
22571 if (VT == MVT::f128)
22572 return SDValue();
22573
22574 if (VT == MVT::f80) {
22575 if (SVT == MVT::f16) {
22576 assert(Subtarget.hasFP16() && "Unexpected features!");
22577 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
22578 MakeLibCallOptions CallOptions;
22579 std::pair<SDValue, SDValue> Tmp =
22580 makeLibCall(DAG, LC, VT, In, CallOptions, DL,
22581 IsStrict ? Op.getOperand(0) : SDValue());
22582 if (IsStrict)
22583 return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
22584 else
22585 return Tmp.first;
22586 }
22587 return Op;
22588 }
22589
22590 if (SVT.getVectorElementType() == MVT::f16) {
22591 assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
22592 if (SVT == MVT::v2f16)
22593 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22594 DAG.getUNDEF(MVT::v2f16));
22595 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22596 DAG.getUNDEF(MVT::v4f16));
22597 if (IsStrict)
22598 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22599 {Op->getOperand(0), Res});
22600 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22601 }
22602
22603 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22604
22605 SDValue Res =
22606 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22607 if (IsStrict)
22608 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22609 {Op->getOperand(0), Res});
22610 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22611}
22612
22613SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22614 bool IsStrict = Op->isStrictFPOpcode();
22615 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22616 MVT VT = Op.getSimpleValueType();
22617 MVT SVT = In.getSimpleValueType();
22618
22619 // It's legal except when f128 is involved or we're converting f80->f16.
22620 if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
22621 return Op;
22622
22623 return SDValue();
22624}
22625
22626static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22627 bool IsStrict = Op->isStrictFPOpcode();
22628 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22629 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22630 "Unexpected VT!");
22631
22632 SDLoc dl(Op);
22633 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22634 DAG.getConstant(0, dl, MVT::v8i16), Src,
22635 DAG.getIntPtrConstant(0, dl));
22636
22637 SDValue Chain;
22638 if (IsStrict) {
22639 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22640 {Op.getOperand(0), Res});
22641 Chain = Res.getValue(1);
22642 } else {
22643 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22644 }
22645
22646 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22647 DAG.getIntPtrConstant(0, dl));
22648
22649 if (IsStrict)
22650 return DAG.getMergeValues({Res, Chain}, dl);
22651
22652 return Res;
22653}
22654
22655static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22656 bool IsStrict = Op->isStrictFPOpcode();
22657 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22658 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22659 "Unexpected VT!");
22660
22661 SDLoc dl(Op);
22662 SDValue Res, Chain;
22663 if (IsStrict) {
22664 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22665 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22666 DAG.getIntPtrConstant(0, dl));
22667 Res = DAG.getNode(
22668 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22669 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22670 Chain = Res.getValue(1);
22671 } else {
22672 // FIXME: Should we use zeros for upper elements for non-strict?
22673 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22674 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22675 DAG.getTargetConstant(4, dl, MVT::i32));
22676 }
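// Note: the immediate 4 passed to (STRICT_)CVTPS2PH above sets imm8 bit 2,
// which selects the current MXCSR rounding mode for the conversion rather
// than an explicit rounding control in the immediate.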
22677
22678 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22679 DAG.getIntPtrConstant(0, dl));
22680
22681 if (IsStrict)
22682 return DAG.getMergeValues({Res, Chain}, dl);
22683
22684 return Res;
22685}
22686
22687/// Depending on uarch and/or optimizing for size, we might prefer to use a
22688/// vector operation in place of the typical scalar operation.
22689static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
22690 const X86Subtarget &Subtarget) {
22691 // If both operands have other uses, this is probably not profitable.
22692 SDValue LHS = Op.getOperand(0);
22693 SDValue RHS = Op.getOperand(1);
22694 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22695 return Op;
22696
22697 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22698 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22699 if (IsFP && !Subtarget.hasSSE3())
22700 return Op;
22701 if (!IsFP && !Subtarget.hasSSSE3())
22702 return Op;
22703
22704 // Extract from a common vector.
22705 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22706 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22707 LHS.getOperand(0) != RHS.getOperand(0) ||
22708 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22709 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22710 !shouldUseHorizontalOp(true, DAG, Subtarget))
22711 return Op;
22712
22713 // Allow commuted 'hadd' ops.
22714 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22715 unsigned HOpcode;
22716 switch (Op.getOpcode()) {
22717 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22718 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22719 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22720 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22721 default:
22722 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22723 }
22724 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22725 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22726 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22727 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22728 std::swap(LExtIndex, RExtIndex);
22729
22730 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22731 return Op;
22732
22733 SDValue X = LHS.getOperand(0);
22734 EVT VecVT = X.getValueType();
22735 unsigned BitWidth = VecVT.getSizeInBits();
22736 unsigned NumLanes = BitWidth / 128;
22737 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22738 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22739 "Not expecting illegal vector widths here");
22740
22741 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22742 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22743 SDLoc DL(Op);
22744 if (BitWidth == 256 || BitWidth == 512) {
22745 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22746 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22747 LExtIndex %= NumEltsPerLane;
22748 }
22749
22750 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22751 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22752 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22753 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22754 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22755 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22756 DAG.getIntPtrConstant(LExtIndex / 2, DL));
22757}
22758
22759/// Depending on uarch and/or optimizing for size, we might prefer to use a
22760/// vector operation in place of the typical scalar operation.
22761SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22762 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22763 "Only expecting float/double");
22764 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
22765}
22766
22767/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22768/// This mode isn't supported in hardware on X86. But as long as we aren't
22769/// compiling with trapping math, we can emulate this with
22770/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22771static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22772 SDValue N0 = Op.getOperand(0);
22773 SDLoc dl(Op);
22774 MVT VT = Op.getSimpleValueType();
22775
22776 // N0 += copysign(nextafter(0.5, 0.0), N0)
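// Using nextafter(0.5, 0.0) instead of 0.5 avoids rounding inputs just
// below 0.5 up to 1: for the largest float below 0.5, adding exactly 0.5
// would round the FP sum up to 1.0 (ties to even), while adding the
// slightly smaller constant keeps the sum below 1.0, so FTRUNC yields 0.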
22777 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22778 bool Ignored;
22779 APFloat Point5Pred = APFloat(0.5f);
22780 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22781 Point5Pred.next(/*nextDown*/true);
22782
22783 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22784 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22785 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22786
22787 // Truncate the result to remove fraction.
22788 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22789}
22790
22791/// The only differences between FABS and FNEG are the mask and the logic op.
22792/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22793static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22794 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22795 "Wrong opcode for lowering FABS or FNEG.");
22796
22797 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22798
22799 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22800 // into an FNABS. We'll lower the FABS after that if it is still in use.
22801 if (IsFABS)
22802 for (SDNode *User : Op->uses())
22803 if (User->getOpcode() == ISD::FNEG)
22804 return Op;
22805
22806 SDLoc dl(Op);
22807 MVT VT = Op.getSimpleValueType();
22808
22809 bool IsF128 = (VT == MVT::f128);
22810 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22811 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22812 "Unexpected type in LowerFABSorFNEG");
22813
22814 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22815 // decide if we should generate a 16-byte constant mask when we only need 4 or
22816 // 8 bytes for the scalar case.
22817
22818 // There are no scalar bitwise logical SSE/AVX instructions, so we
22819 // generate a 16-byte vector constant and logic op even for the scalar case.
22820 // Using a 16-byte mask allows folding the load of the mask with
22821 // the logic op, so it can save (~4 bytes) on code size.
22822 bool IsFakeVector = !VT.isVector() && !IsF128;
22823 MVT LogicVT = VT;
22824 if (IsFakeVector)
22825 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22826 : (VT == MVT::f32) ? MVT::v4f32
22827 : MVT::v8f16;
22828
22829 unsigned EltBits = VT.getScalarSizeInBits();
22830 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22831 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22832 APInt::getSignMask(EltBits);
22833 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22834 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22835
22836 SDValue Op0 = Op.getOperand(0);
22837 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22838 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22839 IsFNABS ? X86ISD::FOR :
22840 X86ISD::FXOR;
22841 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22842
22843 if (VT.isVector() || IsF128)
22844 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22845
22846 // For the scalar case extend to a 128-bit vector, perform the logic op,
22847 // and extract the scalar result back out.
22848 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22849 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22850 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22851 DAG.getIntPtrConstant(0, dl));
22852}
22853
22854static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22855 SDValue Mag = Op.getOperand(0);
22856 SDValue Sign = Op.getOperand(1);
22857 SDLoc dl(Op);
22858
22859 // If the sign operand is smaller, extend it first.
22860 MVT VT = Op.getSimpleValueType();
22861 if (Sign.getSimpleValueType().bitsLT(VT))
22862 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22863
22864 // And if it is bigger, shrink it first.
22865 if (Sign.getSimpleValueType().bitsGT(VT))
22866 Sign =
22867 DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22868
22869 // At this point the operands and the result should have the same
22870 // type, and that won't be f80 since that is not custom lowered.
22871 bool IsF128 = (VT == MVT::f128);
22872 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22873 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22874 "Unexpected type in LowerFCOPYSIGN");
22875
22876 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22877
22878 // Perform all scalar logic operations as 16-byte vectors because there are no
22879 // scalar FP logic instructions in SSE.
22880 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22881 // unnecessary splats, but we might miss load folding opportunities. Should
22882 // this decision be based on OptimizeForSize?
22883 bool IsFakeVector = !VT.isVector() && !IsF128;
22884 MVT LogicVT = VT;
22885 if (IsFakeVector)
22886 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22887 : (VT == MVT::f32) ? MVT::v4f32
22888 : MVT::v8f16;
22889
22890 // The mask constants are automatically splatted for vector types.
22891 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22892 SDValue SignMask = DAG.getConstantFP(
22893 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22894 SDValue MagMask = DAG.getConstantFP(
22895 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22896
22897 // First, clear all bits but the sign bit from the second operand (sign).
22898 if (IsFakeVector)
22899 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22900 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22901
22902 // Next, clear the sign bit from the first operand (magnitude).
22903 // TODO: If we had general constant folding for FP logic ops, this check
22904 // wouldn't be necessary.
22905 SDValue MagBits;
22906 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22907 APFloat APF = Op0CN->getValueAPF();
22908 APF.clearSign();
22909 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22910 } else {
22911 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22912 if (IsFakeVector)
22913 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22914 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22915 }
22916
22917 // OR the magnitude value with the sign bit.
22918 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22919 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22920 DAG.getIntPtrConstant(0, dl));
22921}
22922
22923static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22924 SDValue N0 = Op.getOperand(0);
22925 SDLoc dl(Op);
22926 MVT VT = Op.getSimpleValueType();
22927
22928 MVT OpVT = N0.getSimpleValueType();
22929 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22930 "Unexpected type for FGETSIGN");
22931
22932 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22933 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22934 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22935 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22936 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22937 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22938 return Res;
22939}
22940
22941/// Helper for attempting to create a X86ISD::BT node.
22942static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22943 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22944 // instruction. Since the shift amount is in-range-or-undefined, we know
22945 // that doing a bittest on the i32 value is ok. We extend to i32 because
22946 // the encoding for the i16 version is larger than the i32 version.
22947 // Also promote i16 to i32 for performance / code size reason.
22948 if (Src.getValueType().getScalarSizeInBits() < 32)
22949 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22950
22951 // No legal type found, give up.
22952 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22953 return SDValue();
22954
22955 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22956 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22957 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22958 // known to be zero.
22959 if (Src.getValueType() == MVT::i64 &&
22960 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22961 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22962
22963 // If the operand types disagree, extend the shift amount to match. Since
22964 // BT ignores high bits (like shifts) we can use anyextend.
22965 if (Src.getValueType() != BitNo.getValueType())
22966 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22967
22968 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22969}
22970
22971/// Helper for creating a X86ISD::SETCC node.
22972static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22973 SelectionDAG &DAG) {
22974 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22975 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22976}
22977
22978/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22979/// style scalarized (associative) reduction patterns. Partial reductions
22980/// are supported when the pointer SrcMask is non-null.
22981/// TODO - move this to SelectionDAG?
22982static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22983 SmallVectorImpl<SDValue> &SrcOps,
22984 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22985 SmallVector<SDValue, 8> Opnds;
22986 DenseMap<SDValue, APInt> SrcOpMap;
22987 EVT VT = MVT::Other;
22988
22989 // Recognize a special case where a vector is cast into a wide integer to
22990 // test for all 0s.
22991 assert(Op.getOpcode() == unsigned(BinOp) &&
22992 "Unexpected bit reduction opcode");
22993 Opnds.push_back(Op.getOperand(0));
22994 Opnds.push_back(Op.getOperand(1));
22995
22996 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22997 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22998 // BFS traverse all BinOp operands.
22999 if (I->getOpcode() == unsigned(BinOp)) {
23000 Opnds.push_back(I->getOperand(0));
23001 Opnds.push_back(I->getOperand(1));
23002 // Re-evaluate the number of nodes to be traversed.
23003 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23004 continue;
23005 }
23006
23007 // Quit if not an EXTRACT_VECTOR_ELT.
23008 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23009 return false;
23010
23011 // Quit if the index is not a constant.
23012 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23013 if (!Idx)
23014 return false;
23015
23016 SDValue Src = I->getOperand(0);
23017 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23018 if (M == SrcOpMap.end()) {
23019 VT = Src.getValueType();
23020 // Quit if not the same type.
23021 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23022 return false;
23023 unsigned NumElts = VT.getVectorNumElements();
23024 APInt EltCount = APInt::getZero(NumElts);
23025 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23026 SrcOps.push_back(Src);
23027 }
23028
23029 // Quit if element already used.
23030 unsigned CIdx = Idx->getZExtValue();
23031 if (M->second[CIdx])
23032 return false;
23033 M->second.setBit(CIdx);
23034 }
23035
23036 if (SrcMask) {
23037 // Collect the source partial masks.
23038 for (SDValue &SrcOp : SrcOps)
23039 SrcMask->push_back(SrcOpMap[SrcOp]);
23040 } else {
23041 // Quit if not all elements are used.
23042 for (const auto &I : SrcOpMap)
23043 if (!I.second.isAllOnes())
23044 return false;
23045 }
23046
23047 return true;
23048}
23049
23050// Helper function for comparing all bits of a vector against zero.
23051static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
23052 const APInt &Mask,
23053 const X86Subtarget &Subtarget,
23054 SelectionDAG &DAG, X86::CondCode &X86CC) {
23055 EVT VT = V.getValueType();
23056 unsigned ScalarSize = VT.getScalarSizeInBits();
23057 if (Mask.getBitWidth() != ScalarSize) {
23058 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23059 return SDValue();
23060 }
23061
23062 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23063 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23064
23065 auto MaskBits = [&](SDValue Src) {
23066 if (Mask.isAllOnes())
23067 return Src;
23068 EVT SrcVT = Src.getValueType();
23069 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23070 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23071 };
23072
23073 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23074 if (VT.getSizeInBits() < 128) {
23075 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23076 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
23077 return SDValue();
23078 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23079 DAG.getBitcast(IntVT, MaskBits(V)),
23080 DAG.getConstant(0, DL, IntVT));
23081 }
23082
23083 // Quit if not splittable to 128/256-bit vector.
23084 if (!isPowerOf2_32(VT.getSizeInBits()))
23085 return SDValue();
23086
23087 // Split down to 128/256-bit vector.
23088 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
23089 while (VT.getSizeInBits() > TestSize) {
23090 auto Split = DAG.SplitVector(V, DL);
23091 VT = Split.first.getValueType();
23092 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23093 }
23094
23095 bool UsePTEST = Subtarget.hasSSE41();
23096 if (UsePTEST) {
23097 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
23098 V = DAG.getBitcast(TestVT, MaskBits(V));
23099 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23100 }
23101
23102 // Without PTEST, a masked v2i64 or-reduction is not faster than
23103 // scalarization.
23104 if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
23105 return SDValue();
23106
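// MOVMSK fallback: compare each byte against zero with PCMPEQ (0xFF per
// zero byte), gather the 16 byte results with MOVMSK, and compare against
// 0xFFFF, i.e. every byte was zero.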
23107 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
23108 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
23109 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
23110 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23111 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23112 DAG.getConstant(0xFFFF, DL, MVT::i32));
23113}
23114
23115// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
23116// CMP(MOVMSK(PCMPEQB(X,0))).
23117static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
23118 const SDLoc &DL,
23119 const X86Subtarget &Subtarget,
23120 SelectionDAG &DAG, SDValue &X86CC) {
23121 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23122
23123 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23124 return SDValue();
23125
23126 // Check whether we're masking/truncating an OR-reduction result, in which
23127 // case track the masked bits.
23128 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23129 switch (Op.getOpcode()) {
23130 case ISD::TRUNCATE: {
23131 SDValue Src = Op.getOperand(0);
23132 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23133 Op.getScalarValueSizeInBits());
23134 Op = Src;
23135 break;
23136 }
23137 case ISD::AND: {
23138 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23139 Mask = Cst->getAPIntValue();
23140 Op = Op.getOperand(0);
23141 }
23142 break;
23143 }
23144 }
23145
23146 SmallVector<SDValue, 8> VecIns;
23147 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
23148 EVT VT = VecIns[0].getValueType();
23149 assert(llvm::all_of(VecIns,
23150 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23151 "Reduction source vector mismatch");
23152
23153 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
23154 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
23155 return SDValue();
23156
23157 // If more than one full vector is evaluated, OR them first before PTEST.
23158 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23159 Slot += 2, e += 1) {
23160 // Each iteration will OR 2 nodes and append the result until there is
23161 // only 1 node left, i.e. the final OR'd value of all vectors.
23162 SDValue LHS = VecIns[Slot];
23163 SDValue RHS = VecIns[Slot + 1];
23164 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
23165 }
23166
23167 X86::CondCode CCode;
23168 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
23169 DAG, CCode)) {
23170 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23171 return V;
23172 }
23173 }
23174
23175 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23176 ISD::NodeType BinOp;
23177 if (SDValue Match =
23178 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
23179 X86::CondCode CCode;
23180 if (SDValue V =
23181 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
23182 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23183 return V;
23184 }
23185 }
23186 }
23187
23188 return SDValue();
23189}
23190
23191 /// Return true if \c Op has a use that doesn't just read flags.
23192static bool hasNonFlagsUse(SDValue Op) {
23193 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
23194 ++UI) {
23195 SDNode *User = *UI;
23196 unsigned UOpNo = UI.getOperandNo();
23197 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23198 // Look past the truncate.
23199 UOpNo = User->use_begin().getOperandNo();
23200 User = *User->use_begin();
23201 }
23202
23203 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23204 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23205 return true;
23206 }
23207 return false;
23208}
23209
23210// Transform to an x86-specific ALU node with flags if there is a chance of
23211// using an RMW op or only the flags are used. Otherwise, leave
23212// the node alone and emit a 'cmp' or 'test' instruction.
23213static bool isProfitableToUseFlagOp(SDValue Op) {
23214 for (SDNode *U : Op->uses())
23215 if (U->getOpcode() != ISD::CopyToReg &&
23216 U->getOpcode() != ISD::SETCC &&
23217 U->getOpcode() != ISD::STORE)
23218 return false;
23219
23220 return true;
23221}
23222
23223/// Emit nodes that will be selected as "test Op0,Op0", or something
23224/// equivalent.
23225static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23226 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23227 // CF and OF aren't always set the way we want. Determine which
23228 // of these we need.
23229 bool NeedCF = false;
23230 bool NeedOF = false;
23231 switch (X86CC) {
23232 default: break;
23233 case X86::COND_A: case X86::COND_AE:
23234 case X86::COND_B: case X86::COND_BE:
23235 NeedCF = true;
23236 break;
23237 case X86::COND_G: case X86::COND_GE:
23238 case X86::COND_L: case X86::COND_LE:
23239 case X86::COND_O: case X86::COND_NO: {
23240 // Check if we really need to set the
23241 // Overflow flag. If NoSignedWrap is present
23242 // that is not actually needed.
23243 switch (Op->getOpcode()) {
23244 case ISD::ADD:
23245 case ISD::SUB:
23246 case ISD::MUL:
23247 case ISD::SHL:
23248 if (Op.getNode()->getFlags().hasNoSignedWrap())
23249 break;
23250 LLVM_FALLTHROUGH;
23251 default:
23252 NeedOF = true;
23253 break;
23254 }
23255 break;
23256 }
23257 }
23258 // See if we can use the EFLAGS value from the operand instead of
23259 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23260 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23261 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23262 // Emit a CMP with 0, which is the TEST pattern.
23263 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23264 DAG.getConstant(0, dl, Op.getValueType()));
23265 }
23266 unsigned Opcode = 0;
23267 unsigned NumOperands = 0;
23268
23269 SDValue ArithOp = Op;
23270
23271 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23272   // which may be the result of a CAST. We use the variable 'Op', the
23273   // non-casted value, when we check for possible users.
23274 switch (ArithOp.getOpcode()) {
23275 case ISD::AND:
23276 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23277 // because a TEST instruction will be better.
23278 if (!hasNonFlagsUse(Op))
23279 break;
23280
23281     LLVM_FALLTHROUGH;
23282 case ISD::ADD:
23283 case ISD::SUB:
23284 case ISD::OR:
23285 case ISD::XOR:
23286 if (!isProfitableToUseFlagOp(Op))
23287 break;
23288
23289 // Otherwise use a regular EFLAGS-setting instruction.
23290 switch (ArithOp.getOpcode()) {
23291     default: llvm_unreachable("unexpected operator!");
23292 case ISD::ADD: Opcode = X86ISD::ADD; break;
23293 case ISD::SUB: Opcode = X86ISD::SUB; break;
23294 case ISD::XOR: Opcode = X86ISD::XOR; break;
23295 case ISD::AND: Opcode = X86ISD::AND; break;
23296 case ISD::OR: Opcode = X86ISD::OR; break;
23297 }
23298
23299 NumOperands = 2;
23300 break;
23301 case X86ISD::ADD:
23302 case X86ISD::SUB:
23303 case X86ISD::OR:
23304 case X86ISD::XOR:
23305 case X86ISD::AND:
23306 return SDValue(Op.getNode(), 1);
23307 case ISD::SSUBO:
23308 case ISD::USUBO: {
23309     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23310 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23311 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23312 Op->getOperand(1)).getValue(1);
23313 }
23314 default:
23315 break;
23316 }
23317
23318 if (Opcode == 0) {
23319 // Emit a CMP with 0, which is the TEST pattern.
23320 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23321 DAG.getConstant(0, dl, Op.getValueType()));
23322 }
23323 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23324 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23325
23326 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23327 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23328 return SDValue(New.getNode(), 1);
23329}
23330
23331/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23332/// equivalent.
23333static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23334 const SDLoc &dl, SelectionDAG &DAG,
23335 const X86Subtarget &Subtarget) {
23336 if (isNullConstant(Op1))
23337 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23338
23339 EVT CmpVT = Op0.getValueType();
23340
23341   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23342           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23343
23344   // Only promote the compare up to i32 if it is a 16-bit operation
23345   // with an immediate; 16-bit immediates are to be avoided.
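        // Illustration: 'cmp ax, 1000' needs a 16-bit immediate plus the 0x66
        // operand-size prefix, a length-changing-prefix pattern that can stall
        // the decoders on many Intel cores; after the promotion below it
        // becomes 'cmp eax, 1000' and avoids that.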
23346 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
23347 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23348 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
23349 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
23350 // Don't do this if the immediate can fit in 8-bits.
23351 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23352 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23353 unsigned ExtendOp =
23354 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23355 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23356 // For equality comparisons try to use SIGN_EXTEND if the input was
23357           // truncated from something with enough sign bits.
23358 if (Op0.getOpcode() == ISD::TRUNCATE) {
23359 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23360 ExtendOp = ISD::SIGN_EXTEND;
23361 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23362 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23363 ExtendOp = ISD::SIGN_EXTEND;
23364 }
23365 }
23366
23367 CmpVT = MVT::i32;
23368 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23369 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23370 }
23371 }
23372
23373 // Try to shrink i64 compares if the input has enough zero bits.
23374 // FIXME: Do this for non-constant compares for constant on LHS?
23375 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
23376 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23377 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
23378 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23379 CmpVT = MVT::i32;
23380 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23381 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23382 }
23383
23384 // 0-x == y --> x+y == 0
23385 // 0-x != y --> x+y != 0
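        // e.g. '(0 - x) == y' would otherwise need a NEG followed by a CMP;
        // folding it into one flag-producing ADD lets the ZF of 'x + y' answer
        // the equality directly.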
23386 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23387 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23388 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23389 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23390 return Add.getValue(1);
23391 }
23392
23393 // x == 0-y --> x+y == 0
23394 // x != 0-y --> x+y != 0
23395 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23396 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23397 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23398 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23399 return Add.getValue(1);
23400 }
23401
23402 // Use SUB instead of CMP to enable CSE between SUB and CMP.
23403 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23404 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23405 return Sub.getValue(1);
23406}
23407
23408/// Check if replacement of SQRT with RSQRT should be disabled.
23409bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23410 EVT VT = Op.getValueType();
23411
23412 // We don't need to replace SQRT with RSQRT for half type.
23413 if (VT.getScalarType() == MVT::f16)
23414 return true;
23415
23416 // We never want to use both SQRT and RSQRT instructions for the same input.
23417 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23418 return false;
23419
23420 if (VT.isVector())
23421 return Subtarget.hasFastVectorFSQRT();
23422 return Subtarget.hasFastScalarFSQRT();
23423}
23424
23425/// The minimum architected relative accuracy is 2^-12. We need one
23426/// Newton-Raphson step to have a good float result (24 bits of precision).
23427SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23428 SelectionDAG &DAG, int Enabled,
23429 int &RefinementSteps,
23430 bool &UseOneConstNR,
23431 bool Reciprocal) const {
23432 SDLoc DL(Op);
23433 EVT VT = Op.getValueType();
23434
23435 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23436 // It is likely not profitable to do this for f64 because a double-precision
23437 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23438 // instructions: convert to single, rsqrtss, convert back to double, refine
23439 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23440 // along with FMA, this could be a throughput win.
23441 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23442 // after legalize types.
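        // The refinement applied later (outside this function) is the standard
        // Newton-Raphson step for 1/sqrt(x): Est' = Est * (1.5 - 0.5 * x * Est * Est),
        // which roughly doubles the ~12 accurate bits of the hardware estimate.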
23443 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23444 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23445 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23446 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23447 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23448 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23449 RefinementSteps = 1;
23450
23451 UseOneConstNR = false;
23452 // There is no FSQRT for 512-bits, but there is RSQRT14.
23453 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23454 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23455 if (RefinementSteps == 0 && !Reciprocal)
23456 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23457 return Estimate;
23458 }
23459
23460 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23461 Subtarget.hasFP16()) {
23462     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23463 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23464 RefinementSteps = 0;
23465
23466 if (VT == MVT::f16) {
23467 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23468 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23469 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23470 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23471 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23472 }
23473
23474 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23475 }
23476 return SDValue();
23477}
23478
23479/// The minimum architected relative accuracy is 2^-12. We need one
23480/// Newton-Raphson step to have a good float result (24 bits of precision).
23481SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23482 int Enabled,
23483 int &RefinementSteps) const {
23484 SDLoc DL(Op);
23485 EVT VT = Op.getValueType();
23486
23487 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23488 // It is likely not profitable to do this for f64 because a double-precision
23489 // reciprocal estimate with refinement on x86 prior to FMA requires
23490 // 15 instructions: convert to single, rcpss, convert back to double, refine
23491 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23492 // along with FMA, this could be a throughput win.
23493
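        // The refinement applied later (outside this function) is the standard
        // Newton-Raphson step for 1/x: Est' = Est * (2.0 - x * Est), again
        // roughly doubling the ~12 accurate bits of the hardware estimate.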
23494 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23495 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23496 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23497 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23498 // Enable estimate codegen with 1 refinement step for vector division.
23499 // Scalar division estimates are disabled because they break too much
23500 // real-world code. These defaults are intended to match GCC behavior.
23501 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23502 return SDValue();
23503
23504 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23505 RefinementSteps = 1;
23506
23507 // There is no FSQRT for 512-bits, but there is RCP14.
23508 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23509 return DAG.getNode(Opcode, DL, VT, Op);
23510 }
23511
23512 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23513 Subtarget.hasFP16()) {
23514 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23515 RefinementSteps = 0;
23516
23517 if (VT == MVT::f16) {
23518 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23519 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23520 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23521 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23522 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23523 }
23524
23525 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23526 }
23527 return SDValue();
23528}
23529
23530/// If we have at least two divisions that use the same divisor, convert to
23531/// multiplication by a reciprocal. This may need to be adjusted for a given
23532/// CPU if a division's cost is not at least twice the cost of a multiplication.
23533/// This is because we still need one division to calculate the reciprocal and
23534/// then we need two multiplies by that reciprocal as replacements for the
23535/// original divisions.
23536unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23537 return 2;
23538}
23539
23540SDValue
23541X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23542 SelectionDAG &DAG,
23543 SmallVectorImpl<SDNode *> &Created) const {
23544 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23545 if (isIntDivCheap(N->getValueType(0), Attr))
23546 return SDValue(N,0); // Lower SDIV as SDIV
23547
23548   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23549          "Unexpected divisor!");
23550
23551   // Only perform this transform if CMOV is supported, otherwise the select
23552 // below will become a branch.
23553 if (!Subtarget.canUseCMOV())
23554 return SDValue();
23555
23556 // fold (sdiv X, pow2)
23557 EVT VT = N->getValueType(0);
23558 // FIXME: Support i8.
23559 if (VT != MVT::i16 && VT != MVT::i32 &&
23560 !(Subtarget.is64Bit() && VT == MVT::i64))
23561 return SDValue();
23562
23563 unsigned Lg2 = Divisor.countTrailingZeros();
23564
23565 // If the divisor is 2 or -2, the default expansion is better.
23566 if (Lg2 == 1)
23567 return SDValue();
23568
23569 SDLoc DL(N);
23570 SDValue N0 = N->getOperand(0);
23571 SDValue Zero = DAG.getConstant(0, DL, VT);
23572 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
23573 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
23574
23575 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
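        // Worked example with VT = i32, divisor = 4 (Lg2 = 2) and N0 = -7:
        //   N0 < 0, so select N0 + 3 = -4; -4 >> 2 (arithmetic) = -1 == -7 / 4.
        // For N0 = 7 the select keeps 7, and 7 >> 2 = 1 == 7 / 4.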
23576 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
23577 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
23578 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
23579
23580 Created.push_back(Cmp.getNode());
23581 Created.push_back(Add.getNode());
23582 Created.push_back(CMov.getNode());
23583
23584 // Divide by pow2.
23585 SDValue SRA =
23586 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
23587
23588 // If we're dividing by a positive value, we're done. Otherwise, we must
23589 // negate the result.
23590 if (Divisor.isNonNegative())
23591 return SRA;
23592
23593 Created.push_back(SRA.getNode());
23594 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
23595}
23596
23597/// Result of 'and' is compared against zero. Change to a BT node if possible.
23598/// Returns the BT node and the condition code needed to use it.
23599static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23600 SelectionDAG &DAG, X86::CondCode &X86CC) {
23601   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23602 SDValue Op0 = And.getOperand(0);
23603 SDValue Op1 = And.getOperand(1);
23604 if (Op0.getOpcode() == ISD::TRUNCATE)
23605 Op0 = Op0.getOperand(0);
23606 if (Op1.getOpcode() == ISD::TRUNCATE)
23607 Op1 = Op1.getOperand(0);
23608
23609 SDValue Src, BitNo;
23610 if (Op1.getOpcode() == ISD::SHL)
23611 std::swap(Op0, Op1);
23612 if (Op0.getOpcode() == ISD::SHL) {
23613 if (isOneConstant(Op0.getOperand(0))) {
23614 // If we looked past a truncate, check that it's only truncating away
23615 // known zeros.
23616 unsigned BitWidth = Op0.getValueSizeInBits();
23617 unsigned AndBitWidth = And.getValueSizeInBits();
23618 if (BitWidth > AndBitWidth) {
23619 KnownBits Known = DAG.computeKnownBits(Op0);
23620 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23621 return SDValue();
23622 }
23623 Src = Op1;
23624 BitNo = Op0.getOperand(1);
23625 }
23626 } else if (Op1.getOpcode() == ISD::Constant) {
23627 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23628 uint64_t AndRHSVal = AndRHS->getZExtValue();
23629 SDValue AndLHS = Op0;
23630
23631 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23632 Src = AndLHS.getOperand(0);
23633 BitNo = AndLHS.getOperand(1);
23634 } else {
23635 // Use BT if the immediate can't be encoded in a TEST instruction or we
23636       // are optimizing for size and the immediate won't fit in a byte.
23637 bool OptForSize = DAG.shouldOptForSize();
23638 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23639 isPowerOf2_64(AndRHSVal)) {
23640 Src = AndLHS;
23641 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23642 Src.getValueType());
23643 }
23644 }
23645 }
23646
23647 // No patterns found, give up.
23648 if (!Src.getNode())
23649 return SDValue();
23650
23651 // Remove any bit flip.
23652 if (isBitwiseNot(Src)) {
23653 Src = Src.getOperand(0);
23654 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23655 }
23656
23657 // Attempt to create the X86ISD::BT node.
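        // BT copies bit 'BitNo' of Src into CF, so '(X & (1 << N)) == 0' maps
        // to CF == 0 (COND_AE) and '!= 0' maps to CF == 1 (COND_B).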
23658 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23659 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23660 return BT;
23661 }
23662
23663 return SDValue();
23664}
23665
23666// Check if pre-AVX condcode can be performed by a single FCMP op.
23667static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23668 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23669}
23670
23671/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23672/// CMPs.
23673static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23674 SDValue &Op1, bool &IsAlwaysSignaling) {
23675 unsigned SSECC;
23676 bool Swap = false;
23677
23678 // SSE Condition code mapping:
23679 // 0 - EQ
23680 // 1 - LT
23681 // 2 - LE
23682 // 3 - UNORD
23683 // 4 - NEQ
23684 // 5 - NLT
23685 // 6 - NLE
23686 // 7 - ORD
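        // Values 8 (EQ_UQ) and 12 (NEQ_OQ) below only exist in the extended
        // AVX VCMP encoding; pre-AVX CMPPS/CMPPD accept predicates 0-7 only,
        // which is why cheapX86FSETCC_SSE() rejects SETUEQ/SETONE.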
23687 switch (SetCCOpcode) {
23688   default: llvm_unreachable("Unexpected SETCC condition");
23689 case ISD::SETOEQ:
23690 case ISD::SETEQ: SSECC = 0; break;
23691 case ISD::SETOGT:
23692   case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
23693 case ISD::SETLT:
23694 case ISD::SETOLT: SSECC = 1; break;
23695 case ISD::SETOGE:
23696   case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
23697 case ISD::SETLE:
23698 case ISD::SETOLE: SSECC = 2; break;
23699 case ISD::SETUO: SSECC = 3; break;
23700 case ISD::SETUNE:
23701 case ISD::SETNE: SSECC = 4; break;
23702   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
23703 case ISD::SETUGE: SSECC = 5; break;
23704   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
23705 case ISD::SETUGT: SSECC = 6; break;
23706 case ISD::SETO: SSECC = 7; break;
23707 case ISD::SETUEQ: SSECC = 8; break;
23708 case ISD::SETONE: SSECC = 12; break;
23709 }
23710 if (Swap)
23711 std::swap(Op0, Op1);
23712
23713 switch (SetCCOpcode) {
23714 default:
23715 IsAlwaysSignaling = true;
23716 break;
23717 case ISD::SETEQ:
23718 case ISD::SETOEQ:
23719 case ISD::SETUEQ:
23720 case ISD::SETNE:
23721 case ISD::SETONE:
23722 case ISD::SETUNE:
23723 case ISD::SETO:
23724 case ISD::SETUO:
23725 IsAlwaysSignaling = false;
23726 break;
23727 }
23728
23729 return SSECC;
23730}
23731
23732/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23733/// concatenate the result back.
23734static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23735 ISD::CondCode Cond, SelectionDAG &DAG,
23736 const SDLoc &dl) {
23737   assert(VT.isInteger() && VT == LHS.getValueType() &&
23738          VT == RHS.getValueType() && "Unsupported VTs!");
23739
23740 SDValue CC = DAG.getCondCode(Cond);
23741
23742 // Extract the LHS Lo/Hi vectors
23743 SDValue LHS1, LHS2;
23744 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23745
23746 // Extract the RHS Lo/Hi vectors
23747 SDValue RHS1, RHS2;
23748 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23749
23750 // Issue the operation on the smaller types and concatenate the result back
23751 EVT LoVT, HiVT;
23752 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23753 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23754 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23755 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23756}
23757
23758static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23759
23760 SDValue Op0 = Op.getOperand(0);
23761 SDValue Op1 = Op.getOperand(1);
23762 SDValue CC = Op.getOperand(2);
23763 MVT VT = Op.getSimpleValueType();
23764 SDLoc dl(Op);
23765
23766   assert(VT.getVectorElementType() == MVT::i1 &&
23767          "Cannot set masked compare for this operation");
23768
23769 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23770
23771 // Prefer SETGT over SETLT.
23772 if (SetCCOpcode == ISD::SETLT) {
23773 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23774 std::swap(Op0, Op1);
23775 }
23776
23777 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23778}
23779
23780/// Given a buildvector constant, return a new vector constant with each element
23781/// incremented or decremented. If incrementing or decrementing would result in
23782/// unsigned overflow or underflow or this is not a simple vector constant,
23783/// return an empty value.
23784static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
23785 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23786 if (!BV)
23787 return SDValue();
23788
23789 MVT VT = V.getSimpleValueType();
23790 MVT EltVT = VT.getVectorElementType();
23791 unsigned NumElts = VT.getVectorNumElements();
23792 SmallVector<SDValue, 8> NewVecC;
23793 SDLoc DL(V);
23794 for (unsigned i = 0; i < NumElts; ++i) {
23795 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23796 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23797 return SDValue();
23798
23799 // Avoid overflow/underflow.
23800 const APInt &EltC = Elt->getAPIntValue();
23801 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23802 return SDValue();
23803
23804 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23805 }
23806
23807 return DAG.getBuildVector(VT, DL, NewVecC);
23808}
23809
23810/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23811/// Op0 u<= Op1:
23812/// t = psubus Op0, Op1
23813/// pcmpeq t, <0..0>
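      /// For instance, with i8 elements Op0 = 5, Op1 = 9: psubus gives 0 and the
      /// pcmpeq reports true (5 u<= 9); with Op0 = 10, Op1 = 9 it gives 1 and
      /// reports false. In general usubsat(a, b) == 0 iff a u<= b.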
23814static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23815 ISD::CondCode Cond, const SDLoc &dl,
23816 const X86Subtarget &Subtarget,
23817 SelectionDAG &DAG) {
23818 if (!Subtarget.hasSSE2())
23819 return SDValue();
23820
23821 MVT VET = VT.getVectorElementType();
23822 if (VET != MVT::i8 && VET != MVT::i16)
23823 return SDValue();
23824
23825 switch (Cond) {
23826 default:
23827 return SDValue();
23828 case ISD::SETULT: {
23829     // If the comparison is against a constant, we can turn this into a
23830     // setule. With psubus, setule does not require a swap. This is
23831     // beneficial because the constant in the register is no longer
23832     // clobbered as the destination, so it can be hoisted out of a loop.
23833 // Only do this pre-AVX since vpcmp* is no longer destructive.
23834 if (Subtarget.hasAVX())
23835 return SDValue();
23836 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
23837 if (!ULEOp1)
23838 return SDValue();
23839 Op1 = ULEOp1;
23840 break;
23841 }
23842 case ISD::SETUGT: {
23843 // If the comparison is against a constant, we can turn this into a setuge.
23844 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23845 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23846 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23847 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23848 if (!UGEOp1)
23849 return SDValue();
23850 Op1 = Op0;
23851 Op0 = UGEOp1;
23852 break;
23853 }
23854 // Psubus is better than flip-sign because it requires no inversion.
23855 case ISD::SETUGE:
23856 std::swap(Op0, Op1);
23857 break;
23858 case ISD::SETULE:
23859 break;
23860 }
23861
23862 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23863 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23864 DAG.getConstant(0, dl, VT));
23865}
23866
23867static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23868 SelectionDAG &DAG) {
23869 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23870 Op.getOpcode() == ISD::STRICT_FSETCCS;
23871 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23872 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23873 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23874 MVT VT = Op->getSimpleValueType(0);
23875 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23876 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23877 SDLoc dl(Op);
23878
23879 if (isFP) {
23880#ifndef NDEBUG
23881 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23882     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23883#endif
23884
23885 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23886 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23887
23888 // If we have a strict compare with a vXi1 result and the input is 128/256
23889 // bits we can't use a masked compare unless we have VLX. If we use a wider
23890 // compare like we do for non-strict, we might trigger spurious exceptions
23891     // from the upper elements. Instead emit an AVX compare and convert to mask.
23892 unsigned Opc;
23893 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23894 (!IsStrict || Subtarget.hasVLX() ||
23895 Op0.getSimpleValueType().is512BitVector())) {
23896#ifndef NDEBUG
23897 unsigned Num = VT.getVectorNumElements();
23898       assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23899#endif
23900 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23901 } else {
23902 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23903 // The SSE/AVX packed FP comparison nodes are defined with a
23904 // floating-point vector result that matches the operand type. This allows
23905 // them to work with an SSE1 target (integer vector types are not legal).
23906 VT = Op0.getSimpleValueType();
23907 }
23908
23909 SDValue Cmp;
23910 bool IsAlwaysSignaling;
23911 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23912 if (!Subtarget.hasAVX()) {
23913       // TODO: We could use the following steps to handle a quiet compare with
23914       // signaling encodings.
23915       // 1. Get ordered masks from a quiet ISD::SETO
23916       // 2. Use the masks to mask potential unordered elements in operands A and B
23917       // 3. Get the compare results of the masked A and B
23918       // 4. Calculate the final result using the mask and the result from 3
23919 // But currently, we just fall back to scalar operations.
23920 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23921 return SDValue();
23922
23923 // Insert an extra signaling instruction to raise exception.
23924 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23925 SDValue SignalCmp = DAG.getNode(
23926 Opc, dl, {VT, MVT::Other},
23927 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23928 // FIXME: It seems we need to update the flags of all new strict nodes.
23929 // Otherwise, mayRaiseFPException in MI will return false due to
23930 // NoFPExcept = false by default. However, I didn't find it in other
23931 // patches.
23932 SignalCmp->setFlags(Op->getFlags());
23933 Chain = SignalCmp.getValue(1);
23934 }
23935
23936 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23937 // emit two comparisons and a logic op to tie them together.
23938 if (!cheapX86FSETCC_SSE(Cond)) {
23939 // LLVM predicate is SETUEQ or SETONE.
23940 unsigned CC0, CC1;
23941 unsigned CombineOpc;
23942 if (Cond == ISD::SETUEQ) {
23943 CC0 = 3; // UNORD
23944 CC1 = 0; // EQ
23945 CombineOpc = X86ISD::FOR;
23946 } else {
23947         assert(Cond == ISD::SETONE);
23948 CC0 = 7; // ORD
23949 CC1 = 4; // NEQ
23950 CombineOpc = X86ISD::FAND;
23951 }
23952
23953 SDValue Cmp0, Cmp1;
23954 if (IsStrict) {
23955 Cmp0 = DAG.getNode(
23956 Opc, dl, {VT, MVT::Other},
23957 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23958 Cmp1 = DAG.getNode(
23959 Opc, dl, {VT, MVT::Other},
23960 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23961 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23962 Cmp1.getValue(1));
23963 } else {
23964 Cmp0 = DAG.getNode(
23965 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23966 Cmp1 = DAG.getNode(
23967 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23968 }
23969 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23970 } else {
23971 if (IsStrict) {
23972 Cmp = DAG.getNode(
23973 Opc, dl, {VT, MVT::Other},
23974 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23975 Chain = Cmp.getValue(1);
23976 } else
23977 Cmp = DAG.getNode(
23978 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23979 }
23980 } else {
23981 // Handle all other FP comparisons here.
23982 if (IsStrict) {
23983 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23984 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23985 Cmp = DAG.getNode(
23986 Opc, dl, {VT, MVT::Other},
23987 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23988 Chain = Cmp.getValue(1);
23989 } else
23990 Cmp = DAG.getNode(
23991 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23992 }
23993
23994 if (VT.getFixedSizeInBits() >
23995 Op.getSimpleValueType().getFixedSizeInBits()) {
23996 // We emitted a compare with an XMM/YMM result. Finish converting to a
23997 // mask register using a vptestm.
23998 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23999 Cmp = DAG.getBitcast(CastVT, Cmp);
24000 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24001 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24002 } else {
24003 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24004 // the result type of SETCC. The bitcast is expected to be optimized
24005 // away during combining/isel.
24006 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24007 }
24008
24009 if (IsStrict)
24010 return DAG.getMergeValues({Cmp, Chain}, dl);
24011
24012 return Cmp;
24013 }
24014
24015   assert(!IsStrict && "Strict SETCC only handles FP operands.");
24016
24017 MVT VTOp0 = Op0.getSimpleValueType();
24018 (void)VTOp0;
24019   assert(VTOp0 == Op1.getSimpleValueType() &&
24020          "Expected operands with same type!");
24021   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24022          "Invalid number of packed elements for source and destination!");
24023
24024 // The non-AVX512 code below works under the assumption that source and
24025 // destination types are the same.
24026   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24027          "Value types for source and destination must be the same!");
24028
24029 // The result is boolean, but operands are int/float
24030 if (VT.getVectorElementType() == MVT::i1) {
24031     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
24032     // but there is no compare instruction for i8 and i16 elements in KNL.
24033     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24034            "Unexpected operand type");
24035 return LowerIntVSETCC_AVX512(Op, DAG);
24036 }
24037
24038 // Lower using XOP integer comparisons.
24039 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24040 // Translate compare code to XOP PCOM compare mode.
24041 unsigned CmpMode = 0;
24042 switch (Cond) {
24043     default: llvm_unreachable("Unexpected SETCC condition");
24044 case ISD::SETULT:
24045 case ISD::SETLT: CmpMode = 0x00; break;
24046 case ISD::SETULE:
24047 case ISD::SETLE: CmpMode = 0x01; break;
24048 case ISD::SETUGT:
24049 case ISD::SETGT: CmpMode = 0x02; break;
24050 case ISD::SETUGE:
24051 case ISD::SETGE: CmpMode = 0x03; break;
24052 case ISD::SETEQ: CmpMode = 0x04; break;
24053 case ISD::SETNE: CmpMode = 0x05; break;
24054 }
24055
24056 // Are we comparing unsigned or signed integers?
24057 unsigned Opc =
24058 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24059
24060 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24061 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24062 }
24063
24064 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24065 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24066 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24067 SDValue BC0 = peekThroughBitcasts(Op0);
24068 if (BC0.getOpcode() == ISD::AND) {
24069 APInt UndefElts;
24070 SmallVector<APInt, 64> EltBits;
24071 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
24072 VT.getScalarSizeInBits(), UndefElts,
24073 EltBits, false, false)) {
24074 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
24075 Cond = ISD::SETEQ;
24076 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24077 }
24078 }
24079 }
24080 }
24081
24082 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
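        // e.g. for v16i8 X and C = 16 (bit 4): shift left by 8 - 4 - 1 = 3 so
        // bit 4 lands in the sign bit, then arithmetic-shift right by 7 to
        // splat it, giving all-ones when the bit is set and zero otherwise.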
24083 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24084 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24085 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24086 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24087 unsigned BitWidth = VT.getScalarSizeInBits();
24088 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24089
24090 SDValue Result = Op0.getOperand(0);
24091 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24092 DAG.getConstant(ShiftAmt, dl, VT));
24093 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24094 DAG.getConstant(BitWidth - 1, dl, VT));
24095 return Result;
24096 }
24097 }
24098
24099 // Break 256-bit integer vector compare into smaller ones.
24100 if (VT.is256BitVector() && !Subtarget.hasInt256())
24101 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24102
24103 // Break 512-bit integer vector compare into smaller ones.
24104 // TODO: Try harder to use VPCMPx + VPMOV2x?
24105 if (VT.is512BitVector())
24106 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24107
24108 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24109 // not-of-PCMPEQ:
24110 // X != INT_MIN --> X >s INT_MIN
24111 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24112 // +X != 0 --> +X >s 0
24113 APInt ConstValue;
24114 if (Cond == ISD::SETNE &&
24115 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24116 if (ConstValue.isMinSignedValue())
24117 Cond = ISD::SETGT;
24118 else if (ConstValue.isMaxSignedValue())
24119 Cond = ISD::SETLT;
24120 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24121 Cond = ISD::SETGT;
24122 }
24123
24124 // If both operands are known non-negative, then an unsigned compare is the
24125 // same as a signed compare and there's no need to flip signbits.
24126 // TODO: We could check for more general simplifications here since we're
24127 // computing known bits.
24128 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24129 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24130
24131 // Special case: Use min/max operations for unsigned compares.
24132 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24133 if (ISD::isUnsignedIntSetCC(Cond) &&
24134 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24135 TLI.isOperationLegal(ISD::UMIN, VT)) {
24136 // If we have a constant operand, increment/decrement it and change the
24137 // condition to avoid an invert.
24138 if (Cond == ISD::SETUGT) {
24139 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24140 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
24141 Op1 = UGTOp1;
24142 Cond = ISD::SETUGE;
24143 }
24144 }
24145 if (Cond == ISD::SETULT) {
24146 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24147 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
24148 Op1 = ULTOp1;
24149 Cond = ISD::SETULE;
24150 }
24151 }
24152 bool Invert = false;
24153 unsigned Opc;
24154 switch (Cond) {
24155     default: llvm_unreachable("Unexpected condition code");
24156     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
24157 case ISD::SETULE: Opc = ISD::UMIN; break;
24158     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
24159 case ISD::SETUGE: Opc = ISD::UMAX; break;
24160 }
24161
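          // Relies on: a u<= b <=> umin(a, b) == a and a u>= b <=> umax(a, b) == a;
          // the PCMPEQ below compares Op0 against that min/max result.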
24162 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24163 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24164
24165 // If the logical-not of the result is required, perform that now.
24166 if (Invert)
24167 Result = DAG.getNOT(dl, Result, VT);
24168
24169 return Result;
24170 }
24171
24172 // Try to use SUBUS and PCMPEQ.
24173 if (FlipSigns)
24174 if (SDValue V =
24175 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24176 return V;
24177
24178 // We are handling one of the integer comparisons here. Since SSE only has
24179 // GT and EQ comparisons for integer, swapping operands and multiple
24180 // operations may be required for some comparisons.
24181 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24182 : X86ISD::PCMPGT;
24183 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24184 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24185 bool Invert = Cond == ISD::SETNE ||
24186 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24187
24188 if (Swap)
24189 std::swap(Op0, Op1);
24190
24191 // Check that the operation in question is available (most are plain SSE2,
24192 // but PCMPGTQ and PCMPEQQ have different requirements).
24193 if (VT == MVT::v2i64) {
24194 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24195       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24196
24197 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24198 // the odd elements over the even elements.
24199 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24200 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24201 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24202
24203 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24204 static const int MaskHi[] = { 1, 1, 3, 3 };
24205 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24206
24207 return DAG.getBitcast(VT, Result);
24208 }
24209
24210 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24211 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24212 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
24213
24214 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24215 static const int MaskHi[] = { 1, 1, 3, 3 };
24216 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24217
24218 return DAG.getBitcast(VT, Result);
24219 }
24220
24221 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24222 // bits of the inputs before performing those operations. The lower
24223 // compare is always unsigned.
24224 SDValue SB;
24225 if (FlipSigns) {
24226 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
24227 } else {
24228 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
24229 }
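            // The low-dword comparison within a 64-bit compare is unsigned
            // regardless of the predicate, so its sign bit is always flipped;
            // the high dword only needs the flip when the original 64-bit
            // compare itself is unsigned.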
24230 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24231 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24232
24233 // Cast everything to the right type.
24234 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24235 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24236
24237 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24238 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24239 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24240
24241 // Create masks for only the low parts/high parts of the 64 bit integers.
24242 static const int MaskHi[] = { 1, 1, 3, 3 };
24243 static const int MaskLo[] = { 0, 0, 2, 2 };
24244 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24245 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24246 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24247
24248 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24249 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24250
24251 if (Invert)
24252 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24253
24254 return DAG.getBitcast(VT, Result);
24255 }
24256
24257 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24258 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24259 // pcmpeqd + pshufd + pand.
24260       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24261
24262 // First cast everything to the right type.
24263 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24264 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24265
24266 // Do the compare.
24267 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24268
24269 // Make sure the lower and upper halves are both all-ones.
24270 static const int Mask[] = { 1, 0, 3, 2 };
24271 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24272 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24273
24274 if (Invert)
24275 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24276
24277 return DAG.getBitcast(VT, Result);
24278 }
24279 }
24280
24281 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24282 // bits of the inputs before performing those operations.
24283 if (FlipSigns) {
24284 MVT EltVT = VT.getVectorElementType();
24285 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24286 VT);
24287 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24288 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24289 }
24290
24291 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24292
24293 // If the logical-not of the result is required, perform that now.
24294 if (Invert)
24295 Result = DAG.getNOT(dl, Result, VT);
24296
24297 return Result;
24298}
24299
24300// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24301static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24302 const SDLoc &dl, SelectionDAG &DAG,
24303 const X86Subtarget &Subtarget,
24304 SDValue &X86CC) {
24305 // Only support equality comparisons.
24306 if (CC != ISD::SETEQ && CC != ISD::SETNE)
24307 return SDValue();
24308
24309 // Must be a bitcast from vXi1.
24310 if (Op0.getOpcode() != ISD::BITCAST)
24311 return SDValue();
24312
24313 Op0 = Op0.getOperand(0);
24314 MVT VT = Op0.getSimpleValueType();
24315 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24316 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24317 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24318 return SDValue();
24319
24320 X86::CondCode X86Cond;
24321 if (isNullConstant(Op1)) {
24322 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24323 } else if (isAllOnesConstant(Op1)) {
24324 // C flag is set for all ones.
24325 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24326 } else
24327 return SDValue();
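        // KORTEST sets ZF when the OR of its operands is all zeros and CF when
        // it is all ones; KTEST sets ZF when the AND of its operands is all
        // zeros, which is why the all-ones (CF-based) case above is only
        // reached through KORTEST.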
24328
24329   // If the input is an AND, we can combine its operands into the KTEST.
24330 bool KTestable = false;
24331 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24332 KTestable = true;
24333 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24334 KTestable = true;
24335 if (!isNullConstant(Op1))
24336 KTestable = false;
24337 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24338 SDValue LHS = Op0.getOperand(0);
24339 SDValue RHS = Op0.getOperand(1);
24340 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24341 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24342 }
24343
24344   // If the input is an OR, we can combine its operands into the KORTEST.
24345 SDValue LHS = Op0;
24346 SDValue RHS = Op0;
24347 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24348 LHS = Op0.getOperand(0);
24349 RHS = Op0.getOperand(1);
24350 }
24351
24352 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24353 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24354}
24355
24356/// Emit flags for the given setcc condition and operands. Also returns the
24357/// corresponding X86 condition code constant in X86CC.
24358SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24359 ISD::CondCode CC, const SDLoc &dl,
24360 SelectionDAG &DAG,
24361 SDValue &X86CC) const {
24362 // Optimize to BT if possible.
24363 // Lower (X & (1 << N)) == 0 to BT(X, N).
24364 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24365 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24366 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
24367 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24368 X86::CondCode X86CondCode;
24369 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24370 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24371 return BT;
24372 }
24373 }
24374
24375   // Try to use PTEST/PMOVMSKB for a tree of ORs compared for equality with 0.
24376 // TODO: We could do AND tree with all 1s as well by using the C flag.
24377 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
24378 if (SDValue CmpZ =
24379 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
24380 return CmpZ;
24381
24382 // Try to lower using KORTEST or KTEST.
24383 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24384 return Test;
24385
24386 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
24387 // these.
24388 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
24389 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24390 // If the input is a setcc, then reuse the input setcc or use a new one with
24391 // the inverted condition.
24392 if (Op0.getOpcode() == X86ISD::SETCC) {
24393 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24394
24395 X86CC = Op0.getOperand(0);
24396 if (Invert) {
24397 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24398 CCode = X86::GetOppositeBranchCondition(CCode);
24399 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24400 }
24401
24402 return Op0.getOperand(1);
24403 }
24404 }
24405
24406   // Try to use the carry flag from the add in place of a separate CMP for:
24407 // (seteq (add X, -1), -1). Similar for setne.
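        // Adding all-ones (-1) produces a carry-out exactly when X != 0, so
        // X == 0 corresponds to CF == 0 (COND_AE) and X != 0 to CF == 1
        // (COND_B), matching the mapping below.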
24408 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24409 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24410 if (isProfitableToUseFlagOp(Op0)) {
24411 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24412
24413 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24414 Op0.getOperand(1));
24415 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24416 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24417 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24418 return SDValue(New.getNode(), 1);
24419 }
24420 }
24421
24422 X86::CondCode CondCode =
24423 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24424   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24425
24426 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24427 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24428 return EFLAGS;
24429}
24430
24431SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24432
24433 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24434 Op.getOpcode() == ISD::STRICT_FSETCCS;
24435 MVT VT = Op->getSimpleValueType(0);
24436
24437 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24438
24439   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24440 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24441 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24442 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24443 SDLoc dl(Op);
24444 ISD::CondCode CC =
24445 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24446
24447 // Handle f128 first, since one possible outcome is a normal integer
24448 // comparison which gets handled by emitFlagsForSetcc.
24449 if (Op0.getValueType() == MVT::f128) {
24450 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24451 Op.getOpcode() == ISD::STRICT_FSETCCS);
24452
24453 // If softenSetCCOperands returned a scalar, use it.
24454 if (!Op1.getNode()) {
24455       assert(Op0.getValueType() == Op.getValueType() &&
24456              "Unexpected setcc expansion!");
24457 if (IsStrict)
24458 return DAG.getMergeValues({Op0, Chain}, dl);
24459 return Op0;
24460 }
24461 }
24462
24463 if (Op0.getSimpleValueType().isInteger()) {
24464     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
24465     // reduces the number of EFLAGS bits read (the GE conditions don't read ZF);
24466     // this may translate to fewer uops depending on the uarch implementation. The
24467     // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24468     // canonicalize to that CondCode.
24469     // NOTE: Only do this if incrementing the constant doesn't increase the bit
24470     // encoding size - so it must either already be an i8 or i32 immediate, or it
24471     // shrinks down to that. We don't do this for any i64's to avoid additional
24472 // constant materializations.
24473 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24474 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24475 const APInt &Op1Val = Op1C->getAPIntValue();
24476 if (!Op1Val.isZero()) {
24477 // Ensure the constant+1 doesn't overflow.
24478 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24479 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24480 APInt Op1ValPlusOne = Op1Val + 1;
24481 if (Op1ValPlusOne.isSignedIntN(32) &&
24482 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24483 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24484 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24485 : ISD::CondCode::SETUGE;
24486 }
24487 }
24488 }
24489 }
24490
24491 SDValue X86CC;
24492 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24493 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24494 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24495 }
24496
24497 // Handle floating point.
24498 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24499 if (CondCode == X86::COND_INVALID)
24500 return SDValue();
24501
24502 SDValue EFLAGS;
24503 if (IsStrict) {
24504 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24505 EFLAGS =
24506 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24507 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24508 Chain = EFLAGS.getValue(1);
24509 } else {
24510 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24511 }
24512
24513 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24514 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24515 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24516}
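
The constant-adjustment rule in the integer path above (SGT/UGT with a constant becomes SGE/UGE with constant+1, but only when the immediate encoding does not grow) can be modelled in isolation. The sketch below is a simplified scalar version covering only the signed case; the helper name and the int64_t model are hypothetical — the real code works on APInt values inside the DAG.

#include <cassert>
#include <cstdint>

// Returns true and rewrites C in place when "x > C" may become "x >= C+1"
// without growing the immediate encoding (an i8 immediate stays i8, an i32
// immediate stays i32).
static bool canonicalizeSGT(int64_t &C) {
  if (C == 0 || C == INT64_MAX)     // Zero is left to other combines; C+1 must not overflow.
    return false;
  int64_t CPlusOne = C + 1;
  bool fitsI8   = (C >= INT8_MIN && C <= INT8_MAX);
  bool fitsI8P  = (CPlusOne >= INT8_MIN && CPlusOne <= INT8_MAX);
  bool fitsI32P = (CPlusOne >= INT32_MIN && CPlusOne <= INT32_MAX);
  if (!fitsI32P || (fitsI8 && !fitsI8P))
    return false;                   // The rewrite would grow the encoding.
  C = CPlusOne;                     // x > C  ==>  x >= C+1
  return true;
}

int main() {
  int64_t C = 5;
  assert(canonicalizeSGT(C) && C == 6);   // x > 5   ->  x >= 6
  C = 127;                                // i8 max: 128 no longer fits in i8.
  assert(!canonicalizeSGT(C));
}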
24517
24518SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24519 SDValue LHS = Op.getOperand(0);
24520 SDValue RHS = Op.getOperand(1);
24521 SDValue Carry = Op.getOperand(2);
24522 SDValue Cond = Op.getOperand(3);
24523 SDLoc DL(Op);
24524
24525 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24526 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24527
24528 // Recreate the carry if needed.
24529 EVT CarryVT = Carry.getValueType();
24530 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24531 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24532
24533 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24534 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24535 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24536}
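
SETCCCARRY typically appears when a comparison of a wider-than-native integer is legalized into a SUB on the low half and an SBB on the high half; the borrow coming out of the SBB is the comparison result that the code above recovers via X86::COND_B. A minimal standalone sketch of that idea, assuming a two-limb unsigned 128-bit value (names are hypothetical):

#include <cassert>
#include <cstdint>

struct U128 { uint64_t Lo, Hi; };

static bool ult128(U128 A, U128 B) {
  // SUB on the low limb produces a borrow (the x86 carry flag) ...
  uint64_t Borrow = (A.Lo < B.Lo) ? 1 : 0;
  // ... which SBB consumes on the high limb; the borrow coming out of that
  // SBB is the final unsigned "less than" answer (X86::COND_B above).
  return A.Hi < B.Hi || (A.Hi == B.Hi && Borrow);
}

int main() {
  assert(ult128({~0ull, 1}, {0, 2}));   // 0x1FFFF... < 0x20000...
  assert(ult128({5, 7}, {9, 7}));       // high limbs equal, 5 < 9
}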
24537
24538// This function returns three things: the arithmetic computation itself
24539// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24540// flag and the condition code define the case in which the arithmetic
24541// computation overflows.
24542static std::pair<SDValue, SDValue>
24543getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24544 assert(Op.getResNo() == 0 && "Unexpected result number!");
24545 SDValue Value, Overflow;
24546 SDValue LHS = Op.getOperand(0);
24547 SDValue RHS = Op.getOperand(1);
24548 unsigned BaseOp = 0;
24549 SDLoc DL(Op);
24550 switch (Op.getOpcode()) {
24551 default: llvm_unreachable("Unknown ovf instruction!");
24552 case ISD::SADDO:
24553 BaseOp = X86ISD::ADD;
24554 Cond = X86::COND_O;
24555 break;
24556 case ISD::UADDO:
24557 BaseOp = X86ISD::ADD;
24558 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24559 break;
24560 case ISD::SSUBO:
24561 BaseOp = X86ISD::SUB;
24562 Cond = X86::COND_O;
24563 break;
24564 case ISD::USUBO:
24565 BaseOp = X86ISD::SUB;
24566 Cond = X86::COND_B;
24567 break;
24568 case ISD::SMULO:
24569 BaseOp = X86ISD::SMUL;
24570 Cond = X86::COND_O;
24571 break;
24572 case ISD::UMULO:
24573 BaseOp = X86ISD::UMUL;
24574 Cond = X86::COND_O;
24575 break;
24576 }
24577
24578 if (BaseOp) {
24579 // Also sets EFLAGS.
24580 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24581 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24582 Overflow = Value.getValue(1);
24583 }
24584
24585 return std::make_pair(Value, Overflow);
24586}
24587
24588static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24589 // Lower the "add/sub/mul with overflow" instruction into a regular arithmetic instruction plus
24590 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24591 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24592 // has only one use.
24593 SDLoc DL(Op);
24594 X86::CondCode Cond;
24595 SDValue Value, Overflow;
24596 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24597
24598 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24599 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24600 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24601}
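
What the overflow nodes compute can be reproduced with plain scalar code. The sketch below uses the Clang/GCC __builtin_add_overflow builtin as a stand-in for the ADD + SETO/SETB pair produced above; it illustrates the semantics, not the DAG lowering itself.

#include <cassert>
#include <cstdint>

int main() {
  int32_t Sum;
  // ISD::SADDO -> X86ISD::ADD + SETO (signed overflow, X86::COND_O).
  bool SignedOvf = __builtin_add_overflow(INT32_MAX, 1, &Sum);
  assert(SignedOvf);

  uint32_t USum;
  // ISD::UADDO -> X86ISD::ADD + SETB (carry out, X86::COND_B).
  bool Carry = __builtin_add_overflow(UINT32_MAX, 1u, &USum);
  assert(Carry && USum == 0);
}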
24602
24603 /// Return true if the opcode is an X86 logical comparison.
24604static bool isX86LogicalCmp(SDValue Op) {
24605 unsigned Opc = Op.getOpcode();
24606 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24607 Opc == X86ISD::FCMP)
24608 return true;
24609 if (Op.getResNo() == 1 &&
24610 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24611 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24612 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24613 return true;
24614
24615 return false;
24616}
24617
24618static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24619 if (V.getOpcode() != ISD::TRUNCATE)
24620 return false;
24621
24622 SDValue VOp0 = V.getOperand(0);
24623 unsigned InBits = VOp0.getValueSizeInBits();
24624 unsigned Bits = V.getValueSizeInBits();
24625 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24626}
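
A small illustration of why this predicate lets later code look through the truncate: when every bit dropped by the truncate is known to be zero, testing the narrow value against zero is equivalent to testing the wide value. The concrete values below are hypothetical.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Wide = 0x00000040u;        // Upper 24 bits known to be zero.
  uint8_t Narrow = (uint8_t)Wide;     // ISD::TRUNCATE
  // Whenever the dropped bits are zero, (Narrow != 0) and (Wide != 0) agree,
  // so LowerSELECT/LowerBRCOND can emit the TEST on the wider value directly.
  assert((Narrow != 0) == (Wide != 0));
}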
24627
24628SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24629 bool AddTest = true;
24630 SDValue Cond = Op.getOperand(0);
24631 SDValue Op1 = Op.getOperand(1);
24632 SDValue Op2 = Op.getOperand(2);
24633 SDLoc DL(Op);
24634 MVT VT = Op1.getSimpleValueType();
24635 SDValue CC;
24636
24637 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24638 // are available or VBLENDV if AVX is available.
24639 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24640 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24641 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24642 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24643 bool IsAlwaysSignaling;
24644 unsigned SSECC =
24645 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24646 CondOp0, CondOp1, IsAlwaysSignaling);
24647
24648 if (Subtarget.hasAVX512()) {
24649 SDValue Cmp =
24650 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24651 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24652 assert(!VT.isVector() && "Not a scalar type?");
24653 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24654 }
24655
24656 if (SSECC < 8 || Subtarget.hasAVX()) {
24657 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24658 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24659
24660 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24661 // of 3 logic instructions for size savings and potentially speed.
24662 // Unfortunately, there is no scalar form of VBLENDV.
24663
24664 // If either operand is a +0.0 constant, don't try this. We can expect to
24665 // optimize away at least one of the logic instructions later in that
24666 // case, so that sequence would be faster than a variable blend.
24667
24668 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24669 // uses XMM0 as the selection register. That may need just as many
24670 // instructions as the AND/ANDN/OR sequence due to register moves, so
24671 // don't bother.
24672 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24673 !isNullFPConstant(Op2)) {
24674 // Convert to vectors, do a VSELECT, and convert back to scalar.
24675 // All of the conversions should be optimized away.
24676 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24677 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24678 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24679 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24680
24681 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24682 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24683
24684 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24685
24686 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24687 VSel, DAG.getIntPtrConstant(0, DL));
24688 }
24689 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24690 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24691 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24692 }
24693 }
24694
24695 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24696 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24697 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24698 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24699 }
24700
24701 if (Cond.getOpcode() == ISD::SETCC) {
24702 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24703 Cond = NewCond;
24704 // If the condition was updated, it's possible that the operands of the
24705 // select were also updated (for example, EmitTest has a RAUW). Refresh
24706 // the local references to the select operands in case they got stale.
24707 Op1 = Op.getOperand(1);
24708 Op2 = Op.getOperand(2);
24709 }
24710 }
24711
24712 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24713 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24714 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24715 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24716 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24717 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24718 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24719 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24720 if (Cond.getOpcode() == X86ISD::SETCC &&
24721 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24722 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24723 SDValue Cmp = Cond.getOperand(1);
24724 SDValue CmpOp0 = Cmp.getOperand(0);
24725 unsigned CondCode = Cond.getConstantOperandVal(0);
24726
24727 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24728 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24729 // handling to keep the CMP with 0. This should be removed by
24730 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24731 // cttz_zero_undef.
24732 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24733 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24734 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24735 };
24736 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24737 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24738 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24739 // Keep Cmp.
24740 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24741 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24742 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24743 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24744
24745 // 'X - 1' sets the carry flag if X == 0.
24746 // '0 - X' sets the carry flag if X != 0.
24747 // Convert the carry flag to a -1/0 mask with sbb:
24748 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24749 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24750 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24751 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24752 SDValue Sub;
24753 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24754 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24755 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24756 } else {
24757 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24758 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24759 }
24760 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24761 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24762 Sub.getValue(1));
24763 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24764 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24765 Cmp.getOperand(0).getOpcode() == ISD::AND &&
24766 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
24767 SDValue Src1, Src2;
24768 // Returns true if Op2 is an XOR or OR operator and one of its operands
24769 // is equal to Op1, i.e. the pair is
24770 // (a, a op b) or (b, a op b).
24771 auto isOrXorPattern = [&]() {
24772 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24773 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24774 Src1 =
24775 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24776 Src2 = Op1;
24777 return true;
24778 }
24779 return false;
24780 };
24781
24782 if (isOrXorPattern()) {
24783 SDValue Neg;
24784 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24785 // We need a mask of all zeros or all ones with the same size as the other
24786 // operands.
24787 if (CmpSz > VT.getSizeInBits())
24788 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24789 else if (CmpSz < VT.getSizeInBits())
24790 Neg = DAG.getNode(ISD::AND, DL, VT,
24791 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24792 DAG.getConstant(1, DL, VT));
24793 else
24794 Neg = CmpOp0;
24795 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
24796 Neg); // -(and (x, 0x1))
24797 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24798 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24799 }
24800 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24801 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24802 ((CondCode == X86::COND_S) || // smin(x, 0)
24803 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24804 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24805 //
24806 // If the comparison is testing for a positive value, we have to invert
24807 // the sign bit mask, so only do that transform if the target has a
24808 // bitwise 'and not' instruction (the invert is free).
24809 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24810 unsigned ShCt = VT.getSizeInBits() - 1;
24811 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24812 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24813 if (CondCode == X86::COND_G)
24814 Shift = DAG.getNOT(DL, Shift, VT);
24815 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24816 }
24817 }
24818
24819 // Look past (and (setcc_carry (cmp ...)), 1).
24820 if (Cond.getOpcode() == ISD::AND &&
24821 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24822 isOneConstant(Cond.getOperand(1)))
24823 Cond = Cond.getOperand(0);
24824
24825 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24826 // setting operand in place of the X86ISD::SETCC.
24827 unsigned CondOpcode = Cond.getOpcode();
24828 if (CondOpcode == X86ISD::SETCC ||
24829 CondOpcode == X86ISD::SETCC_CARRY) {
24830 CC = Cond.getOperand(0);
24831
24832 SDValue Cmp = Cond.getOperand(1);
24833 bool IllegalFPCMov = false;
24834 if (VT.isFloatingPoint() && !VT.isVector() &&
24835 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24836 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24837
24838 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24839 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24840 Cond = Cmp;
24841 AddTest = false;
24842 }
24843 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24844 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24845 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24846 SDValue Value;
24847 X86::CondCode X86Cond;
24848 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24849
24850 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24851 AddTest = false;
24852 }
24853
24854 if (AddTest) {
24855 // Look past the truncate if the high bits are known zero.
24856 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24857 Cond = Cond.getOperand(0);
24858
24859 // We know the result of AND is compared against zero. Try to match
24860 // it to BT.
24861 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24862 X86::CondCode X86CondCode;
24863 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24864 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24865 Cond = BT;
24866 AddTest = false;
24867 }
24868 }
24869 }
24870
24871 if (AddTest) {
24872 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24873 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24874 }
24875
24876 // a < b ? -1 : 0 -> RES = ~setcc_carry
24877 // a < b ? 0 : -1 -> RES = setcc_carry
24878 // a >= b ? -1 : 0 -> RES = setcc_carry
24879 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24880 if (Cond.getOpcode() == X86ISD::SUB) {
24881 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24882
24883 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24884 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24885 (isNullConstant(Op1) || isNullConstant(Op2))) {
24886 SDValue Res =
24887 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24888 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24889 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24890 return DAG.getNOT(DL, Res, Res.getValueType());
24891 return Res;
24892 }
24893 }
24894
24895 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24896 // widen the cmov and push the truncate through. This avoids introducing a new
24897 // branch during isel and doesn't add any extensions.
24898 if (Op.getValueType() == MVT::i8 &&
24899 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24900 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24901 if (T1.getValueType() == T2.getValueType() &&
24902 // Exclude CopyFromReg to avoid partial register stalls.
24903 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24904 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24905 CC, Cond);
24906 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24907 }
24908 }
24909
24910 // Or finally, promote i8 cmovs if we have CMOV,
24911 // or i16 cmovs if it won't prevent folding a load.
24912 // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
24913 // legal, but EmitLoweredSelect() cannot deal with these extensions
24914 // being inserted between two CMOVs. (in the i16 case too TBN)
24915 // https://bugs.llvm.org/show_bug.cgi?id=40974
24916 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24917 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24918 !X86::mayFoldLoad(Op2, Subtarget))) {
24919 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24920 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24921 SDValue Ops[] = { Op2, Op1, CC, Cond };
24922 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24923 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24924 }
24925
24926 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24927 // condition is true.
24928 SDValue Ops[] = { Op2, Op1, CC, Cond };
24929 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24930}
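
The carry-to-mask rewrite used above for select-of-all-ones ('0 - X' or 'X - 1' followed by SBB, then OR with Y) can be written out in scalar form. A minimal sketch with hypothetical helper names and 32-bit operands:

#include <cassert>
#include <cstdint>

static uint32_t selectNeZeroAllOnes(uint32_t X, uint32_t Y) {
  // '0 - X' borrows (x86 CF = 1) exactly when X != 0 ...
  uint32_t Borrow = (0u < X) ? 1u : 0u;
  // ... and SETCC_CARRY/SBB turns that single bit into an all-ones/zero mask.
  uint32_t Mask = 0u - Borrow;        // -1 if X != 0, else 0
  return Mask | Y;                    // select (X != 0), -1, Y
}

int main() {
  assert(selectNeZeroAllOnes(7, 123) == 0xFFFFFFFFu);
  assert(selectNeZeroAllOnes(0, 123) == 123u);
}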
24931
24932static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24933 const X86Subtarget &Subtarget,
24934 SelectionDAG &DAG) {
24935 MVT VT = Op->getSimpleValueType(0);
24936 SDValue In = Op->getOperand(0);
24937 MVT InVT = In.getSimpleValueType();
24938 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24939 MVT VTElt = VT.getVectorElementType();
24940 SDLoc dl(Op);
24941
24942 unsigned NumElts = VT.getVectorNumElements();
24943
24944 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24945 MVT ExtVT = VT;
24946 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24947 // If v16i32 is to be avoided, we'll need to split and concatenate.
24948 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24949 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24950
24951 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24952 }
24953
24954 // Widen to 512-bits if VLX is not supported.
24955 MVT WideVT = ExtVT;
24956 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24957 NumElts *= 512 / ExtVT.getSizeInBits();
24958 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24959 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24960 In, DAG.getIntPtrConstant(0, dl));
24961 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24962 }
24963
24964 SDValue V;
24965 MVT WideEltVT = WideVT.getVectorElementType();
24966 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24967 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24968 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24969 } else {
24970 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24971 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24972 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24973 }
24974
24975 // Truncate if we had to extend i16/i8 above.
24976 if (VT != ExtVT) {
24977 WideVT = MVT::getVectorVT(VTElt, NumElts);
24978 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24979 }
24980
24981 // Extract back to 128/256-bit if we widened.
24982 if (WideVT != VT)
24983 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24984 DAG.getIntPtrConstant(0, dl));
24985
24986 return V;
24987}
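
Two of the steps above can be shown with plain scalar arithmetic: the scaling of the element count up to a 512-bit vector when VLX is unavailable, and the fallback that materializes each i1 lane as select(b, -1, 0). The numbers and names below are hypothetical.

#include <cassert>
#include <cstdint>

int main() {
  // (1) v4i1 -> v4i32 without VLX: scale the element count up to 512 bits.
  unsigned NumElts = 4, ExtBits = 4 * 32;   // ExtVT = v4i32
  NumElts *= 512 / ExtBits;                 // widen to v16i32
  assert(NumElts == 16);

  // (2) Per-lane sign extension of an i1 mask bit via a select of constants.
  auto SExtLane = [](bool B) { return B ? -1 : 0; };
  assert(SExtLane(true) == -1 && SExtLane(false) == 0);
}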
24988
24989static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24990 SelectionDAG &DAG) {
24991 SDValue In = Op->getOperand(0);
24992 MVT InVT = In.getSimpleValueType();
24993
24994 if (InVT.getVectorElementType() == MVT::i1)
24995 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24996
24997 assert(Subtarget.hasAVX() && "Expected AVX support");
24998 return LowerAVXExtend(Op, DAG, Subtarget);
24999}
25000
25001// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25002// For sign extend this needs to handle all vector sizes and SSE4.1 and
25003// non-SSE4.1 targets. For zero extend this should only handle inputs of
25004// MVT::v64i8 when BWI is not supported, but AVX512 is.
25005static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25006 const X86Subtarget &Subtarget,
25007 SelectionDAG &DAG) {
25008 SDValue In = Op->getOperand(0);
25009 MVT VT = Op->getSimpleValueType(0);
25010 MVT InVT = In.getSimpleValueType();
25011
25012 MVT SVT = VT.getVectorElementType();
25013 MVT InSVT = InVT.getVectorElementType();
25014 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25015
25016 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25017 return SDValue();
25018 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25019 return SDValue();
25020 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25021 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25022 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25023 return SDValue();
25024
25025 SDLoc dl(Op);
25026 unsigned Opc = Op.getOpcode();
25027 unsigned NumElts = VT.getVectorNumElements();
25028
25029 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25030 // For 512-bit vectors, we need 128-bits or 256-bits.
25031 if (InVT.getSizeInBits() > 128) {
25032 // Input needs to be at least the same number of elements as output, and
25033 // at least 128-bits.
25034 int InSize = InSVT.getSizeInBits() * NumElts;
25035 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25036 InVT = In.getSimpleValueType();
25037 }
25038
25039 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25040 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25041 // need to be handled here for 256/512-bit results.
25042 if (Subtarget.hasInt256()) {
25043 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25044
25045 if (InVT.getVectorNumElements() != NumElts)
25046 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25047
25048 // FIXME: Apparently we create inreg operations that could be regular
25049 // extends.
25050 unsigned ExtOpc =
25051 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25052 : ISD::ZERO_EXTEND;
25053 return DAG.getNode(ExtOpc, dl, VT, In);
25054 }
25055
25056 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25057 if (Subtarget.hasAVX()) {
25058 assert(VT.is256BitVector() && "256-bit vector expected");
25059 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25060 int HalfNumElts = HalfVT.getVectorNumElements();
25061
25062 unsigned NumSrcElts = InVT.getVectorNumElements();
25063 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25064 for (int i = 0; i != HalfNumElts; ++i)
25065 HiMask[i] = HalfNumElts + i;
25066
25067 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25068 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25069 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25070 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25071 }
25072
25073 // We should only get here for sign extend.
25074 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25075 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25076
25077 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25078 SDValue Curr = In;
25079 SDValue SignExt = Curr;
25080
25081 // As SRAI is only available on i16/i32 types, we expand only up to i32
25082 // and handle i64 separately.
25083 if (InVT != MVT::v4i32) {
25084 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25085
25086 unsigned DestWidth = DestVT.getScalarSizeInBits();
25087 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25088
25089 unsigned InNumElts = InVT.getVectorNumElements();
25090 unsigned DestElts = DestVT.getVectorNumElements();
25091
25092 // Build a shuffle mask that takes each input element and places it in the
25093 // MSBs of the new element size.
25094 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25095 for (unsigned i = 0; i != DestElts; ++i)
25096 Mask[i * Scale + (Scale - 1)] = i;
25097
25098 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25099 Curr = DAG.getBitcast(DestVT, Curr);
25100
25101 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25102 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25103 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25104 }
25105
25106 if (VT == MVT::v2i64) {
25107 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25108 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25109 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25110 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25111 SignExt = DAG.getBitcast(VT, SignExt);
25112 }
25113
25114 return SignExt;
25115}
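
The pre-SSE4.1 tail of this function relies on a classic trick: shuffle the narrow element into the most significant bits of the wider lane, then use an arithmetic right shift to replicate the sign bit. A scalar sketch of the same trick for i8 -> i32 (it assumes the usual two's-complement shift and conversion behavior, which C++20 guarantees):

#include <cassert>
#include <cstdint>

static int32_t sextI8ToI32(uint8_t B) {
  // Shuffle step: the i8 value ends up in the MSBs of an i32 lane.
  int32_t Shifted = (int32_t)((uint32_t)B << 24);
  // VSRAI step: arithmetic shift right by DestWidth - SrcWidth = 24 bits.
  return Shifted >> 24;
}

int main() {
  assert(sextI8ToI32(0x7F) == 127);
  assert(sextI8ToI32(0x80) == -128);
  assert(sextI8ToI32(0xFF) == -1);
}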
25116
25117static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25118 SelectionDAG &DAG) {
25119 MVT VT = Op->getSimpleValueType(0);
25120 SDValue In = Op->getOperand(0);
25121 MVT InVT = In.getSimpleValueType();
25122 SDLoc dl(Op);
25123
25124 if (InVT.getVectorElementType() == MVT::i1)
25125 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25126
25127 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25128 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25129 "Expected same number of elements");
25130 assert((VT.getVectorElementType() == MVT::i16 ||
25131 VT.getVectorElementType() == MVT::i32 ||
25132 VT.getVectorElementType() == MVT::i64) &&
25133 "Unexpected element type");
25134 assert((InVT.getVectorElementType() == MVT::i8 ||
25135 InVT.getVectorElementType() == MVT::i16 ||
25136 InVT.getVectorElementType() == MVT::i32) &&
25137 "Unexpected element type");
25138
25139 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25140 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25141 return splitVectorIntUnary(Op, DAG);
25142 }
25143
25144 if (Subtarget.hasInt256())
25145 return Op;
25146
25147 // Optimize vectors in AVX mode
25148 // Sign extend v8i16 to v8i32 and
25149 // v4i32 to v4i64
25150 //
25151 // Divide input vector into two parts
25152 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25153 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25154 // concat the vectors to original VT
25155 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25156 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25157
25158 unsigned NumElems = InVT.getVectorNumElements();
25159 SmallVector<int,8> ShufMask(NumElems, -1);
25160 for (unsigned i = 0; i != NumElems/2; ++i)
25161 ShufMask[i] = i + NumElems/2;
25162
25163 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25164 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25165
25166 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25167}
25168
25169/// Change a vector store into a pair of half-size vector stores.
25170static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25171 SDValue StoredVal = Store->getValue();
25172 assert((StoredVal.getValueType().is256BitVector() ||
25173 StoredVal.getValueType().is512BitVector()) &&
25174 "Expecting 256/512-bit op");
25175
25176 // Splitting volatile memory ops is not allowed unless the operation was not
25177 // legal to begin with. Assume the input store is legal (this transform is
25178 // only used for targets with AVX). Note: It is possible that we have an
25179 // illegal type like v2i128, and so we could allow splitting a volatile store
25180 // in that case if that is important.
25181 if (!Store->isSimple())
25182 return SDValue();
25183
25184 SDLoc DL(Store);
25185 SDValue Value0, Value1;
25186 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25187 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25188 SDValue Ptr0 = Store->getBasePtr();
25189 SDValue Ptr1 =
25190 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
25191 SDValue Ch0 =
25192 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25193 Store->getOriginalAlign(),
25194 Store->getMemOperand()->getFlags());
25195 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25196 Store->getPointerInfo().getWithOffset(HalfOffset),
25197 Store->getOriginalAlign(),
25198 Store->getMemOperand()->getFlags());
25199 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25200}
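
The address arithmetic above is simple: the second half is stored at the base pointer plus the store size of the first half. A standalone sketch with memcpy standing in for the two DAG stores (buffer names are hypothetical):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  alignas(32) uint8_t Src[32], Dst[32] = {0};
  for (int i = 0; i != 32; ++i) Src[i] = (uint8_t)i;

  unsigned HalfOffset = 16;                       // Value0's store size in bytes
  std::memcpy(Dst, Src, HalfOffset);              // store of Value0 at Ptr0
  std::memcpy(Dst + HalfOffset, Src + HalfOffset, HalfOffset); // Value1 at Ptr0 + 16

  assert(std::memcmp(Dst, Src, 32) == 0);         // same bytes as one 256-bit store
}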
25201
25202 /// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
25203 /// type.
25204static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25205 SelectionDAG &DAG) {
25206 SDValue StoredVal = Store->getValue();
25207 assert(StoreVT.is128BitVector() &&
25208 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25209 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25210
25211 // Splitting volatile memory ops is not allowed unless the operation was not
25212 // legal to begin with. We are assuming the input op is legal (this transform
25213 // is only used for targets with AVX).
25214 if (!Store->isSimple())
25215 return SDValue();
25216
25217 MVT StoreSVT = StoreVT.getScalarType();
25218 unsigned NumElems = StoreVT.getVectorNumElements();
25219 unsigned ScalarSize = StoreSVT.getStoreSize();
25220
25221 SDLoc DL(Store);
25222 SmallVector<SDValue, 4> Stores;
25223 for (unsigned i = 0; i != NumElems; ++i) {
25224 unsigned Offset = i * ScalarSize;
25225 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25226 TypeSize::Fixed(Offset), DL);
25227 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25228 DAG.getIntPtrConstant(i, DL));
25229 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25230 Store->getPointerInfo().getWithOffset(Offset),
25231 Store->getOriginalAlign(),
25232 Store->getMemOperand()->getFlags());
25233 Stores.push_back(Ch);
25234 }
25235 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25236}
25237
25238static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25239 SelectionDAG &DAG) {
25240 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25241 SDLoc dl(St);
25242 SDValue StoredVal = St->getValue();
25243
25244 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25245 if (StoredVal.getValueType().isVector() &&
25246 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25247 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25248 assert(NumElts <= 8 && "Unexpected VT");
25249 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25250 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25251 "Expected AVX512F without AVX512DQI");
25252
25253 // We must pad with zeros to ensure we store zeroes to any unused bits.
25254 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25255 DAG.getUNDEF(MVT::v16i1), StoredVal,
25256 DAG.getIntPtrConstant(0, dl));
25257 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25258 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25259 // Make sure we store zeros in the extra bits.
25260 if (NumElts < 8)
25261 StoredVal = DAG.getZeroExtendInReg(
25262 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25263
25264 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25265 St->getPointerInfo(), St->getOriginalAlign(),
25266 St->getMemOperand()->getFlags());
25267 }
25268
25269 if (St->isTruncatingStore())
25270 return SDValue();
25271
25272 // If this is a 256-bit store of concatenated ops, we are better off splitting
25273 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25274 // and lets each half execute independently. Some cores would split the op into
25275 // halves anyway, so the concat (vinsertf128) is purely an extra op.
25276 MVT StoreVT = StoredVal.getSimpleValueType();
25277 if (StoreVT.is256BitVector() ||
25278 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
25279 !Subtarget.hasBWI())) {
25280 SmallVector<SDValue, 4> CatOps;
25281 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
25282 return splitVectorStore(St, DAG);
25283 return SDValue();
25284 }
25285
25286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25287 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
25288 "Unexpected VT");
25289 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25290 TargetLowering::TypeWidenVector && "Unexpected type action!");
25291
25292 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25293 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25294 DAG.getUNDEF(StoreVT));
25295
25296 if (Subtarget.hasSSE2()) {
25297 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25298 // and store it.
25299 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25300 MVT CastVT = MVT::getVectorVT(StVT, 2);
25301 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25302 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25303 DAG.getIntPtrConstant(0, dl));
25304
25305 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25306 St->getPointerInfo(), St->getOriginalAlign(),
25307 St->getMemOperand()->getFlags());
25308 }
25309 assert(Subtarget.hasSSE1() && "Expected SSE");
25310 SDVTList Tys = DAG.getVTList(MVT::Other);
25311 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25312 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25313 St->getMemOperand());
25314}
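
For the v2i1/v4i1/v8i1 path at the top of LowerStore, the net effect is to pack the mask bits into a byte and force the unused high bits to zero before storing. A scalar sketch of that packing for a v4i1 mask (the helper name is hypothetical):

#include <cassert>
#include <cstdint>

static uint8_t storeMaskV4I1(bool B0, bool B1, bool B2, bool B3) {
  // Widen + bitcast: the four mask bits land in the low bits of an integer.
  uint8_t Bits = (uint8_t)(B0 | (B1 << 1) | (B2 << 2) | (B3 << 3));
  // getZeroExtendInReg with an i4 type: keep only NumElts = 4 low bits.
  return Bits & 0x0F;
}

int main() {
  assert(storeMaskV4I1(true, false, true, true) == 0b1101);
}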
25315
25316// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25317// may emit an illegal shuffle but the expansion is still better than scalar
25318// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25319 // we'll emit a shuffle and an arithmetic shift.
25320// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25321// TODO: It is possible to support ZExt by zeroing the undef values during
25322// the shuffle phase or after the shuffle.
25323static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25324 SelectionDAG &DAG) {
25325 MVT RegVT = Op.getSimpleValueType();
25326 assert(RegVT.isVector() && "We only custom lower vector loads.");
25327 assert(RegVT.isInteger() &&
25328 "We only custom lower integer vector loads.");
25329
25330 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25331 SDLoc dl(Ld);
25332
25333 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25334 if (RegVT.getVectorElementType() == MVT::i1) {
25335 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25336 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25337 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25338 "Expected AVX512F without AVX512DQI");
25339
25340 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25341 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25342 Ld->getMemOperand()->getFlags());
25343
25344 // Replace chain users with the new chain.
25345 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25346
25347 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25348 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25349 DAG.getBitcast(MVT::v16i1, Val),
25350 DAG.getIntPtrConstant(0, dl));
25351 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25352 }
25353
25354 return SDValue();
25355}
25356
25357/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25358/// each of which has no other use apart from the AND / OR.
25359static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25360 Opc = Op.getOpcode();
25361 if (Opc != ISD::OR && Opc != ISD::AND)
25362 return false;
25363 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25364 Op.getOperand(0).hasOneUse() &&
25365 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25366 Op.getOperand(1).hasOneUse());
25367}
25368
25369SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25370 SDValue Chain = Op.getOperand(0);
25371 SDValue Cond = Op.getOperand(1);
25372 SDValue Dest = Op.getOperand(2);
25373 SDLoc dl(Op);
25374
25375 if (Cond.getOpcode() == ISD::SETCC &&
25376 Cond.getOperand(0).getValueType() != MVT::f128) {
25377 SDValue LHS = Cond.getOperand(0);
25378 SDValue RHS = Cond.getOperand(1);
25379 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25380
25381 // Special case for
25382 // setcc([su]{add,sub,mul}o == 0)
25383 // setcc([su]{add,sub,mul}o != 1)
25384 if (ISD::isOverflowIntrOpRes(LHS) &&
25385 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25386 (isNullConstant(RHS) || isOneConstant(RHS))) {
25387 SDValue Value, Overflow;
25388 X86::CondCode X86Cond;
25389 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25390
25391 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25392 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25393
25394 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25395 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25396 Overflow);
25397 }
25398
25399 if (LHS.getSimpleValueType().isInteger()) {
25400 SDValue CCVal;
25401 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25402 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25403 EFLAGS);
25404 }
25405
25406 if (CC == ISD::SETOEQ) {
25407 // For FCMP_OEQ, we can emit
25408 // two branches instead of an explicit AND instruction with a
25409 // separate test. However, we only do this if this block doesn't
25410 // have a fall-through edge, because this requires an explicit
25411 // jmp when the condition is false.
25412 if (Op.getNode()->hasOneUse()) {
25413 SDNode *User = *Op.getNode()->use_begin();
25414 // Look for an unconditional branch following this conditional branch.
25415 // We need this because we need to reverse the successors in order
25416 // to implement FCMP_OEQ.
25417 if (User->getOpcode() == ISD::BR) {
25418 SDValue FalseBB = User->getOperand(1);
25419 SDNode *NewBR =
25420 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25421 assert(NewBR == User);
25422 (void)NewBR;
25423 Dest = FalseBB;
25424
25425 SDValue Cmp =
25426 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25427 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25428 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25429 CCVal, Cmp);
25430 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25431 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25432 Cmp);
25433 }
25434 }
25435 } else if (CC == ISD::SETUNE) {
25436 // For FCMP_UNE, we can emit
25437 // two branches instead of an explicit OR instruction with a
25438 // separate test.
25439 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25440 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25441 Chain =
25442 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
25443 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25444 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25445 Cmp);
25446 } else {
25447 X86::CondCode X86Cond =
25448 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25449 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25450 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25451 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25452 Cmp);
25453 }
25454 }
25455
25456 if (ISD::isOverflowIntrOpRes(Cond)) {
25457 SDValue Value, Overflow;
25458 X86::CondCode X86Cond;
25459 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25460
25461 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25462 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25463 Overflow);
25464 }
25465
25466 // Look past the truncate if the high bits are known zero.
25467 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25468 Cond = Cond.getOperand(0);
25469
25470 EVT CondVT = Cond.getValueType();
25471
25472 // Add an AND with 1 if we don't already have one.
25473 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25474 Cond =
25475 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25476
25477 SDValue LHS = Cond;
25478 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25479
25480 SDValue CCVal;
25481 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25482 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25483 EFLAGS);
25484}
25485
25486// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25487// Calls to _alloca are needed to probe the stack when allocating more than 4k
25488// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25489// that the guard pages used by the OS virtual memory manager are allocated in
25490// correct sequence.
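// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Windows commits stack pages lazily behind a single guard page, so an
// allocation larger than one page must touch every new page in order. For a
// hypothetical 12K alloca the probing amounts to:
//   for (unsigned Off = 4096; Off <= 12288; Off += 4096)
//     touch(SP - Off); // fault in one guard page at a time
// Jumping straight past the guard page would fault instead of growing the stack.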
25491SDValue
25492X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25493 SelectionDAG &DAG) const {
25494 MachineFunction &MF = DAG.getMachineFunction();
25495 bool SplitStack = MF.shouldSplitStack();
25496 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25497 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25498 SplitStack || EmitStackProbeCall;
25499 SDLoc dl(Op);
25500
25501 // Get the inputs.
25502 SDNode *Node = Op.getNode();
25503 SDValue Chain = Op.getOperand(0);
25504 SDValue Size = Op.getOperand(1);
25505 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25506 EVT VT = Node->getValueType(0);
25507
25508 // Chain the dynamic stack allocation so that it doesn't modify the stack
25509 // pointer when other instructions are using the stack.
25510 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25511
25512 bool Is64Bit = Subtarget.is64Bit();
25513 MVT SPTy = getPointerTy(DAG.getDataLayout());
25514
25515 SDValue Result;
25516 if (!Lower) {
25517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25518 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25519 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25520 " not tell us which reg is the stack pointer!");
25521
25522 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25523 const Align StackAlign = TFI.getStackAlign();
25524 if (hasInlineStackProbe(MF)) {
25525 MachineRegisterInfo &MRI = MF.getRegInfo();
25526
25527 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25528 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25529 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25530 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
25531 DAG.getRegister(Vreg, SPTy));
25532 } else {
25533 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25534 Chain = SP.getValue(1);
25535 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25536 }
25537 if (Alignment && *Alignment > StackAlign)
25538 Result =
25539 DAG.getNode(ISD::AND, dl, VT, Result,
25540 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25541 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25542 } else if (SplitStack) {
25543 MachineRegisterInfo &MRI = MF.getRegInfo();
25544
25545 if (Is64Bit) {
25546 // The 64 bit implementation of segmented stacks needs to clobber both r10
25547 // and r11. This makes it impossible to use it along with nested parameters.
25548 const Function &F = MF.getFunction();
25549 for (const auto &A : F.args()) {
25550 if (A.hasNestAttr())
25551 report_fatal_error("Cannot use segmented stacks with functions that "
25552 "have nested arguments.");
25553 }
25554 }
25555
25556 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25557 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25558 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25559 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
25560 DAG.getRegister(Vreg, SPTy));
25561 } else {
25562 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25563 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25564 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25565
25566 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25567 Register SPReg = RegInfo->getStackRegister();
25568 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25569 Chain = SP.getValue(1);
25570
25571 if (Alignment) {
25572 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25573 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25574 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25575 }
25576
25577 Result = SP;
25578 }
25579
25580 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
25581 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
25582
25583 SDValue Ops[2] = {Result, Chain};
25584 return DAG.getMergeValues(Ops, dl);
25585}
25586
25587SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25588 MachineFunction &MF = DAG.getMachineFunction();
25589 auto PtrVT = getPointerTy(MF.getDataLayout());
25590 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25591
25592 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25593 SDLoc DL(Op);
25594
25595 if (!Subtarget.is64Bit() ||
25596 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25597 // vastart just stores the address of the VarArgsFrameIndex slot into the
25598 // memory location argument.
25599 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25600 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25601 MachinePointerInfo(SV));
25602 }
25603
25604 // __va_list_tag:
25605 // gp_offset (0 - 6 * 8)
25606 // fp_offset (48 - 48 + 8 * 16)
25607 // overflow_arg_area (point to parameters coming in memory).
25608 // reg_save_area
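// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// The stores below fill a SysV AMD64 __va_list_tag at these byte offsets
// (LP64 shown; the X32 ABI uses 4-byte pointers, giving offsets 8 and 12):
//   struct __va_list_tag {
//     unsigned int gp_offset;       // offset 0
//     unsigned int fp_offset;       // offset 4
//     void *overflow_arg_area;      // offset 8
//     void *reg_save_area;          // offset 16
//   };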
25609 SmallVector<SDValue, 8> MemOps;
25610 SDValue FIN = Op.getOperand(1);
25611 // Store gp_offset
25612 SDValue Store = DAG.getStore(
25613 Op.getOperand(0), DL,
25614 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25615 MachinePointerInfo(SV));
25616 MemOps.push_back(Store);
25617
25618 // Store fp_offset
25619 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
25620 Store = DAG.getStore(
25621 Op.getOperand(0), DL,
25622 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25623 MachinePointerInfo(SV, 4));
25624 MemOps.push_back(Store);
25625
25626 // Store ptr to overflow_arg_area
25627 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25628 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25629 Store =
25630 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25631 MemOps.push_back(Store);
25632
25633 // Store ptr to reg_save_area.
25634 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25635 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25636 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25637 Store = DAG.getStore(
25638 Op.getOperand(0), DL, RSFIN, FIN,
25639 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25640 MemOps.push_back(Store);
25641 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25642}
25643
25644SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25645 assert(Subtarget.is64Bit() &&
25646 "LowerVAARG only handles 64-bit va_arg!");
25647 assert(Op.getNumOperands() == 4);
25648
25649 MachineFunction &MF = DAG.getMachineFunction();
25650 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25651 // The Win64 ABI uses char* instead of a structure.
25652 return DAG.expandVAArg(Op.getNode());
25653
25654 SDValue Chain = Op.getOperand(0);
25655 SDValue SrcPtr = Op.getOperand(1);
25656 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25657 unsigned Align = Op.getConstantOperandVal(3);
25658 SDLoc dl(Op);
25659
25660 EVT ArgVT = Op.getNode()->getValueType(0);
25661 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25662 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25663 uint8_t ArgMode;
25664
25665 // Decide which area this value should be read from.
25666 // TODO: Implement the AMD64 ABI in its entirety. This simple
25667 // selection mechanism works only for the basic types.
25668 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25669 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25670 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25671 } else {
25672 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25673 "Unhandled argument type in LowerVAARG");
25674 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25675 }
25676
25677 if (ArgMode == 2) {
25678 // Make sure using fp_offset makes sense.
25679 assert(!Subtarget.useSoftFloat() &&
25680 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25681 Subtarget.hasSSE1());
25682 }
25683
25684 // Insert VAARG node into the DAG
25685 // VAARG returns two values: Variable Argument Address, Chain
25686 SDValue InstOps[] = {Chain, SrcPtr,
25687 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25688 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25689 DAG.getTargetConstant(Align, dl, MVT::i32)};
25690 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25691 SDValue VAARG = DAG.getMemIntrinsicNode(
25692 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25693 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25694 /*Alignment=*/None,
25695 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25696 Chain = VAARG.getValue(1);
25697
25698 // Load the next argument and return it
25699 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25700}
25701
25702static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25703 SelectionDAG &DAG) {
25704 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25705 // where a va_list is still an i8*.
25706 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25707 if (Subtarget.isCallingConvWin64(
25708 DAG.getMachineFunction().getFunction().getCallingConv()))
25709 // Probably a Win64 va_copy.
25710 return DAG.expandVACopy(Op.getNode());
25711
25712 SDValue Chain = Op.getOperand(0);
25713 SDValue DstPtr = Op.getOperand(1);
25714 SDValue SrcPtr = Op.getOperand(2);
25715 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25716 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25717 SDLoc DL(Op);
25718
25719 return DAG.getMemcpy(
25720 Chain, DL, DstPtr, SrcPtr,
25721 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25722 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25723 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25724}
25725
25726// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25727static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25728 switch (Opc) {
25729 case ISD::SHL:
25730 case X86ISD::VSHL:
25731 case X86ISD::VSHLI:
25732 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25733 case ISD::SRL:
25734 case X86ISD::VSRL:
25735 case X86ISD::VSRLI:
25736 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25737 case ISD::SRA:
25738 case X86ISD::VSRA:
25739 case X86ISD::VSRAI:
25740 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25741 }
25742 llvm_unreachable("Unknown target vector shift node");
25743}
25744
25745/// Handle vector element shifts where the shift amount is a constant.
25746/// Takes immediate version of shift as input.
25747static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25748 SDValue SrcOp, uint64_t ShiftAmt,
25749 SelectionDAG &DAG) {
25750 MVT ElementType = VT.getVectorElementType();
25751
25752 // Bitcast the source vector to the output type, this is mainly necessary for
25753 // vXi8/vXi64 shifts.
25754 if (VT != SrcOp.getSimpleValueType())
25755 SrcOp = DAG.getBitcast(VT, SrcOp);
25756
25757 // Fold this packed shift into its first operand if ShiftAmt is 0.
25758 if (ShiftAmt == 0)
25759 return SrcOp;
25760
25761 // Check for ShiftAmt >= element width
25762 if (ShiftAmt >= ElementType.getSizeInBits()) {
25763 if (Opc == X86ISD::VSRAI)
25764 ShiftAmt = ElementType.getSizeInBits() - 1;
25765 else
25766 return DAG.getConstant(0, dl, VT);
25767 }
25768
25769 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25770 && "Unknown target vector shift-by-constant node");
25771
25772 // Fold this packed vector shift into a build vector if SrcOp is a
25773 // vector of Constants or UNDEFs.
25774 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25775 SmallVector<SDValue, 8> Elts;
25776 unsigned NumElts = SrcOp->getNumOperands();
25777
25778 switch (Opc) {
25779 default: llvm_unreachable("Unknown opcode!");
25780 case X86ISD::VSHLI:
25781 for (unsigned i = 0; i != NumElts; ++i) {
25782 SDValue CurrentOp = SrcOp->getOperand(i);
25783 if (CurrentOp->isUndef()) {
25784 // Must produce 0s in the correct bits.
25785 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25786 continue;
25787 }
25788 auto *ND = cast<ConstantSDNode>(CurrentOp);
25789 const APInt &C = ND->getAPIntValue();
25790 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
25791 }
25792 break;
25793 case X86ISD::VSRLI:
25794 for (unsigned i = 0; i != NumElts; ++i) {
25795 SDValue CurrentOp = SrcOp->getOperand(i);
25796 if (CurrentOp->isUndef()) {
25797 // Must produce 0s in the correct bits.
25798 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25799 continue;
25800 }
25801 auto *ND = cast<ConstantSDNode>(CurrentOp);
25802 const APInt &C = ND->getAPIntValue();
25803 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
25804 }
25805 break;
25806 case X86ISD::VSRAI:
25807 for (unsigned i = 0; i != NumElts; ++i) {
25808 SDValue CurrentOp = SrcOp->getOperand(i);
25809 if (CurrentOp->isUndef()) {
25810 // All shifted in bits must be the same so use 0.
25811 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25812 continue;
25813 }
25814 auto *ND = cast<ConstantSDNode>(CurrentOp);
25815 const APInt &C = ND->getAPIntValue();
25816 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
25817 }
25818 break;
25819 }
25820
25821 return DAG.getBuildVector(VT, dl, Elts);
25822 }
25823
25824 return DAG.getNode(Opc, dl, VT, SrcOp,
25825 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25826}
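// [Editor's note: illustrative example, not part of X86ISelLowering.cpp.]
// The constant-folding path above rewrites a shift of a constant build vector
// as a new build vector; e.g. a hypothetical v4i32 VSHLI by 3 folds
//   <i32 1, i32 2, undef, i32 4>  -->  <i32 8, i32 16, i32 0, i32 32>
// with undef lanes forced to 0 so the shifted-in bits are well defined.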
25827
25828/// Handle vector element shifts by a splat shift amount
25829static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25830 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25831 const X86Subtarget &Subtarget,
25832 SelectionDAG &DAG) {
25833 MVT AmtVT = ShAmt.getSimpleValueType();
25834 assert(AmtVT.isVector() && "Vector shift type mismatch");
25835 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25836 "Illegal vector splat index");
25837
25838 // Move the splat element to the bottom element.
25839 if (ShAmtIdx != 0) {
25840 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25841 Mask[0] = ShAmtIdx;
25842 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25843 }
25844
25845 // Peek through any zext node if we can get back to a 128-bit source.
25846 if (AmtVT.getScalarSizeInBits() == 64 &&
25847 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25848 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25849 ShAmt.getOperand(0).getValueType().isSimple() &&
25850 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25851 ShAmt = ShAmt.getOperand(0);
25852 AmtVT = ShAmt.getSimpleValueType();
25853 }
25854
25855 // See if we can mask off the upper elements using the existing source node.
25856 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25857 // do this for vXi64 types.
25858 bool IsMasked = false;
25859 if (AmtVT.getScalarSizeInBits() < 64) {
25860 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25861 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25862 // If the shift amount has come from a scalar, then zero-extend the scalar
25863 // before moving to the vector.
25864 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25865 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25866 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25867 AmtVT = MVT::v4i32;
25868 IsMasked = true;
25869 } else if (ShAmt.getOpcode() == ISD::AND) {
25870 // See if the shift amount is already masked (e.g. for rotation modulo),
25871 // then we can zero-extend it by setting all the other mask elements to
25872 // zero.
25873 SmallVector<SDValue> MaskElts(
25874 AmtVT.getVectorNumElements(),
25875 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25876 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25877 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25878 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25879 {ShAmt.getOperand(1), Mask}))) {
25880 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25881 IsMasked = true;
25882 }
25883 }
25884 }
25885
25886 // Extract if the shift amount vector is larger than 128-bits.
25887 if (AmtVT.getSizeInBits() > 128) {
25888 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25889 AmtVT = ShAmt.getSimpleValueType();
25890 }
25891
25892 // Zero-extend bottom element to v2i64 vector type, either by extension or
25893 // shuffle masking.
25894 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25895 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25896 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25897 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25898 } else if (Subtarget.hasSSE41()) {
25899 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25900 MVT::v2i64, ShAmt);
25901 } else {
25902 SDValue ByteShift = DAG.getTargetConstant(
25903 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25904 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25905 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25906 ByteShift);
25907 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25908 ByteShift);
25909 }
25910 }
25911
25912 // Change opcode to non-immediate version.
25913 Opc = getTargetVShiftUniformOpcode(Opc, true);
25914
25915 // The return type has to be a 128-bit type with the same element
25916 // type as the input type.
25917 MVT EltVT = VT.getVectorElementType();
25918 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25919
25920 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25921 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25922}
25923
25924/// Return Mask with the necessary casting or extending
25925/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25926static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25927 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25928 const SDLoc &dl) {
25929
25930 if (isAllOnesConstant(Mask))
25931 return DAG.getConstant(1, dl, MaskVT);
25932 if (X86::isZeroNode(Mask))
25933 return DAG.getConstant(0, dl, MaskVT);
25934
25935 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25936
25937 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25938 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25939 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25940 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25941 SDValue Lo, Hi;
25942 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25943 DAG.getConstant(0, dl, MVT::i32));
25944 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25945 DAG.getConstant(1, dl, MVT::i32));
25946
25947 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25948 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25949
25950 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25951 } else {
25952 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25953 Mask.getSimpleValueType().getSizeInBits());
25954 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
25955 // are extracted by EXTRACT_SUBVECTOR.
25956 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25957 DAG.getBitcast(BitcastVT, Mask),
25958 DAG.getIntPtrConstant(0, dl));
25959 }
25960}
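// [Editor's note: illustrative example, not part of X86ISelLowering.cpp.]
// e.g. a v64i1 mask on a 32-bit AVX512BW target: the i64 mask value cannot be
// bitcast directly, so it is split into its low and high i32 halves with
// EXTRACT_ELEMENT, each half is bitcast to v32i1, and the halves are
// concatenated back into the requested v64i1.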
25961
25962/// Return (and \p Op, \p Mask) for compare instructions or
25963/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25964/// necessary casting or extending for \p Mask when lowering masking intrinsics
25965static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25966 SDValue PreservedSrc,
25967 const X86Subtarget &Subtarget,
25968 SelectionDAG &DAG) {
25969 MVT VT = Op.getSimpleValueType();
25970 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25971 unsigned OpcodeSelect = ISD::VSELECT;
25972 SDLoc dl(Op);
25973
25974 if (isAllOnesConstant(Mask))
25975 return Op;
25976
25977 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25978
25979 if (PreservedSrc.isUndef())
25980 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25981 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25982}
25983
25984/// Creates an SDNode for a predicated scalar operation.
25985/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25986/// The mask is coming as MVT::i8 and it should be transformed
25987/// to MVT::v1i1 while lowering masking intrinsics.
25988/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25989/// "X86select" instead of "vselect". We just can't create the "vselect" node
25990/// for a scalar instruction.
25991static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25992 SDValue PreservedSrc,
25993 const X86Subtarget &Subtarget,
25994 SelectionDAG &DAG) {
25995
25996 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25997 if (MaskConst->getZExtValue() & 0x1)
25998 return Op;
25999
26000 MVT VT = Op.getSimpleValueType();
26001 SDLoc dl(Op);
26002
26003 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26004 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26005 DAG.getBitcast(MVT::v8i1, Mask),
26006 DAG.getIntPtrConstant(0, dl));
26007 if (Op.getOpcode() == X86ISD::FSETCCM ||
26008 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26009 Op.getOpcode() == X86ISD::VFPCLASSS)
26010 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26011
26012 if (PreservedSrc.isUndef())
26013 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26014 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26015}
26016
26017static int getSEHRegistrationNodeSize(const Function *Fn) {
26018 if (!Fn->hasPersonalityFn())
26019 report_fatal_error(
26020 "querying registration node size for function without personality");
26021 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26022 // WinEHStatePass for the full struct definition.
26023 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26024 case EHPersonality::MSVC_X86SEH: return 24;
26025 case EHPersonality::MSVC_CXX: return 16;
26026 default: break;
26027 }
26028 report_fatal_error(
26029 "can only recover FP for 32-bit MSVC EH personality functions");
26030}
26031
26032/// When the MSVC runtime transfers control to us, either to an outlined
26033/// function or when returning to a parent frame after catching an exception, we
26034/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26035/// Here's the math:
26036/// RegNodeBase = EntryEBP - RegNodeSize
26037/// ParentFP = RegNodeBase - ParentFrameOffset
26038/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26039/// subtracting the offset (negative on x86) takes us back to the parent FP.
26040static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26041 SDValue EntryEBP) {
26042 MachineFunction &MF = DAG.getMachineFunction();
26043 SDLoc dl;
26044
26045 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26046 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26047
26048 // It's possible that the parent function no longer has a personality function
26049 // if the exceptional code was optimized away, in which case we just return
26050 // the incoming EBP.
26051 if (!Fn->hasPersonalityFn())
26052 return EntryEBP;
26053
26054 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26055 // registration, or the .set_setframe offset.
26056 MCSymbol *OffsetSym =
26057 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
26058 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26059 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26060 SDValue ParentFrameOffset =
26061 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26062
26063 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26064 // prologue to RBP in the parent function.
26065 const X86Subtarget &Subtarget =
26066 static_cast<const X86Subtarget &>(DAG.getSubtarget());
26067 if (Subtarget.is64Bit())
26068 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26069
26070 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26071 // RegNodeBase = EntryEBP - RegNodeSize
26072 // ParentFP = RegNodeBase - ParentFrameOffset
26073 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26074 DAG.getConstant(RegNodeSize, dl, PtrVT));
26075 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26076}
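// [Editor's note: worked example with made-up numbers, not part of
// X86ISelLowering.cpp.] For the 32-bit MSVC C++ EH personality RegNodeSize is
// 16, so with EntryEBP = 0x0019FF40 and ParentFrameOffset = -0x20:
//   RegNodeBase = 0x0019FF40 - 16    = 0x0019FF30
//   ParentFP    = 0x0019FF30 - (-32) = 0x0019FF50
// i.e. in this example the recovered parent FP lies above the incoming EBP.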
26077
26078SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26079 SelectionDAG &DAG) const {
26080 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26081 auto isRoundModeCurDirection = [](SDValue Rnd) {
26082 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26083 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26084
26085 return false;
26086 };
26087 auto isRoundModeSAE = [](SDValue Rnd) {
26088 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26089 unsigned RC = C->getZExtValue();
26090 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26091 // Clear the NO_EXC bit and check remaining bits.
26092 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26093 // As a convenience we allow no other bits or explicitly
26094 // current direction.
26095 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26096 }
26097 }
26098
26099 return false;
26100 };
26101 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26102 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26103 RC = C->getZExtValue();
26104 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26105 // Clear the NO_EXC bit and check remaining bits.
26106 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26107 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26108 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26109 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26110 RC == X86::STATIC_ROUNDING::TO_ZERO;
26111 }
26112 }
26113
26114 return false;
26115 };
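// [Editor's note: illustrative examples, not part of X86ISelLowering.cpp;
// they assume the usual X86::STATIC_ROUNDING encoding (TO_NEAREST_INT=0,
// TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8).]
//   Rnd = 4     -> isRoundModeCurDirection: use the current MXCSR mode
//   Rnd = 8 | 1 -> isRoundModeSAEToX, RC=1: SAE plus round toward -inf
//   Rnd = 8 | 4 -> isRoundModeSAE: SAE while keeping the current direction
//   Rnd = 1     -> none match, so the lowerings below bail out with SDValue()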
26116
26117 SDLoc dl(Op);
26118 unsigned IntNo = Op.getConstantOperandVal(0);
26119 MVT VT = Op.getSimpleValueType();
26120 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26121
26122 // Propagate flags from original node to transformed node(s).
26123 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26124
26125 if (IntrData) {
26126 switch(IntrData->Type) {
26127 case INTR_TYPE_1OP: {
26128 // We specify 2 possible opcodes for intrinsics with rounding modes.
26129 // First, we check if the intrinsic may have non-default rounding mode,
26130 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26131 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26132 if (IntrWithRoundingModeOpcode != 0) {
26133 SDValue Rnd = Op.getOperand(2);
26134 unsigned RC = 0;
26135 if (isRoundModeSAEToX(Rnd, RC))
26136 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26137 Op.getOperand(1),
26138 DAG.getTargetConstant(RC, dl, MVT::i32));
26139 if (!isRoundModeCurDirection(Rnd))
26140 return SDValue();
26141 }
26142 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26143 Op.getOperand(1));
26144 }
26145 case INTR_TYPE_1OP_SAE: {
26146 SDValue Sae = Op.getOperand(2);
26147
26148 unsigned Opc;
26149 if (isRoundModeCurDirection(Sae))
26150 Opc = IntrData->Opc0;
26151 else if (isRoundModeSAE(Sae))
26152 Opc = IntrData->Opc1;
26153 else
26154 return SDValue();
26155
26156 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26157 }
26158 case INTR_TYPE_2OP: {
26159 SDValue Src2 = Op.getOperand(2);
26160
26161 // We specify 2 possible opcodes for intrinsics with rounding modes.
26162 // First, we check if the intrinsic may have non-default rounding mode,
26163 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26164 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26165 if (IntrWithRoundingModeOpcode != 0) {
26166 SDValue Rnd = Op.getOperand(3);
26167 unsigned RC = 0;
26168 if (isRoundModeSAEToX(Rnd, RC))
26169 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26170 Op.getOperand(1), Src2,
26171 DAG.getTargetConstant(RC, dl, MVT::i32));
26172 if (!isRoundModeCurDirection(Rnd))
26173 return SDValue();
26174 }
26175
26176 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26177 Op.getOperand(1), Src2);
26178 }
26179 case INTR_TYPE_2OP_SAE: {
26180 SDValue Sae = Op.getOperand(3);
26181
26182 unsigned Opc;
26183 if (isRoundModeCurDirection(Sae))
26184 Opc = IntrData->Opc0;
26185 else if (isRoundModeSAE(Sae))
26186 Opc = IntrData->Opc1;
26187 else
26188 return SDValue();
26189
26190 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26191 Op.getOperand(2));
26192 }
26193 case INTR_TYPE_3OP:
26194 case INTR_TYPE_3OP_IMM8: {
26195 SDValue Src1 = Op.getOperand(1);
26196 SDValue Src2 = Op.getOperand(2);
26197 SDValue Src3 = Op.getOperand(3);
26198
26199 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26200 Src3.getValueType() != MVT::i8) {
26201 Src3 = DAG.getTargetConstant(
26202 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
26203 }
26204
26205 // We specify 2 possible opcodes for intrinsics with rounding modes.
26206 // First, we check if the intrinsic may have non-default rounding mode,
26207 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26208 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26209 if (IntrWithRoundingModeOpcode != 0) {
26210 SDValue Rnd = Op.getOperand(4);
26211 unsigned RC = 0;
26212 if (isRoundModeSAEToX(Rnd, RC))
26213 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26214 Src1, Src2, Src3,
26215 DAG.getTargetConstant(RC, dl, MVT::i32));
26216 if (!isRoundModeCurDirection(Rnd))
26217 return SDValue();
26218 }
26219
26220 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26221 {Src1, Src2, Src3});
26222 }
26223 case INTR_TYPE_4OP_IMM8: {
26224 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26225 SDValue Src4 = Op.getOperand(4);
26226 if (Src4.getValueType() != MVT::i8) {
26227 Src4 = DAG.getTargetConstant(
26228 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
26229 }
26230
26231 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26232 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26233 Src4);
26234 }
26235 case INTR_TYPE_1OP_MASK: {
26236 SDValue Src = Op.getOperand(1);
26237 SDValue PassThru = Op.getOperand(2);
26238 SDValue Mask = Op.getOperand(3);
26239 // We add rounding mode to the Node when
26240 // - RC Opcode is specified and
26241 // - RC is not "current direction".
26242 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26243 if (IntrWithRoundingModeOpcode != 0) {
26244 SDValue Rnd = Op.getOperand(4);
26245 unsigned RC = 0;
26246 if (isRoundModeSAEToX(Rnd, RC))
26247 return getVectorMaskingNode(
26248 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26249 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26250 Mask, PassThru, Subtarget, DAG);
26251 if (!isRoundModeCurDirection(Rnd))
26252 return SDValue();
26253 }
26254 return getVectorMaskingNode(
26255 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26256 Subtarget, DAG);
26257 }
26258 case INTR_TYPE_1OP_MASK_SAE: {
26259 SDValue Src = Op.getOperand(1);
26260 SDValue PassThru = Op.getOperand(2);
26261 SDValue Mask = Op.getOperand(3);
26262 SDValue Rnd = Op.getOperand(4);
26263
26264 unsigned Opc;
26265 if (isRoundModeCurDirection(Rnd))
26266 Opc = IntrData->Opc0;
26267 else if (isRoundModeSAE(Rnd))
26268 Opc = IntrData->Opc1;
26269 else
26270 return SDValue();
26271
26272 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26273 Subtarget, DAG);
26274 }
26275 case INTR_TYPE_SCALAR_MASK: {
26276 SDValue Src1 = Op.getOperand(1);
26277 SDValue Src2 = Op.getOperand(2);
26278 SDValue passThru = Op.getOperand(3);
26279 SDValue Mask = Op.getOperand(4);
26280 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26281 // There are 2 kinds of intrinsics in this group:
26282 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26283 // (2) With rounding mode and sae - 7 operands.
26284 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26285 if (Op.getNumOperands() == (5U + HasRounding)) {
26286 if (HasRounding) {
26287 SDValue Rnd = Op.getOperand(5);
26288 unsigned RC = 0;
26289 if (isRoundModeSAEToX(Rnd, RC))
26290 return getScalarMaskingNode(
26291 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26292 DAG.getTargetConstant(RC, dl, MVT::i32)),
26293 Mask, passThru, Subtarget, DAG);
26294 if (!isRoundModeCurDirection(Rnd))
26295 return SDValue();
26296 }
26297 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26298 Src2),
26299 Mask, passThru, Subtarget, DAG);
26300 }
26301
26302 assert(Op.getNumOperands() == (6U + HasRounding) &&
26303 "Unexpected intrinsic form");
26304 SDValue RoundingMode = Op.getOperand(5);
26305 unsigned Opc = IntrData->Opc0;
26306 if (HasRounding) {
26307 SDValue Sae = Op.getOperand(6);
26308 if (isRoundModeSAE(Sae))
26309 Opc = IntrWithRoundingModeOpcode;
26310 else if (!isRoundModeCurDirection(Sae))
26311 return SDValue();
26312 }
26313 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26314 Src2, RoundingMode),
26315 Mask, passThru, Subtarget, DAG);
26316 }
26317 case INTR_TYPE_SCALAR_MASK_RND: {
26318 SDValue Src1 = Op.getOperand(1);
26319 SDValue Src2 = Op.getOperand(2);
26320 SDValue passThru = Op.getOperand(3);
26321 SDValue Mask = Op.getOperand(4);
26322 SDValue Rnd = Op.getOperand(5);
26323
26324 SDValue NewOp;
26325 unsigned RC = 0;
26326 if (isRoundModeCurDirection(Rnd))
26327 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26328 else if (isRoundModeSAEToX(Rnd, RC))
26329 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26330 DAG.getTargetConstant(RC, dl, MVT::i32));
26331 else
26332 return SDValue();
26333
26334 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26335 }
26336 case INTR_TYPE_SCALAR_MASK_SAE: {
26337 SDValue Src1 = Op.getOperand(1);
26338 SDValue Src2 = Op.getOperand(2);
26339 SDValue passThru = Op.getOperand(3);
26340 SDValue Mask = Op.getOperand(4);
26341 SDValue Sae = Op.getOperand(5);
26342 unsigned Opc;
26343 if (isRoundModeCurDirection(Sae))
26344 Opc = IntrData->Opc0;
26345 else if (isRoundModeSAE(Sae))
26346 Opc = IntrData->Opc1;
26347 else
26348 return SDValue();
26349
26350 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26351 Mask, passThru, Subtarget, DAG);
26352 }
26353 case INTR_TYPE_2OP_MASK: {
26354 SDValue Src1 = Op.getOperand(1);
26355 SDValue Src2 = Op.getOperand(2);
26356 SDValue PassThru = Op.getOperand(3);
26357 SDValue Mask = Op.getOperand(4);
26358 SDValue NewOp;
26359 if (IntrData->Opc1 != 0) {
26360 SDValue Rnd = Op.getOperand(5);
26361 unsigned RC = 0;
26362 if (isRoundModeSAEToX(Rnd, RC))
26363 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26364 DAG.getTargetConstant(RC, dl, MVT::i32));
26365 else if (!isRoundModeCurDirection(Rnd))
26366 return SDValue();
26367 }
26368 if (!NewOp)
26369 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26370 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26371 }
26372 case INTR_TYPE_2OP_MASK_SAE: {
26373 SDValue Src1 = Op.getOperand(1);
26374 SDValue Src2 = Op.getOperand(2);
26375 SDValue PassThru = Op.getOperand(3);
26376 SDValue Mask = Op.getOperand(4);
26377
26378 unsigned Opc = IntrData->Opc0;
26379 if (IntrData->Opc1 != 0) {
26380 SDValue Sae = Op.getOperand(5);
26381 if (isRoundModeSAE(Sae))
26382 Opc = IntrData->Opc1;
26383 else if (!isRoundModeCurDirection(Sae))
26384 return SDValue();
26385 }
26386
26387 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26388 Mask, PassThru, Subtarget, DAG);
26389 }
26390 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26391 SDValue Src1 = Op.getOperand(1);
26392 SDValue Src2 = Op.getOperand(2);
26393 SDValue Src3 = Op.getOperand(3);
26394 SDValue PassThru = Op.getOperand(4);
26395 SDValue Mask = Op.getOperand(5);
26396 SDValue Sae = Op.getOperand(6);
26397 unsigned Opc;
26398 if (isRoundModeCurDirection(Sae))
26399 Opc = IntrData->Opc0;
26400 else if (isRoundModeSAE(Sae))
26401 Opc = IntrData->Opc1;
26402 else
26403 return SDValue();
26404
26405 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26406 Mask, PassThru, Subtarget, DAG);
26407 }
26408 case INTR_TYPE_3OP_MASK_SAE: {
26409 SDValue Src1 = Op.getOperand(1);
26410 SDValue Src2 = Op.getOperand(2);
26411 SDValue Src3 = Op.getOperand(3);
26412 SDValue PassThru = Op.getOperand(4);
26413 SDValue Mask = Op.getOperand(5);
26414
26415 unsigned Opc = IntrData->Opc0;
26416 if (IntrData->Opc1 != 0) {
26417 SDValue Sae = Op.getOperand(6);
26418 if (isRoundModeSAE(Sae))
26419 Opc = IntrData->Opc1;
26420 else if (!isRoundModeCurDirection(Sae))
26421 return SDValue();
26422 }
26423 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26424 Mask, PassThru, Subtarget, DAG);
26425 }
26426 case BLENDV: {
26427 SDValue Src1 = Op.getOperand(1);
26428 SDValue Src2 = Op.getOperand(2);
26429 SDValue Src3 = Op.getOperand(3);
26430
26431 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26432 Src3 = DAG.getBitcast(MaskVT, Src3);
26433
26434 // Reverse the operands to match VSELECT order.
26435 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26436 }
26437 case VPERM_2OP : {
26438 SDValue Src1 = Op.getOperand(1);
26439 SDValue Src2 = Op.getOperand(2);
26440
26441 // Swap Src1 and Src2 in the node creation
26442 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26443 }
26444 case CFMA_OP_MASKZ:
26445 case CFMA_OP_MASK: {
26446 SDValue Src1 = Op.getOperand(1);
26447 SDValue Src2 = Op.getOperand(2);
26448 SDValue Src3 = Op.getOperand(3);
26449 SDValue Mask = Op.getOperand(4);
26450 MVT VT = Op.getSimpleValueType();
26451
26452 SDValue PassThru = Src3;
26453 if (IntrData->Type == CFMA_OP_MASKZ)
26454 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26455
26456 // We add rounding mode to the Node when
26457 // - RC Opcode is specified and
26458 // - RC is not "current direction".
26459 SDValue NewOp;
26460 if (IntrData->Opc1 != 0) {
26461 SDValue Rnd = Op.getOperand(5);
26462 unsigned RC = 0;
26463 if (isRoundModeSAEToX(Rnd, RC))
26464 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26465 DAG.getTargetConstant(RC, dl, MVT::i32));
26466 else if (!isRoundModeCurDirection(Rnd))
26467 return SDValue();
26468 }
26469 if (!NewOp)
26470 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26471 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26472 }
26473 case IFMA_OP:
26474 // NOTE: We need to swizzle the operands to pass the multiply operands
26475 // first.
26476 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26477 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26478 case FPCLASSS: {
26479 SDValue Src1 = Op.getOperand(1);
26480 SDValue Imm = Op.getOperand(2);
26481 SDValue Mask = Op.getOperand(3);
26482 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26483 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26484 Subtarget, DAG);
26485 // Need to fill with zeros to ensure the bitcast will produce zeroes
26486 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26487 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26488 DAG.getConstant(0, dl, MVT::v8i1),
26489 FPclassMask, DAG.getIntPtrConstant(0, dl));
26490 return DAG.getBitcast(MVT::i8, Ins);
26491 }
26492
26493 case CMP_MASK_CC: {
26494 MVT MaskVT = Op.getSimpleValueType();
26495 SDValue CC = Op.getOperand(3);
26496 SDValue Mask = Op.getOperand(4);
26497 // We specify 2 possible opcodes for intrinsics with rounding modes.
26498 // First, we check if the intrinsic may have non-default rounding mode,
26499 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26500 if (IntrData->Opc1 != 0) {
26501 SDValue Sae = Op.getOperand(5);
26502 if (isRoundModeSAE(Sae))
26503 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26504 Op.getOperand(2), CC, Mask, Sae);
26505 if (!isRoundModeCurDirection(Sae))
26506 return SDValue();
26507 }
26508 //default rounding mode
26509 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26510 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26511 }
26512 case CMP_MASK_SCALAR_CC: {
26513 SDValue Src1 = Op.getOperand(1);
26514 SDValue Src2 = Op.getOperand(2);
26515 SDValue CC = Op.getOperand(3);
26516 SDValue Mask = Op.getOperand(4);
26517
26518 SDValue Cmp;
26519 if (IntrData->Opc1 != 0) {
26520 SDValue Sae = Op.getOperand(5);
26521 if (isRoundModeSAE(Sae))
26522 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26523 else if (!isRoundModeCurDirection(Sae))
26524 return SDValue();
26525 }
26526 //default rounding mode
26527 if (!Cmp.getNode())
26528 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26529
26530 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26531 Subtarget, DAG);
26532 // Need to fill with zeros to ensure the bitcast will produce zeroes
26533 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26534 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26535 DAG.getConstant(0, dl, MVT::v8i1),
26536 CmpMask, DAG.getIntPtrConstant(0, dl));
26537 return DAG.getBitcast(MVT::i8, Ins);
26538 }
26539 case COMI: { // Comparison intrinsics
26540 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26541 SDValue LHS = Op.getOperand(1);
26542 SDValue RHS = Op.getOperand(2);
26543 // Some conditions require the operands to be swapped.
26544 if (CC == ISD::SETLT || CC == ISD::SETLE)
26545 std::swap(LHS, RHS);
26546
26547 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
26548 SDValue SetCC;
26549 switch (CC) {
26550 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
26551 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26552 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26553 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26554 break;
26555 }
26556 case ISD::SETNE: { // (ZF = 1 or PF = 1)
26557 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26558 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26559 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26560 break;
26561 }
26562 case ISD::SETGT: // (CF = 0 and ZF = 0)
26563 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26564 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26565 break;
26566 }
26567 case ISD::SETGE: // CF = 0
26568 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26569 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26570 break;
26571 default:
26572 llvm_unreachable("Unexpected illegal condition!");
26573 }
26574 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26575 }
26576 case COMI_RM: { // Comparison intrinsics with Sae
26577 SDValue LHS = Op.getOperand(1);
26578 SDValue RHS = Op.getOperand(2);
26579 unsigned CondVal = Op.getConstantOperandVal(3);
26580 SDValue Sae = Op.getOperand(4);
26581
26582 SDValue FCmp;
26583 if (isRoundModeCurDirection(Sae))
26584 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26585 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26586 else if (isRoundModeSAE(Sae))
26587 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26588 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26589 else
26590 return SDValue();
26591 // Need to fill with zeros to ensure the bitcast will produce zeroes
26592 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26593 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26594 DAG.getConstant(0, dl, MVT::v16i1),
26595 FCmp, DAG.getIntPtrConstant(0, dl));
26596 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26597 DAG.getBitcast(MVT::i16, Ins));
26598 }
26599 case VSHIFT: {
26600 SDValue SrcOp = Op.getOperand(1);
26601 SDValue ShAmt = Op.getOperand(2);
26602 assert(ShAmt.getValueType() == MVT::i32 &&
26603 "Unexpected VSHIFT amount type");
26604
26605 // Catch shift-by-constant.
26606 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26607 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26608 Op.getSimpleValueType(), SrcOp,
26609 CShAmt->getZExtValue(), DAG);
26610
26611 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26612 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26613 SrcOp, ShAmt, 0, Subtarget, DAG);
26614 }
26615 case COMPRESS_EXPAND_IN_REG: {
26616 SDValue Mask = Op.getOperand(3);
26617 SDValue DataToCompress = Op.getOperand(1);
26618 SDValue PassThru = Op.getOperand(2);
26619 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26620 return Op.getOperand(1);
26621
26622 // Avoid false dependency.
26623 if (PassThru.isUndef())
26624 PassThru = DAG.getConstant(0, dl, VT);
26625
26626 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26627 Mask);
26628 }
26629 case FIXUPIMM:
26630 case FIXUPIMM_MASKZ: {
26631 SDValue Src1 = Op.getOperand(1);
26632 SDValue Src2 = Op.getOperand(2);
26633 SDValue Src3 = Op.getOperand(3);
26634 SDValue Imm = Op.getOperand(4);
26635 SDValue Mask = Op.getOperand(5);
26636 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26637 ? Src1
26638 : getZeroVector(VT, Subtarget, DAG, dl);
26639
26640 unsigned Opc = IntrData->Opc0;
26641 if (IntrData->Opc1 != 0) {
26642 SDValue Sae = Op.getOperand(6);
26643 if (isRoundModeSAE(Sae))
26644 Opc = IntrData->Opc1;
26645 else if (!isRoundModeCurDirection(Sae))
26646 return SDValue();
26647 }
26648
26649 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26650
26651 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26652 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26653
26654 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26655 }
26656 case ROUNDP: {
26657 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26658 // Clear the upper bits of the rounding immediate so that the legacy
26659 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26660 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
26661 SDValue RoundingMode =
26662 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26663 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26664 Op.getOperand(1), RoundingMode);
26665 }
26666 case ROUNDS: {
26667 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26668 // Clear the upper bits of the rounding immediate so that the legacy
26669 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26670 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
26671 SDValue RoundingMode =
26672 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26673 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26674 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26675 }
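   // BEXTRI takes its control word as an immediate; only the low 16 bits are
   // meaningful (the start bit index and the field length each occupy one
   // byte), so the constant operand is masked with 0xffff before building the
   // target node.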
26676 case BEXTRI: {
26677     assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26678
26679 uint64_t Imm = Op.getConstantOperandVal(2);
26680 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26681 Op.getValueType());
26682 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26683 Op.getOperand(1), Control);
26684 }
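   // For the ADX case below, the incoming carry is a plain integer operand, so
   // it has to be turned back into EFLAGS.CF before the ADC/SBB node can use
   // it: adding -1 (0xFF) to the 8-bit carry value produces a carry-out exactly
   // when that value is nonzero. A carry-in known to be zero skips this and
   // uses a plain ADD/SUB instead. The final result pairs the CF flag
   // (materialized with SETB) with the arithmetic value.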
26685 // ADC/ADCX/SBB
26686 case ADX: {
26687 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26688 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26689
26690 SDValue Res;
26691 // If the carry in is zero, then we should just use ADD/SUB instead of
26692 // ADC/SBB.
26693 if (isNullConstant(Op.getOperand(1))) {
26694 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26695 Op.getOperand(3));
26696 } else {
26697 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26698 DAG.getConstant(-1, dl, MVT::i8));
26699 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26700 Op.getOperand(3), GenCF.getValue(1));
26701 }
26702 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26703 SDValue Results[] = { SetCC, Res };
26704 return DAG.getMergeValues(Results, dl);
26705 }
26706 case CVTPD2PS_MASK:
26707 case CVTPD2DQ_MASK:
26708 case CVTQQ2PS_MASK:
26709 case TRUNCATE_TO_REG: {
26710 SDValue Src = Op.getOperand(1);
26711 SDValue PassThru = Op.getOperand(2);
26712 SDValue Mask = Op.getOperand(3);
26713
26714 if (isAllOnesConstant(Mask))
26715 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26716
26717 MVT SrcVT = Src.getSimpleValueType();
26718 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26719 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26720 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26721 {Src, PassThru, Mask});
26722 }
26723 case CVTPS2PH_MASK: {
26724 SDValue Src = Op.getOperand(1);
26725 SDValue Rnd = Op.getOperand(2);
26726 SDValue PassThru = Op.getOperand(3);
26727 SDValue Mask = Op.getOperand(4);
26728
26729 if (isAllOnesConstant(Mask))
26730 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
26731
26732 MVT SrcVT = Src.getSimpleValueType();
26733 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26734 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26735 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
26736 PassThru, Mask);
26737
26738 }
26739 case CVTNEPS2BF16_MASK: {
26740 SDValue Src = Op.getOperand(1);
26741 SDValue PassThru = Op.getOperand(2);
26742 SDValue Mask = Op.getOperand(3);
26743
26744 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26745 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26746
26747 // Break false dependency.
26748 if (PassThru.isUndef())
26749 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26750
26751 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26752 Mask);
26753 }
26754 default:
26755 break;
26756 }
26757 }
26758
26759 switch (IntNo) {
26760 default: return SDValue(); // Don't custom lower most intrinsics.
26761
26762  // ptest and testp intrinsics. The intrinsics these come from are designed to
26763  // return an integer value, not just an instruction, so lower it to the ptest
26764  // or testp pattern and a setcc for the result.
26765 case Intrinsic::x86_avx512_ktestc_b:
26766 case Intrinsic::x86_avx512_ktestc_w:
26767 case Intrinsic::x86_avx512_ktestc_d:
26768 case Intrinsic::x86_avx512_ktestc_q:
26769 case Intrinsic::x86_avx512_ktestz_b:
26770 case Intrinsic::x86_avx512_ktestz_w:
26771 case Intrinsic::x86_avx512_ktestz_d:
26772 case Intrinsic::x86_avx512_ktestz_q:
26773 case Intrinsic::x86_sse41_ptestz:
26774 case Intrinsic::x86_sse41_ptestc:
26775 case Intrinsic::x86_sse41_ptestnzc:
26776 case Intrinsic::x86_avx_ptestz_256:
26777 case Intrinsic::x86_avx_ptestc_256:
26778 case Intrinsic::x86_avx_ptestnzc_256:
26779 case Intrinsic::x86_avx_vtestz_ps:
26780 case Intrinsic::x86_avx_vtestc_ps:
26781 case Intrinsic::x86_avx_vtestnzc_ps:
26782 case Intrinsic::x86_avx_vtestz_pd:
26783 case Intrinsic::x86_avx_vtestc_pd:
26784 case Intrinsic::x86_avx_vtestnzc_pd:
26785 case Intrinsic::x86_avx_vtestz_ps_256:
26786 case Intrinsic::x86_avx_vtestc_ps_256:
26787 case Intrinsic::x86_avx_vtestnzc_ps_256:
26788 case Intrinsic::x86_avx_vtestz_pd_256:
26789 case Intrinsic::x86_avx_vtestc_pd_256:
26790 case Intrinsic::x86_avx_vtestnzc_pd_256: {
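     // All of these test intrinsics reduce to one flag-producing node plus a
     // SETCC: KTEST for the AVX-512 mask-register variants, TESTP for the AVX
     // floating-point vtest* variants, and PTEST for the rest. The *z forms
     // read ZF (COND_E), the *c forms read CF (COND_B), and the *nzc forms
     // require both flags clear (COND_A); the selected flag is zero-extended
     // to the i32 result.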
26791 unsigned TestOpc = X86ISD::PTEST;
26792 X86::CondCode X86CC;
26793 switch (IntNo) {
26794     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26795 case Intrinsic::x86_avx512_ktestc_b:
26796 case Intrinsic::x86_avx512_ktestc_w:
26797 case Intrinsic::x86_avx512_ktestc_d:
26798 case Intrinsic::x86_avx512_ktestc_q:
26799 // CF = 1
26800 TestOpc = X86ISD::KTEST;
26801 X86CC = X86::COND_B;
26802 break;
26803 case Intrinsic::x86_avx512_ktestz_b:
26804 case Intrinsic::x86_avx512_ktestz_w:
26805 case Intrinsic::x86_avx512_ktestz_d:
26806 case Intrinsic::x86_avx512_ktestz_q:
26807 TestOpc = X86ISD::KTEST;
26808 X86CC = X86::COND_E;
26809 break;
26810 case Intrinsic::x86_avx_vtestz_ps:
26811 case Intrinsic::x86_avx_vtestz_pd:
26812 case Intrinsic::x86_avx_vtestz_ps_256:
26813 case Intrinsic::x86_avx_vtestz_pd_256:
26814 TestOpc = X86ISD::TESTP;
26815       LLVM_FALLTHROUGH;
26816 case Intrinsic::x86_sse41_ptestz:
26817 case Intrinsic::x86_avx_ptestz_256:
26818 // ZF = 1
26819 X86CC = X86::COND_E;
26820 break;
26821 case Intrinsic::x86_avx_vtestc_ps:
26822 case Intrinsic::x86_avx_vtestc_pd:
26823 case Intrinsic::x86_avx_vtestc_ps_256:
26824 case Intrinsic::x86_avx_vtestc_pd_256:
26825 TestOpc = X86ISD::TESTP;
26826       LLVM_FALLTHROUGH;
26827 case Intrinsic::x86_sse41_ptestc:
26828 case Intrinsic::x86_avx_ptestc_256:
26829 // CF = 1
26830 X86CC = X86::COND_B;
26831 break;
26832 case Intrinsic::x86_avx_vtestnzc_ps:
26833 case Intrinsic::x86_avx_vtestnzc_pd:
26834 case Intrinsic::x86_avx_vtestnzc_ps_256:
26835 case Intrinsic::x86_avx_vtestnzc_pd_256:
26836 TestOpc = X86ISD::TESTP;
26837       LLVM_FALLTHROUGH;
26838 case Intrinsic::x86_sse41_ptestnzc:
26839 case Intrinsic::x86_avx_ptestnzc_256:
26840 // ZF and CF = 0
26841 X86CC = X86::COND_A;
26842 break;
26843 }
26844
26845 SDValue LHS = Op.getOperand(1);
26846 SDValue RHS = Op.getOperand(2);
26847 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26848 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26849 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26850 }
26851
26852 case Intrinsic::x86_sse42_pcmpistria128:
26853 case Intrinsic::x86_sse42_pcmpestria128:
26854 case Intrinsic::x86_sse42_pcmpistric128:
26855 case Intrinsic::x86_sse42_pcmpestric128:
26856 case Intrinsic::x86_sse42_pcmpistrio128:
26857 case Intrinsic::x86_sse42_pcmpestrio128:
26858 case Intrinsic::x86_sse42_pcmpistris128:
26859 case Intrinsic::x86_sse42_pcmpestris128:
26860 case Intrinsic::x86_sse42_pcmpistriz128:
26861 case Intrinsic::x86_sse42_pcmpestriz128: {
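     // PCMPISTR/PCMPESTR nodes produce three results: the index (i32), the
     // mask (v16i8), and the flags (i32). The *a/*c/*o/*s/*z intrinsic forms
     // only care about one flag, so the flags result (value #2) is fed into a
     // SETCC for the chosen condition and zero-extended to i32.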
26862 unsigned Opcode;
26863 X86::CondCode X86CC;
26864 switch (IntNo) {
26865     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26866 case Intrinsic::x86_sse42_pcmpistria128:
26867 Opcode = X86ISD::PCMPISTR;
26868 X86CC = X86::COND_A;
26869 break;
26870 case Intrinsic::x86_sse42_pcmpestria128:
26871 Opcode = X86ISD::PCMPESTR;
26872 X86CC = X86::COND_A;
26873 break;
26874 case Intrinsic::x86_sse42_pcmpistric128:
26875 Opcode = X86ISD::PCMPISTR;
26876 X86CC = X86::COND_B;
26877 break;
26878 case Intrinsic::x86_sse42_pcmpestric128:
26879 Opcode = X86ISD::PCMPESTR;
26880 X86CC = X86::COND_B;
26881 break;
26882 case Intrinsic::x86_sse42_pcmpistrio128:
26883 Opcode = X86ISD::PCMPISTR;
26884 X86CC = X86::COND_O;
26885 break;
26886 case Intrinsic::x86_sse42_pcmpestrio128:
26887 Opcode = X86ISD::PCMPESTR;
26888 X86CC = X86::COND_O;
26889 break;
26890 case Intrinsic::x86_sse42_pcmpistris128:
26891 Opcode = X86ISD::PCMPISTR;
26892 X86CC = X86::COND_S;
26893 break;
26894 case Intrinsic::x86_sse42_pcmpestris128:
26895 Opcode = X86ISD::PCMPESTR;
26896 X86CC = X86::COND_S;
26897 break;
26898 case Intrinsic::x86_sse42_pcmpistriz128:
26899 Opcode = X86ISD::PCMPISTR;
26900 X86CC = X86::COND_E;
26901 break;
26902 case Intrinsic::x86_sse42_pcmpestriz128:
26903 Opcode = X86ISD::PCMPESTR;
26904 X86CC = X86::COND_E;
26905 break;
26906 }
26907 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26908 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26909 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26910 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26911 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26912 }
26913
26914 case Intrinsic::x86_sse42_pcmpistri128:
26915 case Intrinsic::x86_sse42_pcmpestri128: {
26916 unsigned Opcode;
26917 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26918 Opcode = X86ISD::PCMPISTR;
26919 else
26920 Opcode = X86ISD::PCMPESTR;
26921
26922 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26923 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26924 return DAG.getNode(Opcode, dl, VTs, NewOps);
26925 }
26926
26927 case Intrinsic::x86_sse42_pcmpistrm128:
26928 case Intrinsic::x86_sse42_pcmpestrm128: {
26929 unsigned Opcode;
26930 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26931 Opcode = X86ISD::PCMPISTR;
26932 else
26933 Opcode = X86ISD::PCMPESTR;
26934
26935 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26936 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26937 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26938 }
26939
26940 case Intrinsic::eh_sjlj_lsda: {
26941 MachineFunction &MF = DAG.getMachineFunction();
26942 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26943 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26944 auto &Context = MF.getMMI().getContext();
26945 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26946 Twine(MF.getFunctionNumber()));
26947 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26948 DAG.getMCSymbol(S, PtrVT));
26949 }
26950
26951 case Intrinsic::x86_seh_lsda: {
26952 // Compute the symbol for the LSDA. We know it'll get emitted later.
26953 MachineFunction &MF = DAG.getMachineFunction();
26954 SDValue Op1 = Op.getOperand(1);
26955 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26956 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26957 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26958
26959 // Generate a simple absolute symbol reference. This intrinsic is only
26960 // supported on 32-bit Windows, which isn't PIC.
26961 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26962 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26963 }
26964
26965 case Intrinsic::eh_recoverfp: {
26966 SDValue FnOp = Op.getOperand(1);
26967 SDValue IncomingFPOp = Op.getOperand(2);
26968 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26969 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26970 if (!Fn)
26971 report_fatal_error(
26972 "llvm.eh.recoverfp must take a function as the first argument");
26973 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26974 }
26975
26976 case Intrinsic::localaddress: {
26977 // Returns one of the stack, base, or frame pointer registers, depending on
26978 // which is used to reference local variables.
26979 MachineFunction &MF = DAG.getMachineFunction();
26980 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26981 unsigned Reg;
26982 if (RegInfo->hasBasePointer(MF))
26983 Reg = RegInfo->getBaseRegister();
26984 else { // Handles the SP or FP case.
26985 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26986 if (CantUseFP)
26987 Reg = RegInfo->getPtrSizedStackRegister(MF);
26988 else
26989 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26990 }
26991 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26992 }
26993 case Intrinsic::swift_async_context_addr: {
26994 auto &MF = DAG.getMachineFunction();
26995 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26996 if (Subtarget.is64Bit()) {
26997 MF.getFrameInfo().setFrameAddressIsTaken(true);
26998 X86FI->setHasSwiftAsyncContext(true);
26999 return SDValue(
27000 DAG.getMachineNode(
27001 X86::SUB64ri8, dl, MVT::i64,
27002 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
27003 DAG.getTargetConstant(8, dl, MVT::i32)),
27004 0);
27005 } else {
27006 // 32-bit so no special extended frame, create or reuse an existing stack
27007 // slot.
27008 if (!X86FI->getSwiftAsyncContextFrameIdx())
27009 X86FI->setSwiftAsyncContextFrameIdx(
27010 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
27011 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
27012 }
27013 }
27014 case Intrinsic::x86_avx512_vp2intersect_q_512:
27015 case Intrinsic::x86_avx512_vp2intersect_q_256:
27016 case Intrinsic::x86_avx512_vp2intersect_q_128:
27017 case Intrinsic::x86_avx512_vp2intersect_d_512:
27018 case Intrinsic::x86_avx512_vp2intersect_d_256:
27019 case Intrinsic::x86_avx512_vp2intersect_d_128: {
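     // VP2INTERSECT writes a pair of mask registers. The node is given the
     // MVT::Untyped result type to stand for that register pair, and the two
     // k-mask results are then peeled off with the sub_mask_0 / sub_mask_1
     // subregister extracts and returned as merged values.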
27020 MVT MaskVT = Op.getSimpleValueType();
27021
27022 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27023 SDLoc DL(Op);
27024
27025 SDValue Operation =
27026 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27027 Op->getOperand(1), Op->getOperand(2));
27028
27029 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
27030 MaskVT, Operation);
27031 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
27032 MaskVT, Operation);
27033 return DAG.getMergeValues({Result0, Result1}, DL);
27034 }
27035 case Intrinsic::x86_mmx_pslli_w:
27036 case Intrinsic::x86_mmx_pslli_d:
27037 case Intrinsic::x86_mmx_pslli_q:
27038 case Intrinsic::x86_mmx_psrli_w:
27039 case Intrinsic::x86_mmx_psrli_d:
27040 case Intrinsic::x86_mmx_psrli_q:
27041 case Intrinsic::x86_mmx_psrai_w:
27042 case Intrinsic::x86_mmx_psrai_d: {
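     // Two paths: a constant shift amount is clamped and re-emitted as the
     // same intrinsic with a target-constant operand (a zero amount is a
     // no-op and just returns the source), while a variable amount is moved
     // into an MMX register with MMX_MOVW2D and dispatched to the matching
     // shift-by-register MMX intrinsic below.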
27043 SDLoc DL(Op);
27044 SDValue ShAmt = Op.getOperand(2);
27045 // If the argument is a constant, convert it to a target constant.
27046 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27047 // Clamp out of bounds shift amounts since they will otherwise be masked
27048 // to 8-bits which may make it no longer out of bounds.
27049 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27050 if (ShiftAmount == 0)
27051 return Op.getOperand(1);
27052
27053 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27054 Op.getOperand(0), Op.getOperand(1),
27055 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27056 }
27057
27058 unsigned NewIntrinsic;
27059 switch (IntNo) {
27060     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27061 case Intrinsic::x86_mmx_pslli_w:
27062 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27063 break;
27064 case Intrinsic::x86_mmx_pslli_d:
27065 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27066 break;
27067 case Intrinsic::x86_mmx_pslli_q:
27068 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27069 break;
27070 case Intrinsic::x86_mmx_psrli_w:
27071 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27072 break;
27073 case Intrinsic::x86_mmx_psrli_d:
27074 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27075 break;
27076 case Intrinsic::x86_mmx_psrli_q:
27077 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27078 break;
27079 case Intrinsic::x86_mmx_psrai_w:
27080 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27081 break;
27082 case Intrinsic::x86_mmx_psrai_d:
27083 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27084 break;
27085 }
27086
27087    // The vector shift intrinsics with scalar shift amounts use 32-bit values,
27088    // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27089    // MMX register.
27090 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27091 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27092 DAG.getTargetConstant(NewIntrinsic, DL,
27093 getPointerTy(DAG.getDataLayout())),
27094 Op.getOperand(1), ShAmt);
27095 }
27096 case Intrinsic::thread_pointer: {
27097 if (Subtarget.isTargetELF()) {
27098 SDLoc dl(Op);
27099 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27100 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27101 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
27102 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27103 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27104 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27105 }
27106 report_fatal_error(
27107 "Target OS doesn't support __builtin_thread_pointer() yet.");
27108 }
27109 }
27110}
27111
27112static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27113 SDValue Src, SDValue Mask, SDValue Base,
27114 SDValue Index, SDValue ScaleOp, SDValue Chain,
27115 const X86Subtarget &Subtarget) {
27116 SDLoc dl(Op);
27117 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27118 // Scale must be constant.
27119 if (!C)
27120 return SDValue();
27121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27122 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27123 TLI.getPointerTy(DAG.getDataLayout()));
27124 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27125 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27126 // If source is undef or we know it won't be used, use a zero vector
27127 // to break register dependency.
27128 // TODO: use undef instead and let BreakFalseDeps deal with it?
27129 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27130 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27131
27132 // Cast mask to an integer type.
27133 Mask = DAG.getBitcast(MaskVT, Mask);
27134
27135 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27136
27137 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27138 SDValue Res =
27139 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27140 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27141 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27142}
27143
27144static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27145 SDValue Src, SDValue Mask, SDValue Base,
27146 SDValue Index, SDValue ScaleOp, SDValue Chain,
27147 const X86Subtarget &Subtarget) {
27148 MVT VT = Op.getSimpleValueType();
27149 SDLoc dl(Op);
27150 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27151 // Scale must be constant.
27152 if (!C)
27153 return SDValue();
27154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27155 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27156 TLI.getPointerTy(DAG.getDataLayout()));
27157 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27158 VT.getVectorNumElements());
27159 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27160
27161 // We support two versions of the gather intrinsics. One with scalar mask and
27162 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27163 if (Mask.getValueType() != MaskVT)
27164 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27165
27166 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27167 // If source is undef or we know it won't be used, use a zero vector
27168 // to break register dependency.
27169 // TODO: use undef instead and let BreakFalseDeps deal with it?
27170 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27171 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27172
27173 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27174
27175 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27176 SDValue Res =
27177 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27178 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27179 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27180}
27181
27182static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27183 SDValue Src, SDValue Mask, SDValue Base,
27184 SDValue Index, SDValue ScaleOp, SDValue Chain,
27185 const X86Subtarget &Subtarget) {
27186 SDLoc dl(Op);
27187 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27188 // Scale must be constant.
27189 if (!C)
27190 return SDValue();
27191 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27192 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27193 TLI.getPointerTy(DAG.getDataLayout()));
27194 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27195 Src.getSimpleValueType().getVectorNumElements());
27196 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27197
27198 // We support two versions of the scatter intrinsics. One with scalar mask and
27199 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27200 if (Mask.getValueType() != MaskVT)
27201 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27202
27203 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27204
27205 SDVTList VTs = DAG.getVTList(MVT::Other);
27206 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27207 SDValue Res =
27208 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27209 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27210 return Res;
27211}
27212
27213static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27214 SDValue Mask, SDValue Base, SDValue Index,
27215 SDValue ScaleOp, SDValue Chain,
27216 const X86Subtarget &Subtarget) {
27217 SDLoc dl(Op);
27218 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27219 // Scale must be constant.
27220 if (!C)
27221 return SDValue();
27222 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27223 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27224 TLI.getPointerTy(DAG.getDataLayout()));
27225 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27226 SDValue Segment = DAG.getRegister(0, MVT::i32);
27227 MVT MaskVT =
27228 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27229 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27230 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27231 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27232 return SDValue(Res, 0);
27233}
27234
27235/// Handles the lowering of builtin intrinsics with chain that return their
27236/// value into registers EDX:EAX.
27237/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27238/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27239/// TargetOpcode.
27240/// Returns a Glue value which can be used to add an extra copy-from-reg if the
27241/// expanded intrinsic implicitly defines extra registers (i.e. not just
27242/// EDX:EAX).
27243static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27244 SelectionDAG &DAG,
27245 unsigned TargetOpcode,
27246 unsigned SrcReg,
27247 const X86Subtarget &Subtarget,
27248 SmallVectorImpl<SDValue> &Results) {
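  // Overall flow: optionally copy operand 2 into SrcReg (glued to the chain),
  // emit the target opcode, read the low half from EAX/RAX and the high half
  // from EDX/RDX, and push a single 64-bit result plus the chain into Results.
  // The trailing glue lets callers read additional implicitly defined
  // registers (e.g. ECX for RDTSCP).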
27249 SDValue Chain = N->getOperand(0);
27250 SDValue Glue;
27251
27252 if (SrcReg) {
27253     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27254 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27255 Glue = Chain.getValue(1);
27256 }
27257
27258 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27259 SDValue N1Ops[] = {Chain, Glue};
27260 SDNode *N1 = DAG.getMachineNode(
27261 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27262 Chain = SDValue(N1, 0);
27263
27264 // Reads the content of XCR and returns it in registers EDX:EAX.
27265 SDValue LO, HI;
27266 if (Subtarget.is64Bit()) {
27267 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27268 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27269 LO.getValue(2));
27270 } else {
27271 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27272 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27273 LO.getValue(2));
27274 }
27275 Chain = HI.getValue(1);
27276 Glue = HI.getValue(2);
27277
27278 if (Subtarget.is64Bit()) {
27279 // Merge the two 32-bit values into a 64-bit one.
27280 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27281 DAG.getConstant(32, DL, MVT::i8));
27282 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27283 Results.push_back(Chain);
27284 return Glue;
27285 }
27286
27287 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27288 SDValue Ops[] = { LO, HI };
27289 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27290 Results.push_back(Pair);
27291 Results.push_back(Chain);
27292 return Glue;
27293}
27294
27295/// Handles the lowering of builtin intrinsics that read the time stamp counter
27296/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27297/// READCYCLECOUNTER nodes.
27298static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27299 SelectionDAG &DAG,
27300 const X86Subtarget &Subtarget,
27301 SmallVectorImpl<SDValue> &Results) {
27302 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27303 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27304 // and the EAX register is loaded with the low-order 32 bits.
27305 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27306 /* NoRegister */0, Subtarget,
27307 Results);
27308 if (Opcode != X86::RDTSCP)
27309 return;
27310
27311 SDValue Chain = Results[1];
27312  // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
27313  // into the ECX register. Add 'ecx' explicitly to the chain.
27314 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27315 Results[1] = ecx;
27316 Results.push_back(ecx.getValue(1));
27317}
27318
27319static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27320 SelectionDAG &DAG) {
27321 SmallVector<SDValue, 3> Results;
27322 SDLoc DL(Op);
27323 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27324 Results);
27325 return DAG.getMergeValues(Results, DL);
27326}
27327
27328static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27329 MachineFunction &MF = DAG.getMachineFunction();
27330 SDValue Chain = Op.getOperand(0);
27331 SDValue RegNode = Op.getOperand(2);
27332 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27333 if (!EHInfo)
27334 report_fatal_error("EH registrations only live in functions using WinEH");
27335
27336 // Cast the operand to an alloca, and remember the frame index.
27337 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27338 if (!FINode)
27339 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27340 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27341
27342 // Return the chain operand without making any DAG nodes.
27343 return Chain;
27344}
27345
27346static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27347 MachineFunction &MF = DAG.getMachineFunction();
27348 SDValue Chain = Op.getOperand(0);
27349 SDValue EHGuard = Op.getOperand(2);
27350 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27351 if (!EHInfo)
27352 report_fatal_error("EHGuard only live in functions using WinEH");
27353
27354 // Cast the operand to an alloca, and remember the frame index.
27355 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27356 if (!FINode)
27357 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27358 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27359
27360 // Return the chain operand without making any DAG nodes.
27361 return Chain;
27362}
27363
27364/// Emit Truncating Store with signed or unsigned saturation.
27365static SDValue
27366EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
27367 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27368 SelectionDAG &DAG) {
27369 SDVTList VTs = DAG.getVTList(MVT::Other);
27370 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27371 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27372 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27373 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27374}
27375
27376/// Emit Masked Truncating Store with signed or unsigned saturation.
27377static SDValue
27378EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
27379 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27380 MachineMemOperand *MMO, SelectionDAG &DAG) {
27381 SDVTList VTs = DAG.getVTList(MVT::Other);
27382 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27383 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27384 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27385}
27386
27387static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27388 SelectionDAG &DAG) {
27389 unsigned IntNo = Op.getConstantOperandVal(1);
27390 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27391 if (!IntrData) {
27392 switch (IntNo) {
27393 case llvm::Intrinsic::x86_seh_ehregnode:
27394 return MarkEHRegistrationNode(Op, DAG);
27395 case llvm::Intrinsic::x86_seh_ehguard:
27396 return MarkEHGuard(Op, DAG);
27397 case llvm::Intrinsic::x86_rdpkru: {
27398 SDLoc dl(Op);
27399 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27400 // Create a RDPKRU node and pass 0 to the ECX parameter.
27401 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27402 DAG.getConstant(0, dl, MVT::i32));
27403 }
27404 case llvm::Intrinsic::x86_wrpkru: {
27405 SDLoc dl(Op);
27406 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27407 // to the EDX and ECX parameters.
27408 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27409 Op.getOperand(0), Op.getOperand(2),
27410 DAG.getConstant(0, dl, MVT::i32),
27411 DAG.getConstant(0, dl, MVT::i32));
27412 }
27413 case llvm::Intrinsic::asan_check_memaccess: {
27414 // Mark this as adjustsStack because it will be lowered to a call.
27415 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27416 // Don't do anything here, we will expand these intrinsics out later.
27417 return Op;
27418 }
27419 case llvm::Intrinsic::x86_flags_read_u32:
27420 case llvm::Intrinsic::x86_flags_read_u64:
27421 case llvm::Intrinsic::x86_flags_write_u32:
27422 case llvm::Intrinsic::x86_flags_write_u64: {
27423 // We need a frame pointer because this will get lowered to a PUSH/POP
27424 // sequence.
27425 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27426 MFI.setHasCopyImplyingStackAdjustment(true);
27427 // Don't do anything here, we will expand these intrinsics out later
27428 // during FinalizeISel in EmitInstrWithCustomInserter.
27429 return Op;
27430 }
27431 case Intrinsic::x86_lwpins32:
27432 case Intrinsic::x86_lwpins64:
27433 case Intrinsic::x86_umwait:
27434 case Intrinsic::x86_tpause: {
27435 SDLoc dl(Op);
27436 SDValue Chain = Op->getOperand(0);
27437 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27438 unsigned Opcode;
27439
27440 switch (IntNo) {
27441       default: llvm_unreachable("Impossible intrinsic");
27442 case Intrinsic::x86_umwait:
27443 Opcode = X86ISD::UMWAIT;
27444 break;
27445 case Intrinsic::x86_tpause:
27446 Opcode = X86ISD::TPAUSE;
27447 break;
27448 case Intrinsic::x86_lwpins32:
27449 case Intrinsic::x86_lwpins64:
27450 Opcode = X86ISD::LWPINS;
27451 break;
27452 }
27453
27454 SDValue Operation =
27455 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27456 Op->getOperand(3), Op->getOperand(4));
27457 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27458 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27459 Operation.getValue(1));
27460 }
27461 case Intrinsic::x86_enqcmd:
27462 case Intrinsic::x86_enqcmds: {
27463 SDLoc dl(Op);
27464 SDValue Chain = Op.getOperand(0);
27465 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27466 unsigned Opcode;
27467 switch (IntNo) {
27468       default: llvm_unreachable("Impossible intrinsic!");
27469 case Intrinsic::x86_enqcmd:
27470 Opcode = X86ISD::ENQCMD;
27471 break;
27472 case Intrinsic::x86_enqcmds:
27473 Opcode = X86ISD::ENQCMDS;
27474 break;
27475 }
27476 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27477 Op.getOperand(3));
27478 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27479 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27480 Operation.getValue(1));
27481 }
27482 case Intrinsic::x86_aesenc128kl:
27483 case Intrinsic::x86_aesdec128kl:
27484 case Intrinsic::x86_aesenc256kl:
27485 case Intrinsic::x86_aesdec256kl: {
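      // These Key Locker intrinsics are memory intrinsics: the key handle is
      // referenced through the memory operand. The node returns the
      // transformed data block (v2i64), an i32 flags value, and the chain;
      // the ZF-based status bit (COND_E) is surfaced first in the merged
      // result, followed by the data and the chain.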
27486 SDLoc DL(Op);
27487 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27488 SDValue Chain = Op.getOperand(0);
27489 unsigned Opcode;
27490
27491 switch (IntNo) {
27492       default: llvm_unreachable("Impossible intrinsic");
27493 case Intrinsic::x86_aesenc128kl:
27494 Opcode = X86ISD::AESENC128KL;
27495 break;
27496 case Intrinsic::x86_aesdec128kl:
27497 Opcode = X86ISD::AESDEC128KL;
27498 break;
27499 case Intrinsic::x86_aesenc256kl:
27500 Opcode = X86ISD::AESENC256KL;
27501 break;
27502 case Intrinsic::x86_aesdec256kl:
27503 Opcode = X86ISD::AESDEC256KL;
27504 break;
27505 }
27506
27507 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27508 MachineMemOperand *MMO = MemIntr->getMemOperand();
27509 EVT MemVT = MemIntr->getMemoryVT();
27510 SDValue Operation = DAG.getMemIntrinsicNode(
27511 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27512 MMO);
27513 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27514
27515 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27516 {ZF, Operation.getValue(0), Operation.getValue(2)});
27517 }
27518 case Intrinsic::x86_aesencwide128kl:
27519 case Intrinsic::x86_aesdecwide128kl:
27520 case Intrinsic::x86_aesencwide256kl:
27521 case Intrinsic::x86_aesdecwide256kl: {
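      // Wide variant of the Key Locker lowering above: eight v2i64 data
      // blocks go in and come back as results 1-8 of the node, with the
      // flags in result 0 and the chain last. As before, the ZF-based status
      // bit is returned first, then the eight transformed blocks, then the
      // chain.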
27522 SDLoc DL(Op);
27523 SDVTList VTs = DAG.getVTList(
27524 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27525 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27526 SDValue Chain = Op.getOperand(0);
27527 unsigned Opcode;
27528
27529 switch (IntNo) {
27530       default: llvm_unreachable("Impossible intrinsic");
27531 case Intrinsic::x86_aesencwide128kl:
27532 Opcode = X86ISD::AESENCWIDE128KL;
27533 break;
27534 case Intrinsic::x86_aesdecwide128kl:
27535 Opcode = X86ISD::AESDECWIDE128KL;
27536 break;
27537 case Intrinsic::x86_aesencwide256kl:
27538 Opcode = X86ISD::AESENCWIDE256KL;
27539 break;
27540 case Intrinsic::x86_aesdecwide256kl:
27541 Opcode = X86ISD::AESDECWIDE256KL;
27542 break;
27543 }
27544
27545 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27546 MachineMemOperand *MMO = MemIntr->getMemOperand();
27547 EVT MemVT = MemIntr->getMemoryVT();
27548 SDValue Operation = DAG.getMemIntrinsicNode(
27549 Opcode, DL, VTs,
27550 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27551 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27552 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27553 MemVT, MMO);
27554 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27555
27556 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27557 {ZF, Operation.getValue(1), Operation.getValue(2),
27558 Operation.getValue(3), Operation.getValue(4),
27559 Operation.getValue(5), Operation.getValue(6),
27560 Operation.getValue(7), Operation.getValue(8),
27561 Operation.getValue(9)});
27562 }
27563 case Intrinsic::x86_testui: {
27564 SDLoc dl(Op);
27565 SDValue Chain = Op.getOperand(0);
27566 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27567 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27568 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27569 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27570 Operation.getValue(1));
27571 }
27572 case Intrinsic::x86_atomic_bts:
27573 case Intrinsic::x86_atomic_btc:
27574 case Intrinsic::x86_atomic_btr: {
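      // The locked bit-test nodes (LBTS/LBTC/LBTR) report the original value
      // of the tested bit in CF. SETB turns that into a 0/1 value, which is
      // widened to the operation type and shifted left by the bit index so
      // the returned value has the bit in its original position.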
27575 SDLoc DL(Op);
27576 MVT VT = Op.getSimpleValueType();
27577 SDValue Chain = Op.getOperand(0);
27578 SDValue Op1 = Op.getOperand(2);
27579 SDValue Op2 = Op.getOperand(3);
27580 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27581 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27582 : X86ISD::LBTR;
27583 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27584 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27585 SDValue Res =
27586 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27587 {Chain, Op1, Op2, Size}, VT, MMO);
27588 Chain = Res.getValue(1);
27589 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27590 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
27591 if (Imm)
27592 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27593 DAG.getShiftAmountConstant(Imm, VT, DL));
27594 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27595 }
27596 }
27597 return SDValue();
27598 }
27599
27600 SDLoc dl(Op);
27601 switch(IntrData->Type) {
27602   default: llvm_unreachable("Unknown Intrinsic Type");
27603 case RDSEED:
27604 case RDRAND: {
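    // The RDRAND/RDSEED node produces three results: the random value, an
    // i32 flags value, and the chain. The flags result (value #1) drives the
    // CMOV below, and the chain (value #2) becomes the third merged value.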
27605 // Emit the node with the right value type.
27606 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27607 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27608
27609 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27610    // Otherwise return the value from Rand, which is always 0, cast to i32.
27611 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27612 DAG.getConstant(1, dl, Op->getValueType(1)),
27613 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27614 SDValue(Result.getNode(), 1)};
27615 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27616
27617 // Return { result, isValid, chain }.
27618 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27619 SDValue(Result.getNode(), 2));
27620 }
27621 case GATHER_AVX2: {
27622 SDValue Chain = Op.getOperand(0);
27623 SDValue Src = Op.getOperand(2);
27624 SDValue Base = Op.getOperand(3);
27625 SDValue Index = Op.getOperand(4);
27626 SDValue Mask = Op.getOperand(5);
27627 SDValue Scale = Op.getOperand(6);
27628 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27629 Scale, Chain, Subtarget);
27630 }
27631 case GATHER: {
27632 //gather(v1, mask, index, base, scale);
27633 SDValue Chain = Op.getOperand(0);
27634 SDValue Src = Op.getOperand(2);
27635 SDValue Base = Op.getOperand(3);
27636 SDValue Index = Op.getOperand(4);
27637 SDValue Mask = Op.getOperand(5);
27638 SDValue Scale = Op.getOperand(6);
27639 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27640 Chain, Subtarget);
27641 }
27642 case SCATTER: {
27643 //scatter(base, mask, index, v1, scale);
27644 SDValue Chain = Op.getOperand(0);
27645 SDValue Base = Op.getOperand(2);
27646 SDValue Mask = Op.getOperand(3);
27647 SDValue Index = Op.getOperand(4);
27648 SDValue Src = Op.getOperand(5);
27649 SDValue Scale = Op.getOperand(6);
27650 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27651 Scale, Chain, Subtarget);
27652 }
27653 case PREFETCH: {
27654 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27655     assert((HintVal == 2 || HintVal == 3) &&
27656            "Wrong prefetch hint in intrinsic: should be 2 or 3");
27657 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27658 SDValue Chain = Op.getOperand(0);
27659 SDValue Mask = Op.getOperand(2);
27660 SDValue Index = Op.getOperand(3);
27661 SDValue Base = Op.getOperand(4);
27662 SDValue Scale = Op.getOperand(5);
27663 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27664 Subtarget);
27665 }
27666 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27667 case RDTSC: {
27668 SmallVector<SDValue, 2> Results;
27669 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27670 Results);
27671 return DAG.getMergeValues(Results, dl);
27672 }
27673 // Read Performance Monitoring Counters.
27674 case RDPMC:
27675 // GetExtended Control Register.
27676 case XGETBV: {
27677 SmallVector<SDValue, 2> Results;
27678
27679 // RDPMC uses ECX to select the index of the performance counter to read.
27680 // XGETBV uses ECX to select the index of the XCR register to return.
27681 // The result is stored into registers EDX:EAX.
27682 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27683 Subtarget, Results);
27684 return DAG.getMergeValues(Results, dl);
27685 }
27686 // XTEST intrinsics.
27687 case XTEST: {
27688 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27689 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27690
27691 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27692 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27693 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27694 Ret, SDValue(InTrans.getNode(), 1));
27695 }
27696 case TRUNCATE_TO_MEM_VI8:
27697 case TRUNCATE_TO_MEM_VI16:
27698 case TRUNCATE_TO_MEM_VI32: {
27699 SDValue Mask = Op.getOperand(4);
27700 SDValue DataToTruncate = Op.getOperand(3);
27701 SDValue Addr = Op.getOperand(2);
27702 SDValue Chain = Op.getOperand(0);
27703
27704 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27705     assert(MemIntr && "Expected MemIntrinsicSDNode!");
27706
27707 EVT MemVT = MemIntr->getMemoryVT();
27708
27709 uint16_t TruncationOp = IntrData->Opc0;
27710 switch (TruncationOp) {
27711 case X86ISD::VTRUNC: {
27712 if (isAllOnesConstant(Mask)) // return just a truncate store
27713 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27714 MemIntr->getMemOperand());
27715
27716 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27717 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27718 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27719
27720 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27721 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27722 true /* truncating */);
27723 }
27724 case X86ISD::VTRUNCUS:
27725 case X86ISD::VTRUNCS: {
27726 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27727 if (isAllOnesConstant(Mask))
27728 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27729 MemIntr->getMemOperand(), DAG);
27730
27731 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27732 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27733
27734 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27735 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27736 }
27737 default:
27738       llvm_unreachable("Unsupported truncstore intrinsic");
27739 }
27740 }
27741 }
27742}
27743
27744SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27745 SelectionDAG &DAG) const {
27746 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27747 MFI.setReturnAddressIsTaken(true);
27748
27749 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27750 return SDValue();
27751
27752 unsigned Depth = Op.getConstantOperandVal(0);
27753 SDLoc dl(Op);
27754 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27755
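  // For a nonzero depth, walk up via LowerFRAMEADDR and read the return
  // address stored just above the saved frame pointer, i.e. at
  // FrameAddr + SlotSize.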
27756 if (Depth > 0) {
27757 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27758 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27759 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27760 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27761 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27762 MachinePointerInfo());
27763 }
27764
27765 // Just load the return address.
27766 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27767 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27768 MachinePointerInfo());
27769}
27770
27771SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27772 SelectionDAG &DAG) const {
27773 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27774 return getReturnAddressFrameIndex(DAG);
27775}
27776
27777SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27778 MachineFunction &MF = DAG.getMachineFunction();
27779 MachineFrameInfo &MFI = MF.getFrameInfo();
27780 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27781 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27782 EVT VT = Op.getValueType();
27783
27784 MFI.setFrameAddressIsTaken(true);
27785
27786 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27787 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27788 // is not possible to crawl up the stack without looking at the unwind codes
27789 // simultaneously.
27790 int FrameAddrIndex = FuncInfo->getFAIndex();
27791 if (!FrameAddrIndex) {
27792 // Set up a frame object for the return address.
27793 unsigned SlotSize = RegInfo->getSlotSize();
27794 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27795 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27796 FuncInfo->setFAIndex(FrameAddrIndex);
27797 }
27798 return DAG.getFrameIndex(FrameAddrIndex, VT);
27799 }
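  // Non-Windows path: start from the frame-pointer register (RBP/EBP) and,
  // for each requested level of depth, load the caller's saved frame pointer
  // through the current one.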
27800
27801 unsigned FrameReg =
27802 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27803 SDLoc dl(Op); // FIXME probably not meaningful
27804 unsigned Depth = Op.getConstantOperandVal(0);
27805   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27806           (FrameReg == X86::EBP && VT == MVT::i32)) &&
27807          "Invalid Frame Register!");
27808 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27809 while (Depth--)
27810 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27811 MachinePointerInfo());
27812 return FrameAddr;
27813}
27814
27815// FIXME? Maybe this could be a TableGen attribute on some registers and
27816// this table could be generated automatically from RegInfo.
27817Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27818 const MachineFunction &MF) const {
27819 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27820
27821 Register Reg = StringSwitch<unsigned>(RegName)
27822 .Case("esp", X86::ESP)
27823 .Case("rsp", X86::RSP)
27824 .Case("ebp", X86::EBP)
27825 .Case("rbp", X86::RBP)
27826 .Default(0);
27827
27828 if (Reg == X86::EBP || Reg == X86::RBP) {
27829 if (!TFI.hasFP(MF))
27830 report_fatal_error("register " + StringRef(RegName) +
27831 " is allocatable: function has no frame pointer");
27832#ifndef NDEBUG
27833 else {
27834 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27835 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27836       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27837              "Invalid Frame Register!");
27838 }
27839#endif
27840 }
27841
27842 if (Reg)
27843 return Reg;
27844
27845 report_fatal_error("Invalid register name global variable");
27846}
27847
27848SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27849 SelectionDAG &DAG) const {
27850 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27851 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27852}
27853
27854Register X86TargetLowering::getExceptionPointerRegister(
27855 const Constant *PersonalityFn) const {
27856 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27857 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27858
27859 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27860}
27861
27862Register X86TargetLowering::getExceptionSelectorRegister(
27863 const Constant *PersonalityFn) const {
27864 // Funclet personalities don't use selectors (the runtime does the selection).
27865 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27866 return X86::NoRegister;
27867 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27868}
27869
27870bool X86TargetLowering::needsFixedCatchObjects() const {
27871 return Subtarget.isTargetWin64();
27872}
27873
27874SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27875 SDValue Chain = Op.getOperand(0);
27876 SDValue Offset = Op.getOperand(1);
27877 SDValue Handler = Op.getOperand(2);
27878 SDLoc dl (Op);
27879
27880 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27881 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27882 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27883   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27884           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27885          "Invalid Frame Register!");
27886 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27887 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27888
27889 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27890 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27891 dl));
27892 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27893 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27894 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27895
27896 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27897 DAG.getRegister(StoreAddrReg, PtrVT));
27898}
27899
27900SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27901 SelectionDAG &DAG) const {
27902 SDLoc DL(Op);
27903  // If the subtarget is not 64-bit, we may need the global base register
27904  // after ISel expands the pseudo-instruction, i.e., after the CGBR pass runs.
27905  // Therefore, ask for the GlobalBaseReg now, so that the pass
27906  // inserts the code for us in case we need it.
27907  // Otherwise, we would end up referencing a virtual register
27908  // that is not defined!
27909 if (!Subtarget.is64Bit()) {
27910 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27911 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27912 }
27913 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27914 DAG.getVTList(MVT::i32, MVT::Other),
27915 Op.getOperand(0), Op.getOperand(1));
27916}
27917
27918SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27919 SelectionDAG &DAG) const {
27920 SDLoc DL(Op);
27921 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27922 Op.getOperand(0), Op.getOperand(1));
27923}
27924
27925SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27926 SelectionDAG &DAG) const {
27927 SDLoc DL(Op);
27928 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27929 Op.getOperand(0));
27930}
27931
27932static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27933 return Op.getOperand(0);
27934}
27935
27936SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27937 SelectionDAG &DAG) const {
27938 SDValue Root = Op.getOperand(0);
27939 SDValue Trmp = Op.getOperand(1); // trampoline
27940 SDValue FPtr = Op.getOperand(2); // nested function
27941 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27942 SDLoc dl (Op);
27943
27944 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27945 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27946
27947 if (Subtarget.is64Bit()) {
27948 SDValue OutChains[6];
27949
27950 // Large code-model.
27951 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27952 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27953
27954 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27955 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27956
27957 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27958
27959 // Load the pointer to the nested function into R11.
27960 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27961 SDValue Addr = Trmp;
27962 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27963 Addr, MachinePointerInfo(TrmpAddr));
27964
27965 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27966 DAG.getConstant(2, dl, MVT::i64));
27967 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27968 MachinePointerInfo(TrmpAddr, 2), Align(2));
27969
27970 // Load the 'nest' parameter value into R10.
27971 // R10 is specified in X86CallingConv.td
27972 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27973 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27974 DAG.getConstant(10, dl, MVT::i64));
27975 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27976 Addr, MachinePointerInfo(TrmpAddr, 10));
27977
27978 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27979 DAG.getConstant(12, dl, MVT::i64));
27980 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27981 MachinePointerInfo(TrmpAddr, 12), Align(2));
27982
27983 // Jump to the nested function.
27984 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27985 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27986 DAG.getConstant(20, dl, MVT::i64));
27987 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27988 Addr, MachinePointerInfo(TrmpAddr, 20));
27989
27990 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27991 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27992 DAG.getConstant(22, dl, MVT::i64));
27993 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27994 Addr, MachinePointerInfo(TrmpAddr, 22));
27995
27996 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27997 } else {
27998 const Function *Func =
27999 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28000 CallingConv::ID CC = Func->getCallingConv();
28001 unsigned NestReg;
28002
28003 switch (CC) {
28004 default:
28005 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28005)
;
28006 case CallingConv::C:
28007 case CallingConv::X86_StdCall: {
28008 // Pass 'nest' parameter in ECX.
28009 // Must be kept in sync with X86CallingConv.td
28010 NestReg = X86::ECX;
28011
28012 // Check that ECX wasn't needed by an 'inreg' parameter.
28013 FunctionType *FTy = Func->getFunctionType();
28014 const AttributeList &Attrs = Func->getAttributes();
28015
28016 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28017 unsigned InRegCount = 0;
28018 unsigned Idx = 0;
28019
28020 for (FunctionType::param_iterator I = FTy->param_begin(),
28021 E = FTy->param_end(); I != E; ++I, ++Idx)
28022 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28023 const DataLayout &DL = DAG.getDataLayout();
28024 // FIXME: should only count parameters that are lowered to integers.
28025 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28026 }
28027
28028 if (InRegCount > 2) {
28029 report_fatal_error("Nest register in use - reduce number of inreg"
28030 " parameters!");
28031 }
28032 }
28033 break;
28034 }
28035 case CallingConv::X86_FastCall:
28036 case CallingConv::X86_ThisCall:
28037 case CallingConv::Fast:
28038 case CallingConv::Tail:
28039 case CallingConv::SwiftTail:
28040 // Pass 'nest' parameter in EAX.
28041 // Must be kept in sync with X86CallingConv.td
28042 NestReg = X86::EAX;
28043 break;
28044 }
28045
28046 SDValue OutChains[4];
28047 SDValue Addr, Disp;
28048
28049 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28050 DAG.getConstant(10, dl, MVT::i32));
28051 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28052
28053 // This is storing the opcode for MOV32ri.
28054 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28055 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28056 OutChains[0] =
28057 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28058 Trmp, MachinePointerInfo(TrmpAddr));
28059
28060 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28061 DAG.getConstant(1, dl, MVT::i32));
28062 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28063 MachinePointerInfo(TrmpAddr, 1), Align(1));
28064
28065 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28066 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28067 DAG.getConstant(5, dl, MVT::i32));
28068 OutChains[2] =
28069 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28070 MachinePointerInfo(TrmpAddr, 5), Align(1));
28071
28072 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28073 DAG.getConstant(6, dl, MVT::i32));
28074 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28075 MachinePointerInfo(TrmpAddr, 6), Align(1));
28076
28077 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28078 }
28079}
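// Illustrative aside (not part of the original file): the byte sequence the
// 64-bit branch above writes into the trampoline, assuming R11/R10 encode as
// 3/2 and little-endian stores; fixed-width types come from the file's
// existing includes. The helper name below is hypothetical.
static void writeX86_64TrampolineSketch(uint8_t *Trmp, uint64_t FPtr,
                                        uint64_t Nest) {
  Trmp[0] = 0x49; Trmp[1] = 0xBB;                    // movabsq $FPtr, %r11
  for (int i = 0; i < 8; ++i) Trmp[2 + i] = (uint8_t)(FPtr >> (8 * i));
  Trmp[10] = 0x49; Trmp[11] = 0xBA;                  // movabsq $Nest, %r10
  for (int i = 0; i < 8; ++i) Trmp[12 + i] = (uint8_t)(Nest >> (8 * i));
  Trmp[20] = 0x49; Trmp[21] = 0xFF; Trmp[22] = 0xE3; // jmpq *%r11
}
// The 32-bit branch instead emits "movl $Nest, %ecx/%eax; jmp rel32", where
// rel32 = FPtr - (Trmp + 10), i.e. relative to the end of the 5-byte jmp that
// starts at trampoline offset 5.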
28080
28081SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
28082 SelectionDAG &DAG) const {
28083 /*
28084 The rounding mode is in bits 11:10 of FPSR, and has the following
28085 settings:
28086 00 Round to nearest
28087 01 Round to -inf
28088 10 Round to +inf
28089 11 Round to 0
28090
28091 FLT_ROUNDS, on the other hand, expects the following:
28092 -1 Undefined
28093 0 Round to 0
28094 1 Round to nearest
28095 2 Round to +inf
28096 3 Round to -inf
28097
28098 To perform the conversion, we use a packed lookup table of the four 2-bit
28099 values that we can index by FPSR[11:10]
28100 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28101
28102 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28103 */
28104
28105 MachineFunction &MF = DAG.getMachineFunction();
28106 MVT VT = Op.getSimpleValueType();
28107 SDLoc DL(Op);
28108
28109 // Save FP Control Word to stack slot
28110 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28111 SDValue StackSlot =
28112 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28113
28114 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28115
28116 SDValue Chain = Op.getOperand(0);
28117 SDValue Ops[] = {Chain, StackSlot};
28118 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28119 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28120 Align(2), MachineMemOperand::MOStore);
28121
28122 // Load FP Control Word from stack slot
28123 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28124 Chain = CWD.getValue(1);
28125
28126 // Mask and turn the control bits into a shift for the lookup table.
28127 SDValue Shift =
28128 DAG.getNode(ISD::SRL, DL, MVT::i16,
28129 DAG.getNode(ISD::AND, DL, MVT::i16,
28130 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28131 DAG.getConstant(9, DL, MVT::i8));
28132 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28133
28134 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28135 SDValue RetVal =
28136 DAG.getNode(ISD::AND, DL, MVT::i32,
28137 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28138 DAG.getConstant(3, DL, MVT::i32));
28139
28140 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28141
28142 return DAG.getMergeValues({RetVal, Chain}, DL);
28143}
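// Illustrative aside (not part of the original file): a minimal scalar sketch
// of the table lookup above, assuming the fixed-width types already included
// by this file. The four x87 RM encodings map onto FLT_ROUNDS values exactly
// as the comment block describes.
static constexpr int fltRoundsFromFPCW(uint16_t CWD) {
  return (0x2d >> ((CWD & 0xc00) >> 9)) & 3;
}
static_assert(fltRoundsFromFPCW(0x0000) == 1, "RM=00 (nearest)     -> 1");
static_assert(fltRoundsFromFPCW(0x0400) == 3, "RM=01 (toward -inf) -> 3");
static_assert(fltRoundsFromFPCW(0x0800) == 2, "RM=10 (toward +inf) -> 2");
static_assert(fltRoundsFromFPCW(0x0c00) == 0, "RM=11 (toward zero) -> 0");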
28144
28145SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28146 SelectionDAG &DAG) const {
28147 MachineFunction &MF = DAG.getMachineFunction();
28148 SDLoc DL(Op);
28149 SDValue Chain = Op.getNode()->getOperand(0);
28150
28151 // The FP control word may be set only from data in memory, so we need to
28152 // allocate stack space to save/load the FP control word.
28153 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28154 SDValue StackSlot =
28155 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28156 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28157 MachineMemOperand *MMO =
28158 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28159
28160 // Store FP control word into memory.
28161 SDValue Ops[] = {Chain, StackSlot};
28162 Chain = DAG.getMemIntrinsicNode(
28163 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28164
28165 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28166 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28167 Chain = CWD.getValue(1);
28168 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28169 DAG.getConstant(0xf3ff, DL, MVT::i16));
28170
28171 // Calculate new rounding mode.
28172 SDValue NewRM = Op.getNode()->getOperand(1);
28173 SDValue RMBits;
28174 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28175 uint64_t RM = CVal->getZExtValue();
28176 int FieldVal;
28177 switch (static_cast<RoundingMode>(RM)) {
28178 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28179 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28180 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28181 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28182 default:
28183 llvm_unreachable("rounding mode is not supported by X86 hardware")::llvm::llvm_unreachable_internal("rounding mode is not supported by X86 hardware"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28183)
;
28184 }
28185 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28186 } else {
28187 // Need to convert argument into bits of control word:
28188 // 0 Round to 0 -> 11
28189 // 1 Round to nearest -> 00
28190 // 2 Round to +inf -> 10
28191 // 3 Round to -inf -> 01
28192 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28193 // To make the conversion, pack all these values into the constant 0xc9 and shift
28194 // it left depending on the rounding mode:
28195 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28196 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28197 // ...
28198 // (0xc9 << (2 * NewRM + 4)) & 0xc00
28199 SDValue ShiftValue =
28200 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28201 DAG.getNode(ISD::ADD, DL, MVT::i32,
28202 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28203 DAG.getConstant(1, DL, MVT::i8)),
28204 DAG.getConstant(4, DL, MVT::i32)));
28205 SDValue Shifted =
28206 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28207 ShiftValue);
28208 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28209 DAG.getConstant(0xc00, DL, MVT::i16));
28210 }
28211
28212 // Update rounding mode bits and store the new FP Control Word into stack.
28213 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28214 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
28215
28216 // Load FP control word from the slot.
28217 SDValue OpsLD[] = {Chain, StackSlot};
28218 MachineMemOperand *MMOL =
28219 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28220 Chain = DAG.getMemIntrinsicNode(
28221 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28222
28223 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28224 // same way but in bits 14:13.
28225 if (Subtarget.hasSSE1()) {
28226 // Store MXCSR into memory.
28227 Chain = DAG.getNode(
28228 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28229 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28230 StackSlot);
28231
28232 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28233 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28234 Chain = CWD.getValue(1);
28235 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28236 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28237
28238 // Shift X87 RM bits from 11:10 to 14:13.
28239 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28240 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28241 DAG.getConstant(3, DL, MVT::i8));
28242
28243 // Update rounding mode bits and store the new FP Control Word into stack.
28244 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28245 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
28246
28247 // Load MXCSR from the slot.
28248 Chain = DAG.getNode(
28249 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28250 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28251 StackSlot);
28252 }
28253
28254 return Chain;
28255}
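// Illustrative aside (not part of the original file): a scalar sketch of the
// inverse mapping used by the non-constant path above,
// (0xc9 << (2 * RM + 4)) & 0xc00, assuming the file's fixed-width types. For
// SSE the same two bits are shifted left by 3 into MXCSR bits 14:13, as done
// at the end of the function.
static constexpr uint16_t x87RMBitsFromFltRounds(unsigned RM) {
  return (uint16_t)((0xc9 << (2 * RM + 4)) & 0xc00);
}
static_assert(x87RMBitsFromFltRounds(0) == 0xc00, "0 (toward zero) -> RM=11");
static_assert(x87RMBitsFromFltRounds(1) == 0x000, "1 (nearest)     -> RM=00");
static_assert(x87RMBitsFromFltRounds(2) == 0x800, "2 (toward +inf) -> RM=10");
static_assert(x87RMBitsFromFltRounds(3) == 0x400, "3 (toward -inf) -> RM=01");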
28256
28257/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28258//
28259// i8/i16 vector implemented using dword LZCNT vector instruction
28260// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28261// split the vector, perform the operation on its Lo and Hi parts and
28262// concatenate the results.
28263static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28264 const X86Subtarget &Subtarget) {
28265 assert(Op.getOpcode() == ISD::CTLZ);
28266 SDLoc dl(Op);
28267 MVT VT = Op.getSimpleValueType();
28268 MVT EltVT = VT.getVectorElementType();
28269 unsigned NumElems = VT.getVectorNumElements();
28270
28271 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28272 "Unsupported element type");
28273
28274 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28275 if (NumElems > 16 ||
28276 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28277 return splitVectorIntUnary(Op, DAG);
28278
28279 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28280 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28281 "Unsupported value type for operation");
28282
28283 // Use native supported vector instruction vplzcntd.
28284 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28285 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28286 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28287 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28288
28289 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28290}
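// Illustrative aside (not part of the original file): what the zext+lzcnt+sub
// sequence above computes for a single i8 element, written as scalar C++ with
// the clang builtin __builtin_clz standing in for VPLZCNTD.
static uint8_t ctlz8ViaLzcnt32Sketch(uint8_t X) {
  // VPLZCNTD counts leading zeros of the 32-bit zero-extension and yields 32
  // for a zero input; subtract the 32 - 8 = 24 zero bits added by the zext.
  unsigned Lz32 = X ? (unsigned)__builtin_clz((uint32_t)X) : 32u;
  return (uint8_t)(Lz32 - (32 - 8));
}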
28291
28292// Lower CTLZ using a PSHUFB lookup table implementation.
28293static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28294 const X86Subtarget &Subtarget,
28295 SelectionDAG &DAG) {
28296 MVT VT = Op.getSimpleValueType();
28297 int NumElts = VT.getVectorNumElements();
28298 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28299 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28300
28301 // Per-nibble leading zero PSHUFB lookup table.
28302 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28303 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28304 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28305 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28306
28307 SmallVector<SDValue, 64> LUTVec;
28308 for (int i = 0; i < NumBytes; ++i)
28309 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28310 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28311
28312 // Begin by bitcasting the input to byte vector, then split those bytes
28313 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28314 // If the hi input nibble is zero then we add both results together, otherwise
28315 // we just take the hi result (by masking the lo result to zero before the
28316 // add).
28317 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28318 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28319
28320 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28321 SDValue Lo = Op0;
28322 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28323 SDValue HiZ;
28324 if (CurrVT.is512BitVector()) {
28325 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28326 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28327 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28328 } else {
28329 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28330 }
28331
28332 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28333 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28334 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28335 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28336
28337 // Merge result back from vXi8 back to VT, working on the lo/hi halves
28338 // of the current vector width in the same way we did for the nibbles.
28339 // If the upper half of the input element is zero then add the halves'
28340 // leading zero counts together, otherwise just use the upper half's.
28341 // Double the width of the result until we are at target width.
28342 while (CurrVT != VT) {
28343 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28344 int CurrNumElts = CurrVT.getVectorNumElements();
28345 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28346 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28347 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28348
28349 // Check if the upper half of the input element is zero.
28350 if (CurrVT.is512BitVector()) {
28351 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28352 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28353 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28354 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28355 } else {
28356 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28357 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28358 }
28359 HiZ = DAG.getBitcast(NextVT, HiZ);
28360
28361 // Move the upper/lower halves to the lower bits as we'll be extending to
28362 // NextVT. Mask the lower result to zero if HiZ is true and add the results
28363 // together.
28364 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28365 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28366 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28367 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28368 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28369 CurrVT = NextVT;
28370 }
28371
28372 return Res;
28373}
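// Illustrative aside (not part of the original file): the per-nibble PSHUFB
// lookup above, reduced to one byte of scalar C++ (assumes the file's
// fixed-width types).
static uint8_t ctlz8ViaNibbleLUTSketch(uint8_t X) {
  static const uint8_t LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                  0, 0, 0, 0, 0, 0, 0, 0};
  uint8_t Lo = X & 0xf;
  uint8_t Hi = X >> 4;
  // Only add the low-nibble count when the high nibble is all zero, exactly
  // like the HiZ mask in the vector code above.
  return Hi ? LUT[Hi] : (uint8_t)(4 + LUT[Lo]);
}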
28374
28375static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28376 const X86Subtarget &Subtarget,
28377 SelectionDAG &DAG) {
28378 MVT VT = Op.getSimpleValueType();
28379
28380 if (Subtarget.hasCDI() &&
28381 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28382 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28383 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28384
28385 // Decompose 256-bit ops into smaller 128-bit ops.
28386 if (VT.is256BitVector() && !Subtarget.hasInt256())
28387 return splitVectorIntUnary(Op, DAG);
28388
28389 // Decompose 512-bit ops into smaller 256-bit ops.
28390 if (VT.is512BitVector() && !Subtarget.hasBWI())
28391 return splitVectorIntUnary(Op, DAG);
28392
28393 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28394 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28395}
28396
28397static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28398 SelectionDAG &DAG) {
28399 MVT VT = Op.getSimpleValueType();
28400 MVT OpVT = VT;
28401 unsigned NumBits = VT.getSizeInBits();
28402 SDLoc dl(Op);
28403 unsigned Opc = Op.getOpcode();
28404
28405 if (VT.isVector())
28406 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28407
28408 Op = Op.getOperand(0);
28409 if (VT == MVT::i8) {
28410 // Zero extend to i32 since there is not an i8 bsr.
28411 OpVT = MVT::i32;
28412 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28413 }
28414
28415 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28416 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28417 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28418
28419 if (Opc == ISD::CTLZ) {
28420 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28421 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28422 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28423 Op.getValue(1)};
28424 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28425 }
28426
28427 // Finally xor with NumBits-1.
28428 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28429 DAG.getConstant(NumBits - 1, dl, OpVT));
28430
28431 if (VT == MVT::i8)
28432 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28433 return Op;
28434}
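// Illustrative aside (not part of the original file): the scalar BSR-based
// lowering above in plain C++ (__builtin_clz stands in for BSR). BSR returns
// the index of the highest set bit, so CTLZ is that index XORed with
// NumBits-1; the CMOV substitutes 2*NumBits-1 for a zero input so the same
// XOR yields NumBits.
static unsigned ctlz32ViaBSRSketch(uint32_t X) {
  const unsigned NumBits = 32;
  unsigned Bsr = X ? (31u - (unsigned)__builtin_clz(X)) // bit index from BSR
                   : (NumBits + NumBits - 1);           // CMOV taken on ZF
  return Bsr ^ (NumBits - 1);
}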
28435
28436static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28437 SelectionDAG &DAG) {
28438 MVT VT = Op.getSimpleValueType();
28439 unsigned NumBits = VT.getScalarSizeInBits();
28440 SDValue N0 = Op.getOperand(0);
28441 SDLoc dl(Op);
28442
28443 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28444 "Only scalar CTTZ requires custom lowering");
28445
28446 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28447 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28448 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28449
28450 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28451 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28452 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28453 Op.getValue(1)};
28454 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28455}
28456
28457static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28458 const X86Subtarget &Subtarget) {
28459 MVT VT = Op.getSimpleValueType();
28460 if (VT == MVT::i16 || VT == MVT::i32)
28461 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
28462
28463 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28464 return splitVectorIntBinary(Op, DAG);
28465
28466 assert(Op.getSimpleValueType().is256BitVector() &&
28467 Op.getSimpleValueType().isInteger() &&
28468 "Only handle AVX 256-bit vector integer operation");
28469 return splitVectorIntBinary(Op, DAG);
28470}
28471
28472static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28473 const X86Subtarget &Subtarget) {
28474 MVT VT = Op.getSimpleValueType();
28475 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28476 unsigned Opcode = Op.getOpcode();
28477 SDLoc DL(Op);
28478
28479 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28480 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28481 assert(Op.getSimpleValueType().isInteger() &&
28482 "Only handle AVX vector integer operation");
28483 return splitVectorIntBinary(Op, DAG);
28484 }
28485
28486 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28488 EVT SetCCResultType =
28489 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28490
28491 unsigned BitWidth = VT.getScalarSizeInBits();
28492 if (Opcode == ISD::USUBSAT) {
28493 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28494 // Handle a special-case with a bit-hack instead of cmp+select:
28495 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28496 // If the target can use VPTERNLOG, DAGToDAG will match this as
28497 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28498 // "broadcast" constant load.
28499 ConstantSDNode *C = isConstOrConstSplat(Y, true);
28500 if (C && C->getAPIntValue().isSignMask()) {
28501 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28502 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28503 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28504 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28505 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28506 }
28507 }
28508 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28509 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28510 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28511 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28512 // TODO: Move this to DAGCombiner?
28513 if (SetCCResultType == VT &&
28514 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28515 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28516 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28517 }
28518 }
28519
28520 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28521 (!VT.isVector() || VT == MVT::v2i64)) {
28522 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28523 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28524 SDValue Zero = DAG.getConstant(0, DL, VT);
28525 SDValue Result =
28526 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28527 DAG.getVTList(VT, SetCCResultType), X, Y);
28528 SDValue SumDiff = Result.getValue(0);
28529 SDValue Overflow = Result.getValue(1);
28530 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28531 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28532 SDValue SumNeg =
28533 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28534 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28535 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28536 }
28537
28538 // Use default expansion.
28539 return SDValue();
28540}
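// Illustrative aside (not part of the original file): the special case
// "usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)" above for 8-bit lanes,
// where SMIN is 0x80. When X has its sign bit set, X ^ 0x80 == X - 0x80 and
// the arithmetic shift produces all-ones; otherwise the shift is zero and the
// result saturates to 0.
static uint8_t usubsatSignMask8Sketch(uint8_t X) {
  uint8_t Xor = X ^ 0x80;
  uint8_t Sra = (uint8_t)((int8_t)X >> 7); // 0xff if X >= 0x80, else 0x00
  return Xor & Sra;
}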
28541
28542static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28543 SelectionDAG &DAG) {
28544 MVT VT = Op.getSimpleValueType();
28545 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28546 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28547 // 8-bit integer abs to NEG and CMOV.
28548 SDLoc DL(Op);
28549 SDValue N0 = Op.getOperand(0);
28550 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28551 DAG.getConstant(0, DL, VT), N0);
28552 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28553 SDValue(Neg.getNode(), 1)};
28554 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28555 }
28556
28557 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28558 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28559 SDLoc DL(Op);
28560 SDValue Src = Op.getOperand(0);
28561 SDValue Sub =
28562 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
28563 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
28564 }
28565
28566 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28567 assert(VT.isInteger() &&
28568 "Only handle AVX 256-bit vector integer operation");
28569 return splitVectorIntUnary(Op, DAG);
28570 }
28571
28572 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28573 return splitVectorIntUnary(Op, DAG);
28574
28575 // Default to expand.
28576 return SDValue();
28577}
28578
28579static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28580 SelectionDAG &DAG) {
28581 MVT VT = Op.getSimpleValueType();
28582
28583 // For AVX1 cases, split to use legal ops (everything but v4i64).
28584 if (VT.is256BitVector() && !Subtarget.hasInt256())
28585 return splitVectorIntBinary(Op, DAG);
28586
28587 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28588 return splitVectorIntBinary(Op, DAG);
28589
28590 // Default to expand.
28591 return SDValue();
28592}
28593
28594static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
28595 MVT VT = Op.getSimpleValueType();
28596
28597 // For AVX1 cases, split to use legal ops (everything but v4i64).
28598 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
28599 return splitVectorIntBinary(Op, DAG);
28600
28601 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28602 return splitVectorIntBinary(Op, DAG);
28603
28604 // Default to expand.
28605 return SDValue();
28606}
28607
28608static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28609 SelectionDAG &DAG) {
28610 SDLoc dl(Op);
28611 MVT VT = Op.getSimpleValueType();
28612
28613 // Decompose 256-bit ops into 128-bit ops.
28614 if (VT.is256BitVector() && !Subtarget.hasInt256())
28615 return splitVectorIntBinary(Op, DAG);
28616
28617 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28618 return splitVectorIntBinary(Op, DAG);
28619
28620 SDValue A = Op.getOperand(0);
28621 SDValue B = Op.getOperand(1);
28622
28623 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28624 // vector pairs, multiply and truncate.
28625 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28626 unsigned NumElts = VT.getVectorNumElements();
28627
28628 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28629 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28630 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28631 return DAG.getNode(
28632 ISD::TRUNCATE, dl, VT,
28633 DAG.getNode(ISD::MUL, dl, ExVT,
28634 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28635 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28636 }
28637
28638 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28639
28640 // Extract the lo/hi parts and any-extend them to i16.
28641 // We're going to mask off the low byte of each result element of the
28642 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28643 // element.
28644 SDValue Undef = DAG.getUNDEF(VT);
28645 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28646 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28647
28648 SDValue BLo, BHi;
28649 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28650 // If the RHS is a constant, manually unpackl/unpackh.
28651 SmallVector<SDValue, 16> LoOps, HiOps;
28652 for (unsigned i = 0; i != NumElts; i += 16) {
28653 for (unsigned j = 0; j != 8; ++j) {
28654 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28655 MVT::i16));
28656 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28657 MVT::i16));
28658 }
28659 }
28660
28661 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28662 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28663 } else {
28664 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28665 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28666 }
28667
28668 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
28669 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28670 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28671 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28672 }
28673
28674 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28675 if (VT == MVT::v4i32) {
28676 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28677 "Should not custom lower when pmulld is available!");
28678
28679 // Extract the odd parts.
28680 static const int UnpackMask[] = { 1, -1, 3, -1 };
28681 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28682 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28683
28684 // Multiply the even parts.
28685 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28686 DAG.getBitcast(MVT::v2i64, A),
28687 DAG.getBitcast(MVT::v2i64, B));
28688 // Now multiply odd parts.
28689 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28690 DAG.getBitcast(MVT::v2i64, Aodds),
28691 DAG.getBitcast(MVT::v2i64, Bodds));
28692
28693 Evens = DAG.getBitcast(VT, Evens);
28694 Odds = DAG.getBitcast(VT, Odds);
28695
28696 // Merge the two vectors back together with a shuffle. This expands into 2
28697 // shuffles.
28698 static const int ShufMask[] = { 0, 4, 2, 6 };
28699 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28700 }
28701
28702 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28703 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28704 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28705
28706 // Ahi = psrlqi(a, 32);
28707 // Bhi = psrlqi(b, 32);
28708 //
28709 // AloBlo = pmuludq(a, b);
28710 // AloBhi = pmuludq(a, Bhi);
28711 // AhiBlo = pmuludq(Ahi, b);
28712 //
28713 // Hi = psllqi(AloBhi + AhiBlo, 32);
28714 // return AloBlo + Hi;
28715 KnownBits AKnown = DAG.computeKnownBits(A);
28716 KnownBits BKnown = DAG.computeKnownBits(B);
28717
28718 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28719 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28720 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28721
28722 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28723 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28724 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28725
28726 SDValue Zero = DAG.getConstant(0, dl, VT);
28727
28728 // Only multiply lo/hi halves that aren't known to be zero.
28729 SDValue AloBlo = Zero;
28730 if (!ALoIsZero && !BLoIsZero)
28731 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28732
28733 SDValue AloBhi = Zero;
28734 if (!ALoIsZero && !BHiIsZero) {
28735 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28736 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28737 }
28738
28739 SDValue AhiBlo = Zero;
28740 if (!AHiIsZero && !BLoIsZero) {
28741 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28742 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28743 }
28744
28745 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28746 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28747
28748 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28749}
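// Illustrative aside (not part of the original file): the v2i64/v4i64/v8i64
// multiply decomposition above for one lane, in scalar C++. Three 32x32->64
// PMULUDQ-style products reconstruct the low 64 bits of the product; the
// Ahi*Bhi term only affects bits above 64 and is dropped.
static uint64_t mul64ViaPMULUDQSketch(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;           // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;           // pmuludq(a, Bhi)
  uint64_t AhiBlo = AHi * BLo;           // pmuludq(Ahi, b)
  uint64_t Hi = (AloBhi + AhiBlo) << 32; // psllqi(AloBhi + AhiBlo, 32)
  return AloBlo + Hi;
}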
28750
28751static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28752 MVT VT, bool IsSigned,
28753 const X86Subtarget &Subtarget,
28754 SelectionDAG &DAG,
28755 SDValue *Low = nullptr) {
28756 unsigned NumElts = VT.getVectorNumElements();
28757
28758 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28759 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28760 // lane results back together.
28761
28762 // We'll take different approaches for signed and unsigned.
28763 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28764 // and use pmullw to calculate the full 16-bit product.
28765 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28766 // shift them left into the upper byte of each word. This allows us to use
28767 // pmulhw to calculate the full 16-bit product. This trick means we don't
28768 // need to sign extend the bytes to use pmullw.
28769
28770 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28771 SDValue Zero = DAG.getConstant(0, dl, VT);
28772
28773 SDValue ALo, AHi;
28774 if (IsSigned) {
28775 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28776 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28777 } else {
28778 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28779 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28780 }
28781
28782 SDValue BLo, BHi;
28783 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28784 // If the RHS is a constant, manually unpackl/unpackh and extend.
28785 SmallVector<SDValue, 16> LoOps, HiOps;
28786 for (unsigned i = 0; i != NumElts; i += 16) {
28787 for (unsigned j = 0; j != 8; ++j) {
28788 SDValue LoOp = B.getOperand(i + j);
28789 SDValue HiOp = B.getOperand(i + j + 8);
28790
28791 if (IsSigned) {
28792 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28793 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28794 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28795 DAG.getConstant(8, dl, MVT::i16));
28796 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28797 DAG.getConstant(8, dl, MVT::i16));
28798 } else {
28799 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28800 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28801 }
28802
28803 LoOps.push_back(LoOp);
28804 HiOps.push_back(HiOp);
28805 }
28806 }
28807
28808 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28809 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28810 } else if (IsSigned) {
28811 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28812 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28813 } else {
28814 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28815 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28816 }
28817
28818 // Multiply, lshr the upper 8 bits into the lower 8 bits of the lo/hi results and
28819 // pack back to vXi8.
28820 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28821 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28822 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28823
28824 if (Low)
28825 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28826
28827 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28828}
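// Illustrative aside (not part of the original file): why the signed path
// above can use PMULHW without sign extension. Unpacking a byte into the
// *high* byte of a 16-bit lane multiplies it by 256, so the high 16 bits of
// the 32-bit signed product of two such lanes is exactly the full 16-bit
// product of the original bytes.
static int16_t mulI8FullViaPMULHWSketch(int8_t A, int8_t B) {
  int16_t A16 = (int16_t)((uint16_t)(uint8_t)A << 8); // punpck with zeros below
  int16_t B16 = (int16_t)((uint16_t)(uint8_t)B << 8);
  int32_t Prod = (int32_t)A16 * (int32_t)B16;         // pmulhw keeps bits 31:16
  return (int16_t)(Prod >> 16);                       // == (int16_t)A * (int16_t)B
}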
28829
28830static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28831 SelectionDAG &DAG) {
28832 SDLoc dl(Op);
28833 MVT VT = Op.getSimpleValueType();
28834 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28835 unsigned NumElts = VT.getVectorNumElements();
28836 SDValue A = Op.getOperand(0);
28837 SDValue B = Op.getOperand(1);
28838
28839 // Decompose 256-bit ops into 128-bit ops.
28840 if (VT.is256BitVector() && !Subtarget.hasInt256())
28841 return splitVectorIntBinary(Op, DAG);
28842
28843 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28844 return splitVectorIntBinary(Op, DAG);
28845
28846 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28847 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28848 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28849 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28850
28851 // PMULxD operations multiply each even value (starting at 0) of LHS with
28852 // the related value of RHS and produce a widen result.
28853 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28854 // => <2 x i64> <ae|cg>
28855 //
28856 // In other words, to have all the results, we need to perform two PMULxD:
28857 // 1. one with the even values.
28858 // 2. one with the odd values.
28859 // To achieve #2, we need to place the odd values at an even position.
28860 //
28861 // Place the odd value at an even position (basically, shift all values 1
28862 // step to the left):
28863 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28864 9, -1, 11, -1, 13, -1, 15, -1};
28865 // <a|b|c|d> => <b|undef|d|undef>
28866 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
28867 makeArrayRef(&Mask[0], NumElts));
28868 // <e|f|g|h> => <f|undef|h|undef>
28869 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
28870 makeArrayRef(&Mask[0], NumElts));
28871
28872 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28873 // ints.
28874 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28875 unsigned Opcode =
28876 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28877 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28878 // => <2 x i64> <ae|cg>
28879 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28880 DAG.getBitcast(MulVT, A),
28881 DAG.getBitcast(MulVT, B)));
28882 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28883 // => <2 x i64> <bf|dh>
28884 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28885 DAG.getBitcast(MulVT, Odd0),
28886 DAG.getBitcast(MulVT, Odd1)));
28887
28888 // Shuffle it back into the right order.
28889 SmallVector<int, 16> ShufMask(NumElts);
28890 for (int i = 0; i != (int)NumElts; ++i)
28891 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28892
28893 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28894
28895 // If we have a signed multiply but no PMULDQ fix up the result of an
28896 // unsigned multiply.
28897 if (IsSigned && !Subtarget.hasSSE41()) {
28898 SDValue Zero = DAG.getConstant(0, dl, VT);
28899 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28900 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28901 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28902 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28903
28904 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28905 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28906 }
28907
28908 return Res;
28909 }
28910
28911 // Only i8 vectors should need custom lowering after this.
28912 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28913 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28914 "Unsupported vector type");
28915
28916 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28917 // logical shift down the upper half and pack back to i8.
28918
28919 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28920 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28921
28922 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28923 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28924 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28925 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28926 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28927 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28928 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28929 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28930 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28931 }
28932
28933 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28934}
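// Illustrative aside (not part of the original file): the signed fixup above
// in scalar form. When only PMULUDQ is available, the signed high half is the
// unsigned high half minus B where A is negative and minus A where B is
// negative (all arithmetic modulo 2^32).
static uint32_t mulhu32Sketch(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32); // what PMULUDQ exposes per lane
}
static int32_t mulhs32ViaPMULUDQSketch(int32_t A, int32_t B) {
  uint32_t Res = mulhu32Sketch((uint32_t)A, (uint32_t)B);
  uint32_t T1 = A < 0 ? (uint32_t)B : 0; // AND(setgt(0, A), B)
  uint32_t T2 = B < 0 ? (uint32_t)A : 0; // AND(setgt(0, B), A)
  return (int32_t)(Res - (T1 + T2));     // SUB(Res, Fixup)
}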
28935
28936// Custom lowering for SMULO/UMULO.
28937static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28938 SelectionDAG &DAG) {
28939 MVT VT = Op.getSimpleValueType();
28940
28941 // Scalars defer to LowerXALUO.
28942 if (!VT.isVector())
28943 return LowerXALUO(Op, DAG);
28944
28945 SDLoc dl(Op);
28946 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28947 SDValue A = Op.getOperand(0);
28948 SDValue B = Op.getOperand(1);
28949 EVT OvfVT = Op->getValueType(1);
28950
28951 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28952 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28953 // Extract the LHS Lo/Hi vectors
28954 SDValue LHSLo, LHSHi;
28955 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28956
28957 // Extract the RHS Lo/Hi vectors
28958 SDValue RHSLo, RHSHi;
28959 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28960
28961 EVT LoOvfVT, HiOvfVT;
28962 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28963 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28964 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28965
28966 // Issue the split operations.
28967 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28968 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28969
28970 // Join the separate data results and the overflow results.
28971 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28972 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28973 Hi.getValue(1));
28974
28975 return DAG.getMergeValues({Res, Ovf}, dl);
28976 }
28977
28978 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28979 EVT SetccVT =
28980 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28981
28982 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28983 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28984 unsigned NumElts = VT.getVectorNumElements();
28985 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28986 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28987 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28988 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28989 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28990
28991 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28992
28993 SDValue Ovf;
28994 if (IsSigned) {
28995 SDValue High, LowSign;
28996 if (OvfVT.getVectorElementType() == MVT::i1 &&
28997 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28998 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28999 // Shift the high down filling with sign bits.
29000 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29001 // Fill all 16 bits with the sign bit from the low.
29002 LowSign =
29003 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29004 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29005 15, DAG);
29006 SetccVT = OvfVT;
29007 if (!Subtarget.hasBWI()) {
29008 // We can't do a vXi16 compare so sign extend to v16i32.
29009 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29010 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29011 }
29012 } else {
29013 // Otherwise do the compare at vXi8.
29014 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29015 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29016 LowSign =
29017 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29018 }
29019
29020 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29021 } else {
29022 SDValue High =
29023 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29024 if (OvfVT.getVectorElementType() == MVT::i1 &&
29025 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29026 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29027 SetccVT = OvfVT;
29028 if (!Subtarget.hasBWI()) {
29029 // We can't do a vXi16 compare so sign extend to v16i32.
29030 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
29031 }
29032 } else {
29033 // Otherwise do the compare at vXi8.
29034 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29035 }
29036
29037 Ovf =
29038 DAG.getSetCC(dl, SetccVT, High,
29039 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
29040 }
29041
29042 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29043
29044 return DAG.getMergeValues({Low, Ovf}, dl);
29045 }
29046
29047 SDValue Low;
29048 SDValue High =
29049 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
29050
29051 SDValue Ovf;
29052 if (IsSigned) {
29053 // SMULO overflows if the high bits don't match the sign of the low.
29054 SDValue LowSign =
29055 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29056 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29057 } else {
29058 // UMULO overflows if the high bits are non-zero.
29059 Ovf =
29060 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
29061 }
29062
29063 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29064
29065 return DAG.getMergeValues({Low, Ovf}, dl);
29066}
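// Illustrative aside (not part of the original file): the overflow tests the
// code above performs per element, written for one i8 lane. SMULO overflows
// when the high half differs from the sign-fill of the low half; UMULO
// overflows when the high half is non-zero.
static bool smulo8Sketch(int8_t A, int8_t B, int8_t &Low) {
  int16_t Full = (int16_t)A * (int16_t)B;
  Low = (int8_t)Full;
  int8_t High = (int8_t)(Full >> 8);
  return High != (int8_t)(Low >> 7);  // setcc(LowSign, High, SETNE)
}
static bool umulo8Sketch(uint8_t A, uint8_t B, uint8_t &Low) {
  uint16_t Full = (uint16_t)((unsigned)A * B);
  Low = (uint8_t)Full;
  return (uint8_t)(Full >> 8) != 0;   // setcc(High, 0, SETNE)
}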
29067
29068SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
29069 assert(Subtarget.isTargetWin64() && "Unexpected target");
29070 EVT VT = Op.getValueType();
29071 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29072 "Unexpected return type for lowering");
29073
29074 RTLIB::Libcall LC;
29075 bool isSigned;
29076 switch (Op->getOpcode()) {
29077 default: llvm_unreachable("Unexpected request for libcall!");
29078 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
29079 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
29080 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
29081 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
29082 }
29083
29084 SDLoc dl(Op);
29085 SDValue InChain = DAG.getEntryNode();
29086
29087 TargetLowering::ArgListTy Args;
29088 TargetLowering::ArgListEntry Entry;
29089 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29090 EVT ArgVT = Op->getOperand(i).getValueType();
29091 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29092 "Unexpected argument type for lowering");
29093 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29094 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29095 MachinePointerInfo MPI =
29096 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29097 Entry.Node = StackPtr;
29098 InChain =
29099 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29100 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
29101 Entry.Ty = PointerType::get(ArgTy,0);
29102 Entry.IsSExt = false;
29103 Entry.IsZExt = false;
29104 Args.push_back(Entry);
29105 }
29106
29107 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
29108 getPointerTy(DAG.getDataLayout()));
29109
29110 TargetLowering::CallLoweringInfo CLI(DAG);
29111 CLI.setDebugLoc(dl)
29112 .setChain(InChain)
29113 .setLibCallee(
29114 getLibcallCallingConv(LC),
29115 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
29116 std::move(Args))
29117 .setInRegister()
29118 .setSExtResult(isSigned)
29119 .setZExtResult(!isSigned);
29120
29121 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29122 return DAG.getBitcast(VT, CallInfo.first);
29123}
29124
29125SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29126 SelectionDAG &DAG,
29127 SDValue &Chain) const {
29128 assert(Subtarget.isTargetWin64() && "Unexpected target");
29129 EVT VT = Op.getValueType();
29130 bool IsStrict = Op->isStrictFPOpcode();
29131
29132 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29133 EVT ArgVT = Arg.getValueType();
29134
29135 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29136 "Unexpected return type for lowering");
29137
29138 RTLIB::Libcall LC;
29139 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29140 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29141 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29142 else
29143 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29144 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29145
29146 SDLoc dl(Op);
29147 MakeLibCallOptions CallOptions;
29148 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29149
29150 SDValue Result;
29151 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
29152 // expected VT (i128).
29153 std::tie(Result, Chain) =
29154 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29155 Result = DAG.getBitcast(VT, Result);
29156 return Result;
29157}
29158
29159SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29160 SelectionDAG &DAG) const {
29161 assert(Subtarget.isTargetWin64() && "Unexpected target");
29162 EVT VT = Op.getValueType();
29163 bool IsStrict = Op->isStrictFPOpcode();
29164
29165 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29166 EVT ArgVT = Arg.getValueType();
29167
29168 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29169 "Unexpected argument type for lowering");
29170
29171 RTLIB::Libcall LC;
29172 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29173 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29174 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29175 else
29176 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29177 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29178
29179 SDLoc dl(Op);
29180 MakeLibCallOptions CallOptions;
29181 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29182
29183 // Pass the i128 argument as an indirect argument on the stack.
29184 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29185 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29186 MachinePointerInfo MPI =
29187 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29188 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29189
29190 SDValue Result;
29191 std::tie(Result, Chain) =
29192 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29193 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29194}
29195
29196// Return true if the required (according to Opcode) shift-imm form is natively
29197// supported by the Subtarget
29198static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
29199 unsigned Opcode) {
29200 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29201 return false;
29202
29203 if (VT.getScalarSizeInBits() < 16)
29204 return false;
29205
29206 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29207 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29208 return true;
29209
29210 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29211 (VT.is256BitVector() && Subtarget.hasInt256());
29212
29213 bool AShift = LShift && (Subtarget.hasAVX512() ||
29214 (VT != MVT::v2i64 && VT != MVT::v4i64));
29215 return (Opcode == ISD::SRA) ? AShift : LShift;
29216}
29217
29218// The shift amount is a variable, but it is the same for all vector lanes.
29219// These instructions are defined together with shift-immediate.
29220static
29221bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
29222 unsigned Opcode) {
29223 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29224}
29225
29226// Return true if the required (according to Opcode) variable-shift form is
29227// natively supported by the Subtarget
29228static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
29229 unsigned Opcode) {
29230 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29231 return false;
29232
29233 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29234 return false;
29235
29236 // vXi16 supported only on AVX-512, BWI
29237 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29238 return false;
29239
29240 if (Subtarget.hasAVX512() &&
29241 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29242 return true;
29243
29244 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29245 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29246 return (Opcode == ISD::SRA) ? AShift : LShift;
29247}
29248
29249static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29250 const X86Subtarget &Subtarget) {
29251 MVT VT = Op.getSimpleValueType();
29252 SDLoc dl(Op);
29253 SDValue R = Op.getOperand(0);
29254 SDValue Amt = Op.getOperand(1);
29255 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29256
29257 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29258 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29259 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29260 SDValue Ex = DAG.getBitcast(ExVT, R);
29261
29262 // ashr(R, 63) === cmp_slt(R, 0)
29263 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29264 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29265 "Unsupported PCMPGT op");
29266 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29267 }
29268
29269 if (ShiftAmt >= 32) {
29270 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29271 SDValue Upper =
29272 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29273 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29274 ShiftAmt - 32, DAG);
29275 if (VT == MVT::v2i64)
29276 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29277 if (VT == MVT::v4i64)
29278 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29279 {9, 1, 11, 3, 13, 5, 15, 7});
29280 } else {
29281 // SRA upper i32, SRL whole i64 and select lower i32.
29282 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29283 ShiftAmt, DAG);
29284 SDValue Lower =
29285 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29286 Lower = DAG.getBitcast(ExVT, Lower);
29287 if (VT == MVT::v2i64)
29288 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29289 if (VT == MVT::v4i64)
29290 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29291 {8, 1, 10, 3, 12, 5, 14, 7});
29292 }
29293 return DAG.getBitcast(VT, Ex);
29294 };
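// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the ArithmeticShiftRight64 lambda above, assuming
// 0 <= amt <= 63: the 64-bit arithmetic shift is rebuilt from 32-bit SRA/SRL
// pieces, which is what the VSRAI/VSRLI + shuffle sequence does per i64 lane.
#include <cstdint>

static int64_t sra64_via_i32(int64_t x, unsigned amt) {
  int32_t hi = int32_t(uint64_t(x) >> 32);       // upper half, keeps the sign
  uint32_t newHi, newLo;
  if (amt >= 32) {
    newHi = uint32_t(hi >> 31);                  // splat sign into the upper half
    newLo = uint32_t(hi >> (amt - 32));          // SRA the old upper half
  } else {
    newHi = uint32_t(hi >> amt);                 // SRA upper i32
    newLo = uint32_t(uint64_t(x) >> amt);        // SRL whole i64, keep low half
  }
  return int64_t((uint64_t(newHi) << 32) | newLo);
}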
29295
29296 // Optimize shl/srl/sra with constant shift amount.
29297 APInt APIntShiftAmt;
29298 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29299 return SDValue();
29300
29301 // If the shift amount is out of range, return undef.
29302 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
29303 return DAG.getUNDEF(VT);
29304
29305 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29306
29307 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
29308 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29309
29310 // i64 SRA needs to be performed as partial shifts.
29311 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29312 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29313 Op.getOpcode() == ISD::SRA)
29314 return ArithmeticShiftRight64(ShiftAmt);
29315
29316 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29317 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29318 unsigned NumElts = VT.getVectorNumElements();
29319 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29320
29321 // Simple i8 add case
29322 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29323 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29324 // must be 0). (add undef, undef) however can be any value. To make this
29325 // safe, we must freeze R to ensure that register allocation uses the same
29326 // register for an undefined value. This ensures that the result will
29327 // still be even and preserves the original semantics.
29328 R = DAG.getNode(ISD::FREEZE, dl, VT, R);
29329 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29330 }
29331
29332 // ashr(R, 7) === cmp_slt(R, 0)
29333 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29334 SDValue Zeros = DAG.getConstant(0, dl, VT);
29335 if (VT.is512BitVector()) {
29336 assert(VT == MVT::v64i8 && "Unexpected element type!");
29337 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29338 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29339 }
29340 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29341 }
29342
29343 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29344 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29345 return SDValue();
29346
29347 if (Op.getOpcode() == ISD::SHL) {
29348 // Make a large shift.
29349 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29350 ShiftAmt, DAG);
29351 SHL = DAG.getBitcast(VT, SHL);
29352 // Zero out the rightmost bits.
29353 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29354 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29355 }
29356 if (Op.getOpcode() == ISD::SRL) {
29357 // Make a large shift.
29358 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29359 ShiftAmt, DAG);
29360 SRL = DAG.getBitcast(VT, SRL);
29361 // Zero out the leftmost bits.
29362 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29363 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29364 }
29365 if (Op.getOpcode() == ISD::SRA) {
29366 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
29367 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29368
29369 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29370 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29371 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29372 return Res;
29373 }
29374 llvm_unreachable("Unknown shift opcode.");
29375 }
29376
29377 return SDValue();
29378}
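// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar i8 models of the constant vXi8 shifts lowered above: the bytes are
// shifted as wider i16 lanes, the bits that leak in from the neighbouring
// byte are masked off, and for SRA the sign is restored with the identity
// ashr(x, s) == sub(xor(lshr(x, s), 128 >> s), 128 >> s).
// Example: x = 0xF0 (-16), s = 2: lshr -> 0x3C, mask = 0x20, xor -> 0x1C,
// sub -> 0xFC == -4 == ashr(-16, 2).
#include <cstdint>

static uint8_t shl8(uint8_t x, unsigned s) {      // s in [1, 7]
  return uint8_t(x << s) & uint8_t(0xFFu << s);   // APInt::getHighBitsSet(8, 8 - s)
}
static uint8_t srl8(uint8_t x, unsigned s) {
  return uint8_t(x >> s) & uint8_t(0xFFu >> s);   // APInt::getLowBitsSet(8, 8 - s)
}
static int8_t sra8(uint8_t x, unsigned s) {
  uint8_t mask = uint8_t(128u >> s);              // the shifted sign bit
  return int8_t(uint8_t((uint8_t(x >> s) ^ mask) - mask));
}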
29379
29380static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29381 const X86Subtarget &Subtarget) {
29382 MVT VT = Op.getSimpleValueType();
29383 SDLoc dl(Op);
29384 SDValue R = Op.getOperand(0);
29385 SDValue Amt = Op.getOperand(1);
29386 unsigned Opcode = Op.getOpcode();
29387 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29388
29389 int BaseShAmtIdx = -1;
29390 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29391 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29392 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29393 Subtarget, DAG);
29394
29395 // vXi8 shifts - shift as v8i16 + mask result.
29396 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29397 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29398 VT == MVT::v64i8) &&
29399 !Subtarget.hasXOP()) {
29400 unsigned NumElts = VT.getVectorNumElements();
29401 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29402 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29403 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29404 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29405
29406 // Create the mask using vXi16 shifts. For shift-rights we need to move
29407 // the upper byte down before splatting the vXi8 mask.
29408 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29409 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29410 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29411 if (Opcode != ISD::SHL)
29412 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29413 8, DAG);
29414 BitMask = DAG.getBitcast(VT, BitMask);
29415 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29416 SmallVector<int, 64>(NumElts, 0));
29417
29418 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29419 DAG.getBitcast(ExtVT, R), BaseShAmt,
29420 BaseShAmtIdx, Subtarget, DAG);
29421 Res = DAG.getBitcast(VT, Res);
29422 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29423
29424 if (Opcode == ISD::SRA) {
29425 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29426 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29427 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29428 SignMask =
29429 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29430 BaseShAmtIdx, Subtarget, DAG);
29431 SignMask = DAG.getBitcast(VT, SignMask);
29432 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29433 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29434 }
29435 return Res;
29436 }
29437 }
29438 }
29439
29440 return SDValue();
29441}
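// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// How the splatted byte mask above is derived for a uniform shift amount s:
// shift an all-ones i16 with the same logical shift, for right shifts move
// the surviving upper byte down, then broadcast byte 0 across the vector.
// That byte is exactly the per-byte mask needed after shifting R as vXi16.
#include <cstdint>

static uint8_t byteMaskForShift(bool isShiftLeft, unsigned s) {  // s in [0, 7]
  uint16_t bits = 0xFFFF;                          // DAG.getConstant(-1, dl, ExtVT)
  bits = isShiftLeft ? uint16_t(bits << s) : uint16_t(bits >> s);
  if (!isShiftLeft)
    bits = uint16_t(bits >> 8);                    // move the upper byte down
  return uint8_t(bits);                            // byte 0 is then splatted
}
// byteMaskForShift(true, 3)  == 0xF8   (mask for a vXi8 SHL by 3)
// byteMaskForShift(false, 3) == 0x1F   (mask for a vXi8 SRL/SRA by 3)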
29442
29443// Convert a shift/rotate left amount to a multiplication scale factor.
29444static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29445 const X86Subtarget &Subtarget,
29446 SelectionDAG &DAG) {
29447 MVT VT = Amt.getSimpleValueType();
29448 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29449 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29450 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29451 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29452 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29453 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29454 return SDValue();
29455
29456 MVT SVT = VT.getVectorElementType();
29457 unsigned SVTBits = SVT.getSizeInBits();
29458 unsigned NumElems = VT.getVectorNumElements();
29459
29460 APInt UndefElts;
29461 SmallVector<APInt> EltBits;
29462 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29463 APInt One(SVTBits, 1);
29464 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29465 for (unsigned I = 0; I != NumElems; ++I) {
29466 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29467 continue;
29468 uint64_t ShAmt = EltBits[I].getZExtValue();
29469 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29470 }
29471 return DAG.getBuildVector(VT, dl, Elts);
29472 }
29473
29474 // If the target doesn't support variable shifts, use either FP conversion
29475 // or integer multiplication to avoid shifting each element individually.
29476 if (VT == MVT::v4i32) {
29477 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29478 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29479 DAG.getConstant(0x3f800000U, dl, VT));
29480 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29481 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29482 }
29483
29484 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29485 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29486 SDValue Z = DAG.getConstant(0, dl, VT);
29487 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29488 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29489 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29490 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29491 if (Subtarget.hasSSE41())
29492 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29493 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29494 }
29495
29496 return SDValue();
29497}
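// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// The v4i32 path above materialises 2^Amt without a variable shift: putting
// Amt into the float exponent field (Amt << 23, biased by adding
// 0x3f800000 == 1.0f) gives the float 2.0^Amt, and FP_TO_SINT turns that back
// into the integer scale. Worked example for Amt == 5:
// (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f, and (int)32.0f == 32 == 1 << 5.
#include <cstdint>
#include <cstring>

static int32_t shiftScaleViaFloat(uint32_t amt) {   // amt in [0, 30]
  uint32_t bits = (amt << 23) + 0x3f800000u;        // exponent trick
  float f;
  std::memcpy(&f, &bits, sizeof(f));                // the bitcast to v4f32
  return int32_t(f);                                // FP_TO_SINT
}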
29498
29499static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29500 SelectionDAG &DAG) {
29501 MVT VT = Op.getSimpleValueType();
29502 SDLoc dl(Op);
29503 SDValue R = Op.getOperand(0);
29504 SDValue Amt = Op.getOperand(1);
29505 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29506 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29507
29508 unsigned Opc = Op.getOpcode();
29509 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29510 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29511
29512 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29513 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29514
29515 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29516 return V;
29517
29518 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29519 return V;
29520
29521 if (supportedVectorVarShift(VT, Subtarget, Opc))
29522 return Op;
29523
29524 // i64 vector arithmetic shift can be emulated with the transform:
29525 // M = lshr(SIGN_MASK, Amt)
29526 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29527 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29528 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29529 Opc == ISD::SRA) {
29530 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29531 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29532 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29533 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29534 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29535 return R;
29536 }
29537
29538 // XOP has 128-bit variable logical/arithmetic shifts.
29539 // +ve/-ve Amt = shift left/right.
29540 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29541 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29542 if (Opc == ISD::SRL || Opc == ISD::SRA) {
29543 SDValue Zero = DAG.getConstant(0, dl, VT);
29544 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29545 }
29546 if (Opc == ISD::SHL || Opc == ISD::SRL)
29547 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29548 if (Opc == ISD::SRA)
29549 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29550 }
29551
29552 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29553 // shifts per-lane and then shuffle the partial results back together.
29554 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29555 // Splat the shift amounts so the scalar shifts above will catch it.
29556 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29557 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29558 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29559 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29560 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29561 }
29562
29563 // If possible, lower this shift as a sequence of two shifts by
29564 // constant plus a BLENDing shuffle instead of scalarizing it.
29565 // Example:
29566 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29567 //
29568 // Could be rewritten as:
29569 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29570 //
29571 // The advantage is that the two shifts from the example would be
29572 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29573 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29574 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29575 SDValue Amt1, Amt2;
29576 unsigned NumElts = VT.getVectorNumElements();
29577 SmallVector<int, 8> ShuffleMask;
29578 for (unsigned i = 0; i != NumElts; ++i) {
29579 SDValue A = Amt->getOperand(i);
29580 if (A.isUndef()) {
29581 ShuffleMask.push_back(SM_SentinelUndef);
29582 continue;
29583 }
29584 if (!Amt1 || Amt1 == A) {
29585 ShuffleMask.push_back(i);
29586 Amt1 = A;
29587 continue;
29588 }
29589 if (!Amt2 || Amt2 == A) {
29590 ShuffleMask.push_back(i + NumElts);
29591 Amt2 = A;
29592 continue;
29593 }
29594 break;
29595 }
29596
29597 // Only perform this blend if we can perform it without loading a mask.
29598 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29599 (VT != MVT::v16i16 ||
29600 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29601 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29602 canWidenShuffleElements(ShuffleMask))) {
29603 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29604 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29605 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29606 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29607 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29608 Cst1->getZExtValue(), DAG);
29609 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29610 Cst2->getZExtValue(), DAG);
29611 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29612 }
29613 }
29614 }
29615
29616 // If possible, lower this packed shift into a vector multiply instead of
29617 // expanding it into a sequence of scalar shifts.
29618 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29619 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29620 Subtarget.canExtendTo512BW())))
29621 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29622 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29623
29624 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29625 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29626 if (Opc == ISD::SRL && ConstantAmt &&
29627 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29628 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29629 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29630 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29631 SDValue Zero = DAG.getConstant(0, dl, VT);
29632 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29633 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29634 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29635 }
29636 }
29637
29638 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29639 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29640 // TODO: Special case handling for shift by 0/1, really we can afford either
29641 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29642 if (Opc == ISD::SRA && ConstantAmt &&
29643 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29644 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29645 !Subtarget.hasAVX512()) ||
29646 DAG.isKnownNeverZero(Amt))) {
29647 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29648 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29649 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29650 SDValue Amt0 =
29651 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29652 SDValue Amt1 =
29653 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29654 SDValue Sra1 =
29655 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29656 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29657 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29658 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29659 }
29660 }
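// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar i16 model of the two blocks above: a constant right shift by s with
// 0 < s < 16 is the high half of a multiply by 2^(16 - s), i.e.
// srl(x, s) == mulhu(x, 1 << (16 - s)) and sra(x, s) == mulhs(x, 1 << (16 - s)),
// which is why the amount is rewritten as (NumEltBits - Amt) and fed through
// convertShiftLeftToScale. Shift-by-0 (and shift-by-1 in the signed case,
// where 1 << 15 does not fit a signed i16 scale) is handled by the selects.
#include <cstdint>

static uint16_t srl16_via_mulhu(uint16_t x, unsigned s) {   // s in [1, 15]
  uint32_t scale = 1u << (16 - s);
  return uint16_t((uint32_t(x) * scale) >> 16);             // MULHU
}
static int16_t sra16_via_mulhs(int16_t x, unsigned s) {     // s in [2, 15]
  int32_t scale = 1 << (16 - s);
  return int16_t((int32_t(x) * scale) >> 16);               // MULHS
}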
29661
29662 // v4i32 Non Uniform Shifts.
29663 // If the shift amount is constant we can shift each lane using the SSE2
29664 // immediate shifts, else we need to zero-extend each lane to the lower i64
29665 // and shift using the SSE2 variable shifts.
29666 // The separate results can then be blended together.
29667 if (VT == MVT::v4i32) {
29668 SDValue Amt0, Amt1, Amt2, Amt3;
29669 if (ConstantAmt) {
29670 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29671 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29672 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29673 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29674 } else {
29675 // The SSE2 shifts use the lower i64 as the same shift amount for
29676 // all lanes and the upper i64 is ignored. On AVX we're better off
29677 // just zero-extending, but for SSE just duplicating the top 16-bits is
29678 // cheaper and has the same effect for out of range values.
29679 if (Subtarget.hasAVX()) {
29680 SDValue Z = DAG.getConstant(0, dl, VT);
29681 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29682 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29683 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29684 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29685 } else {
29686 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29687 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29688 {4, 5, 6, 7, -1, -1, -1, -1});
29689 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29690 {0, 1, 1, 1, -1, -1, -1, -1});
29691 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29692 {2, 3, 3, 3, -1, -1, -1, -1});
29693 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29694 {0, 1, 1, 1, -1, -1, -1, -1});
29695 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29696 {2, 3, 3, 3, -1, -1, -1, -1});
29697 }
29698 }
29699
29700 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29701 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29702 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29703 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29704 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29705
29706 // Merge the shifted lane results optimally with/without PBLENDW.
29707 // TODO - ideally shuffle combining would handle this.
29708 if (Subtarget.hasSSE41()) {
29709 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29710 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29711 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29712 }
29713 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29714 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29715 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29716 }
29717
29718 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29719 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29720 // make the existing SSE solution better.
29721 // NOTE: We honor preferred vector width before promoting to 512-bits.
29722 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29723 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29724 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29725 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29726 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29727 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29728 "Unexpected vector type");
29729 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29730 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29731 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29732 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29733 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29734 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29735 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29736 }
29737
29738 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29739 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29740 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29741 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29742 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29743 !Subtarget.hasXOP()) {
29744 int NumElts = VT.getVectorNumElements();
29745 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29746
29747 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29748 // isn't legal).
29749 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29750 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29751 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29752 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29753 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29754 "Constant build vector expected");
29755
29756 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29757 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
29758 : DAG.getZExtOrTrunc(R, dl, ExVT);
29759 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29760 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29761 return DAG.getZExtOrTrunc(R, dl, VT);
29762 }
29763
29764 SmallVector<SDValue, 16> LoAmt, HiAmt;
29765 for (int i = 0; i != NumElts; i += 16) {
29766 for (int j = 0; j != 8; ++j) {
29767 LoAmt.push_back(Amt.getOperand(i + j));
29768 HiAmt.push_back(Amt.getOperand(i + j + 8));
29769 }
29770 }
29771
29772 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29773 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29774 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29775
29776 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29777 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29778 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29779 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29780 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29781 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29782 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29783 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29784 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29785 }
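// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the constant vXi8 block above: each byte is widened to i16
// (zext for SRL, sext for SRA), multiplied by 2^(8 - s), and the high byte of
// that product is the shifted byte - effectively a per-byte MUL_LOHI.
#include <cstdint>

static uint8_t srl8_via_mul(uint8_t x, unsigned s) {   // s in [1, 7]
  uint16_t scale = uint16_t(1u << (8 - s));
  return uint8_t((uint16_t(x) * scale) >> 8);
}
static int8_t sra8_via_mul(int8_t x, unsigned s) {     // s in [1, 7]
  uint16_t scale = uint16_t(1u << (8 - s));
  uint16_t prod = uint16_t(uint16_t(int16_t(x)) * scale);  // sext, i16 multiply
  return int8_t(uint8_t(prod >> 8));                       // VSRLI by 8, truncate
}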
29786
29787 if (VT == MVT::v16i8 ||
29788 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29789 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29790 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29791
29792 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29793 if (VT.is512BitVector()) {
29794 // On AVX512BW targets we make use of the fact that VSELECT lowers
29795 // to a masked blend which selects bytes based just on the sign bit
29796 // extracted to a mask.
29797 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29798 V0 = DAG.getBitcast(VT, V0);
29799 V1 = DAG.getBitcast(VT, V1);
29800 Sel = DAG.getBitcast(VT, Sel);
29801 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29802 ISD::SETGT);
29803 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29804 } else if (Subtarget.hasSSE41()) {
29805 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29806 // on the sign bit.
29807 V0 = DAG.getBitcast(VT, V0);
29808 V1 = DAG.getBitcast(VT, V1);
29809 Sel = DAG.getBitcast(VT, Sel);
29810 return DAG.getBitcast(SelVT,
29811 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29812 }
29813 // On pre-SSE41 targets we test for the sign bit by comparing to
29814 // zero - a negative value will set all bits of the lanes to true
29815 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29816 SDValue Z = DAG.getConstant(0, dl, SelVT);
29817 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29818 return DAG.getSelect(dl, SelVT, C, V0, V1);
29819 };
29820
29821 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29822 // We can safely do this using i16 shifts as we're only interested in
29823 // the 3 lower bits of each byte.
29824 Amt = DAG.getBitcast(ExtVT, Amt);
29825 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29826 Amt = DAG.getBitcast(VT, Amt);
29827
29828 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29829 // r = VSELECT(r, shift(r, 4), a);
29830 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29831 R = SignBitSelect(VT, Amt, M, R);
29832
29833 // a += a
29834 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29835
29836 // r = VSELECT(r, shift(r, 2), a);
29837 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29838 R = SignBitSelect(VT, Amt, M, R);
29839
29840 // a += a
29841 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29842
29843 // return VSELECT(r, shift(r, 1), a);
29844 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29845 R = SignBitSelect(VT, Amt, M, R);
29846 return R;
29847 }
29848
29849 if (Opc == ISD::SRA) {
29850 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29851 // so we can correctly sign extend. We don't care what happens to the
29852 // lower byte.
29853 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29854 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29855 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29856 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29857 ALo = DAG.getBitcast(ExtVT, ALo);
29858 AHi = DAG.getBitcast(ExtVT, AHi);
29859 RLo = DAG.getBitcast(ExtVT, RLo);
29860 RHi = DAG.getBitcast(ExtVT, RHi);
29861
29862 // r = VSELECT(r, shift(r, 4), a);
29863 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29864 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29865 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29866 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29867
29868 // a += a
29869 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29870 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29871
29872 // r = VSELECT(r, shift(r, 2), a);
29873 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29874 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29875 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29876 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29877
29878 // a += a
29879 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29880 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29881
29882 // r = VSELECT(r, shift(r, 1), a);
29883 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29884 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29885 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29886 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29887
29888 // Logical shift the result back to the lower byte, leaving a zero upper
29889 // byte meaning that we can safely pack with PACKUSWB.
29890 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29891 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29892 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29893 }
29894 }
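// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the vXi8 variable-shift ladder above: after Amt <<= 5,
// bit 2 of the original amount sits in the byte's sign bit, so a sign-based
// select decides whether to shift by 4; doubling Amt then moves bit 1 (and
// afterwards bit 0) into the sign position for the shift-by-2 and shift-by-1
// steps. Shown for SHL; SRL/SRA follow the same pattern.
#include <cstdint>

static uint8_t shl8_ladder(uint8_t r, uint8_t amt) {   // amt in [0, 7]
  uint8_t a = uint8_t(amt << 5);
  if (a & 0x80) r = uint8_t(r << 4);   // the VSELECT on the sign bit
  a = uint8_t(a + a);
  if (a & 0x80) r = uint8_t(r << 2);
  a = uint8_t(a + a);
  if (a & 0x80) r = uint8_t(r << 1);
  return r;
}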
29895
29896 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29897 MVT ExtVT = MVT::v8i32;
29898 SDValue Z = DAG.getConstant(0, dl, VT);
29899 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29900 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29901 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29902 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29903 ALo = DAG.getBitcast(ExtVT, ALo);
29904 AHi = DAG.getBitcast(ExtVT, AHi);
29905 RLo = DAG.getBitcast(ExtVT, RLo);
29906 RHi = DAG.getBitcast(ExtVT, RHi);
29907 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29908 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29909 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29910 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29911 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29912 }
29913
29914 if (VT == MVT::v8i16) {
29915 // If we have a constant shift amount, the non-SSE41 path is best as
29916 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29917 bool UseSSE41 = Subtarget.hasSSE41() &&
29918 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29919
29920 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29921 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29922 // the sign bit.
29923 if (UseSSE41) {
29924 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29925 V0 = DAG.getBitcast(ExtVT, V0);
29926 V1 = DAG.getBitcast(ExtVT, V1);
29927 Sel = DAG.getBitcast(ExtVT, Sel);
29928 return DAG.getBitcast(
29929 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29930 }
29931 // On pre-SSE41 targets we splat the sign bit - a negative value will
29932 // set all bits of the lanes to true and VSELECT uses that in
29933 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29934 SDValue C =
29935 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29936 return DAG.getSelect(dl, VT, C, V0, V1);
29937 };
29938
29939 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29940 if (UseSSE41) {
29941 // On SSE41 targets we need to replicate the shift mask in both
29942 // bytes for PBLENDVB.
29943 Amt = DAG.getNode(
29944 ISD::OR, dl, VT,
29945 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29946 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29947 } else {
29948 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29949 }
29950
29951 // r = VSELECT(r, shift(r, 8), a);
29952 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29953 R = SignBitSelect(Amt, M, R);
29954
29955 // a += a
29956 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29957
29958 // r = VSELECT(r, shift(r, 4), a);
29959 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29960 R = SignBitSelect(Amt, M, R);
29961
29962 // a += a
29963 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29964
29965 // r = VSELECT(r, shift(r, 2), a);
29966 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29967 R = SignBitSelect(Amt, M, R);
29968
29969 // a += a
29970 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29971
29972 // return VSELECT(r, shift(r, 1), a);
29973 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29974 R = SignBitSelect(Amt, M, R);
29975 return R;
29976 }
29977
29978 // Decompose 256-bit shifts into 128-bit shifts.
29979 if (VT.is256BitVector())
29980 return splitVectorIntBinary(Op, DAG);
29981
29982 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29983 return splitVectorIntBinary(Op, DAG);
29984
29985 return SDValue();
29986}
29987
29988static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29989 SelectionDAG &DAG) {
29990 MVT VT = Op.getSimpleValueType();
29991 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29992 "Unexpected funnel shift opcode!");
29993
29994 SDLoc DL(Op);
29995 SDValue Op0 = Op.getOperand(0);
29996 SDValue Op1 = Op.getOperand(1);
29997 SDValue Amt = Op.getOperand(2);
29998 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29999 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30000
30001 if (VT.isVector()) {
30002 APInt APIntShiftAmt;
30003 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30004
30005 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
30006 if (IsFSHR)
30007 std::swap(Op0, Op1);
30008
30009 if (IsCstSplat) {
30010 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30011 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
30012 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
30013 {Op0, Op1, Imm}, DAG, Subtarget);
30014 }
30015 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
30016 {Op0, Op1, Amt}, DAG, Subtarget);
30017 }
30018 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30019 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
30020 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
30021 "Unexpected funnel shift type!");
30022
30023 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
30024 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
30025 if (IsCstSplat)
30026 return SDValue();
30027
30028 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30029 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30030 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30031
30032 // Constant vXi16 funnel shifts can be efficiently handled by default.
30033 if (IsCst && EltSizeInBits == 16)
30034 return SDValue();
30035
30036 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30037 unsigned NumElts = VT.getVectorNumElements();
30038 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30039 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30040
30041 // Split 256-bit integers on XOP/pre-AVX2 targets.
30042 // Split 512-bit integers on non 512-bit BWI targets.
30043 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30044 !Subtarget.hasAVX2())) ||
30045 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30046 EltSizeInBits < 32)) {
30047 // Pre-mask the amount modulo using the wider vector.
30048 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30049 return splitVectorOp(Op, DAG);
30050 }
30051
30052 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30053 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30054 int ScalarAmtIdx = -1;
30055 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30056 // Uniform vXi16 funnel shifts can be efficiently handled by default.
30057 if (EltSizeInBits == 16)
30058 return SDValue();
30059
30060 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30061 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30062 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30063 ScalarAmtIdx, Subtarget, DAG);
30064 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30065 ScalarAmtIdx, Subtarget, DAG);
30066 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30067 }
30068 }
30069
30070 MVT WideSVT = MVT::getIntegerVT(
30071 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30072 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30073
30074 // If per-element shifts are legal, fallback to generic expansion.
30075 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30076 return SDValue();
30077
30078 // Attempt to fold as:
30079 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30080 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30081 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30082 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30083 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30084 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30085 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30086 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30087 EltSizeInBits, DAG);
30088 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30089 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30090 if (!IsFSHR)
30091 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30092 EltSizeInBits, DAG);
30093 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30094 }
30095
30096 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30097 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30098 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30099 SDValue Z = DAG.getConstant(0, DL, VT);
30100 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30101 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30102 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30103 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30104 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30105 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30106 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30107 }
30108
30109 // Fallback to generic expansion.
30110 return SDValue();
30111 }
30112 assert(
30113 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30114 "Unexpected funnel shift type!");
30115
30116 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30117 bool OptForSize = DAG.shouldOptForSize();
30118 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30119
30120 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30121 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30122 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30123 !isa<ConstantSDNode>(Amt)) {
30124 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30125 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30126 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30127 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30128 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30129 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30130 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30131 if (IsFSHR) {
30132 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30133 } else {
30134 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30135 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30136 }
30137 return DAG.getZExtOrTrunc(Res, DL, VT);
30138 }
30139
30140 if (VT == MVT::i8 || ExpandFunnel)
30141 return SDValue();
30142
30143 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30144 if (VT == MVT::i16) {
30145 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30146 DAG.getConstant(15, DL, Amt.getValueType()));
30147 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30148 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30149 }
30150
30151 return Op;
30152}
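// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar i8 model of the widening folds used above:
//   fshl(x, y, z) -> (((x << 8) | y) << (z & 7)) >> 8
//   fshr(x, y, z) ->  (((x << 8) | y) >> (z & 7)) & 0xFF
// e.g. fshl(0x12, 0x34, 4): 0x1234 << 4 == 0x2340 in 16 bits, >> 8 == 0x23.
#include <cstdint>

static uint8_t fshl8(uint8_t x, uint8_t y, unsigned z) {
  uint16_t wide = uint16_t((uint16_t(x) << 8) | y);   // concat(x, y)
  return uint8_t(uint16_t(wide << (z & 7)) >> 8);     // shift left, keep high byte
}
static uint8_t fshr8(uint8_t x, uint8_t y, unsigned z) {
  uint16_t wide = uint16_t((uint16_t(x) << 8) | y);
  return uint8_t(wide >> (z & 7));                    // shift right, keep low byte
}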
30153
30154static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30155 SelectionDAG &DAG) {
30156 MVT VT = Op.getSimpleValueType();
30157 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30158
30159 SDLoc DL(Op);
30160 SDValue R = Op.getOperand(0);
30161 SDValue Amt = Op.getOperand(1);
30162 unsigned Opcode = Op.getOpcode();
30163 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30164 int NumElts = VT.getVectorNumElements();
30165 bool IsROTL = Opcode == ISD::ROTL;
30166
30167 // Check for constant splat rotation amount.
30168 APInt CstSplatValue;
30169 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30170
30171 // Check for splat rotate by zero.
30172 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30173 return R;
30174
30175 // AVX512 implicitly uses modulo rotation amounts.
30176 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
30177 // Attempt to rotate by immediate.
30178 if (IsCstSplat) {
30179 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30180 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30181 return DAG.getNode(RotOpc, DL, VT, R,
30182 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30183 }
30184
30185 // Else, fall-back on VPROLV/VPRORV.
30186 return Op;
30187 }
30188
30189 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30190 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30191 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30192 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30193 }
30194
30195 SDValue Z = DAG.getConstant(0, DL, VT);
30196
30197 if (!IsROTL) {
 30198 // If the ISD::ROTR amount is constant, we're always better off converting
 30199 // to ISD::ROTL.
30200 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30201 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30202
 30203 // XOP targets always prefer ISD::ROTL.
30204 if (Subtarget.hasXOP())
30205 return DAG.getNode(ISD::ROTL, DL, VT, R,
30206 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30207 }
30208
30209 // Split 256-bit integers on XOP/pre-AVX2 targets.
30210 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30211 return splitVectorIntBinary(Op, DAG);
30212
30213 // XOP has 128-bit vector variable + immediate rotates.
30214 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30215 // XOP implicitly uses modulo rotation amounts.
30216 if (Subtarget.hasXOP()) {
 30217 assert(IsROTL && "Only ROTL expected");
 30218 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30219
30220 // Attempt to rotate by immediate.
30221 if (IsCstSplat) {
30222 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30223 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30224 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30225 }
30226
30227 // Use general rotate by variable (per-element).
30228 return Op;
30229 }
30230
 30231 // Rotate by a uniform constant - expand back to shifts.
30232 if (IsCstSplat)
30233 return SDValue();
30234
30235 // Split 512-bit integers on non 512-bit BWI targets.
30236 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30237 return splitVectorIntBinary(Op, DAG);
30238
 30239 assert(
 30240 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
 30241 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
 30242 Subtarget.hasAVX2()) ||
 30243 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
 30244 "Only vXi32/vXi16/vXi8 vector rotates supported");
30245
30246 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30247 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30248
30249 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30250 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30251
30252 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30253 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30254 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
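 // Unpacking x with itself yields 2*bw-bit lanes holding (x << bw) | x, so a
 // single wide shift leaves rotl(x,y) in the upper half (or rotr(x,y) in the
 // lower half) of each lane, which getPack then extracts.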
30255 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30256 int BaseRotAmtIdx = -1;
30257 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30258 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30259 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30260 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30261 }
30262 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30263 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30264 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30265 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30266 BaseRotAmtIdx, Subtarget, DAG);
30267 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30268 BaseRotAmtIdx, Subtarget, DAG);
30269 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30270 }
30271 }
30272
30273 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30274 // the amount bit.
30275 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30276 if (EltSizeInBits == 8) {
30277 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30278 MVT WideVT =
30279 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30280 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30281
30282 // Attempt to fold as:
30283 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30284 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30285 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30286 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30287 // If we're rotating by constant, just use default promotion.
30288 if (IsConstAmt)
30289 return SDValue();
30290 // See if we can perform this by widening to vXi16 or vXi32.
30291 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30292 R = DAG.getNode(
30293 ISD::OR, DL, WideVT, R,
30294 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30295 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30296 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30297 if (IsROTL)
30298 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30299 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30300 }
30301
30302 // Attempt to fold as unpack(x,x) << zext(y):
30303 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30304 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30305 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30306 // See if we can perform this by unpacking to lo/hi vXi16.
30307 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30308 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30309 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30310 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30311 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30312 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30313 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30314 }
 30315 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
30316
30317 // We don't need ModuloAmt here as we just peek at individual bits.
30318 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30319 if (Subtarget.hasSSE41()) {
30320 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30321 // on the sign bit.
30322 V0 = DAG.getBitcast(VT, V0);
30323 V1 = DAG.getBitcast(VT, V1);
30324 Sel = DAG.getBitcast(VT, Sel);
30325 return DAG.getBitcast(SelVT,
30326 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30327 }
30328 // On pre-SSE41 targets we test for the sign bit by comparing to
30329 // zero - a negative value will set all bits of the lanes to true
30330 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30331 SDValue Z = DAG.getConstant(0, DL, SelVT);
30332 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30333 return DAG.getSelect(DL, SelVT, C, V0, V1);
30334 };
30335
30336 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30337 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30338 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30339 IsROTL = true;
30340 }
30341
30342 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30343 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30344
30345 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30346 // We can safely do this using i16 shifts as we're only interested in
30347 // the 3 lower bits of each byte.
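 // Shifting left by 5 moves bit 2 of each amount into the byte's sign bit; the
 // "a += a" steps below then expose bit 1 and bit 0 for the rot2/rot1 selects.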
30348 Amt = DAG.getBitcast(ExtVT, Amt);
30349 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30350 Amt = DAG.getBitcast(VT, Amt);
30351
30352 // r = VSELECT(r, rot(r, 4), a);
30353 SDValue M;
30354 M = DAG.getNode(
30355 ISD::OR, DL, VT,
30356 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30357 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30358 R = SignBitSelect(VT, Amt, M, R);
30359
30360 // a += a
30361 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30362
30363 // r = VSELECT(r, rot(r, 2), a);
30364 M = DAG.getNode(
30365 ISD::OR, DL, VT,
30366 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30367 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30368 R = SignBitSelect(VT, Amt, M, R);
30369
30370 // a += a
30371 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30372
30373 // return VSELECT(r, rot(r, 1), a);
30374 M = DAG.getNode(
30375 ISD::OR, DL, VT,
30376 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30377 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30378 return SignBitSelect(VT, Amt, M, R);
30379 }
30380
30381 bool IsSplatAmt = DAG.isSplatValue(Amt);
30382 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30383 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30384 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30385
30386 // Fallback for splats + all supported variable shifts.
30387 // Fallback for non-constants AVX2 vXi16 as well.
30388 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30389 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30390 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30391 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30392 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30393 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30394 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30395 }
30396
30397 // Everything below assumes ISD::ROTL.
30398 if (!IsROTL) {
30399 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30400 IsROTL = true;
30401 }
30402
30403 // ISD::ROT* uses modulo rotate amounts.
30404 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30405
 30406 assert(IsROTL && "Only ROTL supported");
30407
30408 // As with shifts, attempt to convert the rotation amount to a multiplication
30409 // factor, fallback to general expansion.
30410 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30411 if (!Scale)
30412 return SDValue();
30413
30414 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30415 if (EltSizeInBits == 16) {
30416 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30417 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30418 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30419 }
30420
30421 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30422 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30423 // that can then be OR'd with the lower 32-bits.
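 // For example, rotl(x,7) becomes the 64-bit product x * (1 << 7): the low
 // 32 bits hold x << 7 and the high 32 bits hold x >> 25, which OR together
 // into the rotated value.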
 30424 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30425 static const int OddMask[] = {1, -1, 3, -1};
30426 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30427 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30428
30429 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30430 DAG.getBitcast(MVT::v2i64, R),
30431 DAG.getBitcast(MVT::v2i64, Scale));
30432 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30433 DAG.getBitcast(MVT::v2i64, R13),
30434 DAG.getBitcast(MVT::v2i64, Scale13));
30435 Res02 = DAG.getBitcast(VT, Res02);
30436 Res13 = DAG.getBitcast(VT, Res13);
30437
30438 return DAG.getNode(ISD::OR, DL, VT,
30439 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30440 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30441}
30442
30443/// Returns true if the operand type is exactly twice the native width, and
30444/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30445/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30446/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30447bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30448 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30449
30450 if (OpWidth == 64)
30451 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30452 if (OpWidth == 128)
30453 return Subtarget.canUseCMPXCHG16B();
30454
30455 return false;
30456}
30457
30458TargetLoweringBase::AtomicExpansionKind
30459X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30460 Type *MemType = SI->getValueOperand()->getType();
30461
30462 bool NoImplicitFloatOps =
30463 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30464 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30465 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30466 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30467 return AtomicExpansionKind::None;
30468
30469 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30470 : AtomicExpansionKind::None;
30471}
30472
30473// Note: this turns large loads into lock cmpxchg8b/16b.
30474// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30475TargetLowering::AtomicExpansionKind
30476X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30477 Type *MemType = LI->getType();
30478
 30479 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30480 // can use movq to do the load. If we have X87 we can load into an 80-bit
30481 // X87 register and store it to a stack temporary.
30482 bool NoImplicitFloatOps =
30483 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30484 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30485 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30486 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30487 return AtomicExpansionKind::None;
30488
30489 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30490 : AtomicExpansionKind::None;
30491}
30492
30493TargetLowering::AtomicExpansionKind
30494X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30495 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30496 // prefix to a normal instruction for these operations.
30497 if (AI->use_empty())
30498 return AtomicExpansionKind::None;
30499
 30500 // If the atomicrmw's result is used by a single-bit AND, we may use a
 30501 // bts/btr/btc instruction for these operations.
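 // For example, an atomicrmw 'or' with 32 whose only use is an AND with 32 can
 // become a single LOCK BTS of bit 5 (see emitBitTestAtomicRMWIntrinsic below).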
30502 auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
30503 Instruction *I = AI->user_back();
30504 if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
30505 AI->getParent() != I->getParent())
30506 return AtomicExpansionKind::CmpXChg;
 30507 // The following instruction must be an AND with a single bit.
30508 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
30509 unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
30510 if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
30511 return AtomicExpansionKind::CmpXChg;
30512
30513 if (AI->getOperation() == AtomicRMWInst::And)
30514 return ~C1->getValue() == C2->getValue()
30515 ? AtomicExpansionKind::BitTestIntrinsic
30516 : AtomicExpansionKind::CmpXChg;
30517
30518 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30519 : AtomicExpansionKind::CmpXChg;
30520}
30521
30522void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30523 IRBuilder<> Builder(AI);
30524 Intrinsic::ID IID = Intrinsic::not_intrinsic;
30525 switch (AI->getOperation()) {
30526 default:
 30527 llvm_unreachable("Unknown atomic operation");
30528 case AtomicRMWInst::Or:
30529 IID = Intrinsic::x86_atomic_bts;
30530 break;
30531 case AtomicRMWInst::Xor:
30532 IID = Intrinsic::x86_atomic_btc;
30533 break;
30534 case AtomicRMWInst::And:
30535 IID = Intrinsic::x86_atomic_btr;
30536 break;
30537 }
30538 Instruction *I = AI->user_back();
30539 LLVMContext &Ctx = AI->getContext();
30540 unsigned Imm =
30541 countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
30542 Function *BitTest =
30543 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30544 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30545 Type::getInt8PtrTy(Ctx));
30546 Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30547 I->replaceAllUsesWith(Result);
30548 I->eraseFromParent();
30549 AI->eraseFromParent();
30550}
30551
30552TargetLowering::AtomicExpansionKind
30553X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30554 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30555 Type *MemType = AI->getType();
30556
30557 // If the operand is too big, we must see if cmpxchg8/16b is available
30558 // and default to library calls otherwise.
30559 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30560 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30561 : AtomicExpansionKind::None;
30562 }
30563
30564 AtomicRMWInst::BinOp Op = AI->getOperation();
30565 switch (Op) {
30566 default:
 30567 llvm_unreachable("Unknown atomic operation");
30568 case AtomicRMWInst::Xchg:
30569 case AtomicRMWInst::Add:
30570 case AtomicRMWInst::Sub:
30571 // It's better to use xadd, xsub or xchg for these in all cases.
30572 return AtomicExpansionKind::None;
30573 case AtomicRMWInst::Or:
30574 case AtomicRMWInst::And:
30575 case AtomicRMWInst::Xor:
30576 return shouldExpandLogicAtomicRMWInIR(AI);
30577 case AtomicRMWInst::Nand:
30578 case AtomicRMWInst::Max:
30579 case AtomicRMWInst::Min:
30580 case AtomicRMWInst::UMax:
30581 case AtomicRMWInst::UMin:
30582 case AtomicRMWInst::FAdd:
30583 case AtomicRMWInst::FSub:
30584 // These always require a non-trivial set of data operations on x86. We must
30585 // use a cmpxchg loop.
30586 return AtomicExpansionKind::CmpXChg;
30587 }
30588}
30589
30590LoadInst *
30591X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30592 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30593 Type *MemType = AI->getType();
30594 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30595 // there is no benefit in turning such RMWs into loads, and it is actually
30596 // harmful as it introduces a mfence.
30597 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30598 return nullptr;
30599
30600 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30601 // lowering available in lowerAtomicArith.
30602 // TODO: push more cases through this path.
30603 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30604 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30605 AI->use_empty())
30606 return nullptr;
30607
30608 IRBuilder<> Builder(AI);
30609 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30610 auto SSID = AI->getSyncScopeID();
30611 // We must restrict the ordering to avoid generating loads with Release or
30612 // ReleaseAcquire orderings.
30613 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30614
30615 // Before the load we need a fence. Here is an example lifted from
30616 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30617 // is required:
30618 // Thread 0:
30619 // x.store(1, relaxed);
30620 // r1 = y.fetch_add(0, release);
30621 // Thread 1:
30622 // y.fetch_add(42, acquire);
30623 // r2 = x.load(relaxed);
30624 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30625 // lowered to just a load without a fence. A mfence flushes the store buffer,
30626 // making the optimization clearly correct.
30627 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
30628 // otherwise, we might be able to be more aggressive on relaxed idempotent
30629 // rmw. In practice, they do not look useful, so we don't try to be
30630 // especially clever.
30631 if (SSID == SyncScope::SingleThread)
30632 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
30633 // the IR level, so we must wrap it in an intrinsic.
30634 return nullptr;
30635
30636 if (!Subtarget.hasMFence())
30637 // FIXME: it might make sense to use a locked operation here but on a
30638 // different cache-line to prevent cache-line bouncing. In practice it
30639 // is probably a small win, and x86 processors without mfence are rare
30640 // enough that we do not bother.
30641 return nullptr;
30642
30643 Function *MFence =
30644 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30645 Builder.CreateCall(MFence, {});
30646
30647 // Finally we can emit the atomic load.
30648 LoadInst *Loaded = Builder.CreateAlignedLoad(
30649 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30650 Loaded->setAtomic(Order, SSID);
30651 AI->replaceAllUsesWith(Loaded);
30652 AI->eraseFromParent();
30653 return Loaded;
30654}
30655
30656bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
30657 if (!SI.isUnordered())
30658 return false;
30659 return ExperimentalUnorderedISEL;
30660}
30661bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
30662 if (!LI.isUnordered())
30663 return false;
30664 return ExperimentalUnorderedISEL;
30665}
30666
30667
30668/// Emit a locked operation on a stack location which does not change any
30669/// memory location, but does involve a lock prefix. Location is chosen to be
30670/// a) very likely accessed only by a single thread to minimize cache traffic,
30671/// and b) definitely dereferenceable. Returns the new Chain result.
30672static SDValue emitLockedStackOp(SelectionDAG &DAG,
30673 const X86Subtarget &Subtarget, SDValue Chain,
30674 const SDLoc &DL) {
30675 // Implementation notes:
30676 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30677 // operations issued by the current processor. As such, the location
30678 // referenced is not relevant for the ordering properties of the instruction.
 30679 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30680 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30681 // 2) Using an immediate operand appears to be the best encoding choice
30682 // here since it doesn't require an extra register.
30683 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30684 // is small enough it might just be measurement noise.)
30685 // 4) When choosing offsets, there are several contributing factors:
30686 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30687 // line aligned stack object to improve this case.)
30688 // b) To minimize our chances of introducing a false dependence, we prefer
30689 // to offset the stack usage from TOS slightly.
30690 // c) To minimize concerns about cross thread stack usage - in particular,
30691 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30692 // captures state in the TOS frame and accesses it from many threads -
30693 // we want to use an offset such that the offset is in a distinct cache
30694 // line from the TOS frame.
30695 //
30696 // For a general discussion of the tradeoffs and benchmark results, see:
30697 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
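 // The node built below amounts to "lock orl $0, <disp>(%esp/%rsp)": a full
 // barrier that leaves memory unchanged and needs no scratch register.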
30698
30699 auto &MF = DAG.getMachineFunction();
30700 auto &TFL = *Subtarget.getFrameLowering();
30701 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30702
30703 if (Subtarget.is64Bit()) {
30704 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30705 SDValue Ops[] = {
30706 DAG.getRegister(X86::RSP, MVT::i64), // Base
30707 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30708 DAG.getRegister(0, MVT::i64), // Index
30709 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30710 DAG.getRegister(0, MVT::i16), // Segment.
30711 Zero,
30712 Chain};
30713 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30714 MVT::Other, Ops);
30715 return SDValue(Res, 1);
30716 }
30717
30718 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30719 SDValue Ops[] = {
30720 DAG.getRegister(X86::ESP, MVT::i32), // Base
30721 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30722 DAG.getRegister(0, MVT::i32), // Index
30723 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30724 DAG.getRegister(0, MVT::i16), // Segment.
30725 Zero,
30726 Chain
30727 };
30728 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30729 MVT::Other, Ops);
30730 return SDValue(Res, 1);
30731}
30732
30733static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30734 SelectionDAG &DAG) {
30735 SDLoc dl(Op);
30736 AtomicOrdering FenceOrdering =
30737 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30738 SyncScope::ID FenceSSID =
30739 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30740
30741 // The only fence that needs an instruction is a sequentially-consistent
30742 // cross-thread fence.
30743 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30744 FenceSSID == SyncScope::System) {
30745 if (Subtarget.hasMFence())
30746 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30747
30748 SDValue Chain = Op.getOperand(0);
30749 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30750 }
30751
30752 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30753 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30754}
30755
30756static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30757 SelectionDAG &DAG) {
30758 MVT T = Op.getSimpleValueType();
30759 SDLoc DL(Op);
30760 unsigned Reg = 0;
30761 unsigned size = 0;
30762 switch(T.SimpleTy) {
 30763 default: llvm_unreachable("Invalid value type!");
30764 case MVT::i8: Reg = X86::AL; size = 1; break;
30765 case MVT::i16: Reg = X86::AX; size = 2; break;
30766 case MVT::i32: Reg = X86::EAX; size = 4; break;
30767 case MVT::i64:
 30768 assert(Subtarget.is64Bit() && "Node not type legal!");
30769 Reg = X86::RAX; size = 8;
30770 break;
30771 }
30772 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30773 Op.getOperand(2), SDValue());
30774 SDValue Ops[] = { cpIn.getValue(0),
30775 Op.getOperand(1),
30776 Op.getOperand(3),
30777 DAG.getTargetConstant(size, DL, MVT::i8),
30778 cpIn.getValue(1) };
30779 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30780 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30781 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30782 Ops, T, MMO);
30783
30784 SDValue cpOut =
30785 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30786 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30787 MVT::i32, cpOut.getValue(2));
30788 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30789
30790 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30791 cpOut, Success, EFLAGS.getValue(1));
30792}
30793
30794// Create MOVMSKB, taking into account whether we need to split for AVX1.
30795static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30796 const X86Subtarget &Subtarget) {
30797 MVT InVT = V.getSimpleValueType();
30798
30799 if (InVT == MVT::v64i8) {
30800 SDValue Lo, Hi;
30801 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30802 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30803 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30804 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30805 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30806 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30807 DAG.getConstant(32, DL, MVT::i8));
30808 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30809 }
30810 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30811 SDValue Lo, Hi;
30812 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30813 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30814 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30815 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30816 DAG.getConstant(16, DL, MVT::i8));
30817 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30818 }
30819
30820 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30821}
30822
30823static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
30824 SelectionDAG &DAG) {
30825 SDValue Src = Op.getOperand(0);
30826 MVT SrcVT = Src.getSimpleValueType();
30827 MVT DstVT = Op.getSimpleValueType();
30828
30829 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
30830 // half to v32i1 and concatenating the result.
30831 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
 30832 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
 30833 assert(Subtarget.hasBWI() && "Expected BWI target");
30834 SDLoc dl(Op);
30835 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30836 DAG.getIntPtrConstant(0, dl));
30837 Lo = DAG.getBitcast(MVT::v32i1, Lo);
30838 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30839 DAG.getIntPtrConstant(1, dl));
30840 Hi = DAG.getBitcast(MVT::v32i1, Hi);
30841 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
30842 }
30843
30844 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
30845 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
 30846 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
30847 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
30848 SDLoc DL(Op);
30849 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
30850 V = getPMOVMSKB(DL, V, DAG, Subtarget);
30851 return DAG.getZExtOrTrunc(V, DL, DstVT);
30852 }
30853
 30854 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
 30855 SrcVT == MVT::i64) && "Unexpected VT!");
30856
 30857 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30858 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30859 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30860 // This conversion needs to be expanded.
30861 return SDValue();
30862
30863 SDLoc dl(Op);
30864 if (SrcVT.isVector()) {
 30865 // Widen the input vector in the case of MVT::v2i32.
30866 // Example: from MVT::v2i32 to MVT::v4i32.
30867 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30868 SrcVT.getVectorNumElements() * 2);
30869 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30870 DAG.getUNDEF(SrcVT));
30871 } else {
 30872 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
 30873 "Unexpected source type in LowerBITCAST");
30874 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30875 }
30876
30877 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30878 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30879
30880 if (DstVT == MVT::x86mmx)
30881 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30882
30883 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30884 DAG.getIntPtrConstant(0, dl));
30885}
30886
30887/// Compute the horizontal sum of bytes in V for the elements of VT.
30888///
30889/// Requires V to be a byte vector and VT to be an integer vector type with
30890/// wider elements than V's type. The width of the elements of VT determines
30891/// how many bytes of V are summed horizontally to produce each element of the
30892/// result.
30893static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30894 const X86Subtarget &Subtarget,
30895 SelectionDAG &DAG) {
30896 SDLoc DL(V);
30897 MVT ByteVecVT = V.getSimpleValueType();
30898 MVT EltVT = VT.getVectorElementType();
 30899 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
 30900 "Expected value to have byte element type.");
 30901 assert(EltVT != MVT::i8 &&
 30902 "Horizontal byte sum only makes sense for wider elements!");
30903 unsigned VecSize = VT.getSizeInBits();
 30904 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30905
 30906 // The PSADBW instruction horizontally adds all bytes and leaves the result in
 30907 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
30908 if (EltVT == MVT::i64) {
30909 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30910 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30911 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30912 return DAG.getBitcast(VT, V);
30913 }
30914
30915 if (EltVT == MVT::i32) {
30916 // We unpack the low half and high half into i32s interleaved with zeros so
30917 // that we can use PSADBW to horizontally sum them. The most useful part of
30918 // this is that it lines up the results of two PSADBW instructions to be
30919 // two v2i64 vectors which concatenated are the 4 population counts. We can
30920 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
30921 SDValue Zeros = DAG.getConstant(0, DL, VT);
30922 SDValue V32 = DAG.getBitcast(VT, V);
30923 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30924 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30925
30926 // Do the horizontal sums into two v2i64s.
30927 Zeros = DAG.getConstant(0, DL, ByteVecVT);
30928 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30929 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30930 DAG.getBitcast(ByteVecVT, Low), Zeros);
30931 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30932 DAG.getBitcast(ByteVecVT, High), Zeros);
30933
30934 // Merge them together.
30935 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30936 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30937 DAG.getBitcast(ShortVecVT, Low),
30938 DAG.getBitcast(ShortVecVT, High));
30939
30940 return DAG.getBitcast(VT, V);
30941 }
30942
30943 // The only element type left is i16.
30944 assert(EltVT == MVT::i16 && "Unknown how to handle type")(static_cast <bool> (EltVT == MVT::i16 && "Unknown how to handle type"
) ? void (0) : __assert_fail ("EltVT == MVT::i16 && \"Unknown how to handle type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 30944, __extension__
__PRETTY_FUNCTION__))
;
30945
30946 // To obtain pop count for each i16 element starting from the pop count for
30947 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
30948 // right by 8. It is important to shift as i16s as i8 vector shift isn't
30949 // directly supported.
30950 SDValue ShifterV = DAG.getConstant(8, DL, VT);
30951 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30952 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30953 DAG.getBitcast(ByteVecVT, V));
30954 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30955}
30956
30957static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30958 const X86Subtarget &Subtarget,
30959 SelectionDAG &DAG) {
30960 MVT VT = Op.getSimpleValueType();
30961 MVT EltVT = VT.getVectorElementType();
30962 int NumElts = VT.getVectorNumElements();
30963 (void)EltVT;
 30964 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30965
30966 // Implement a lookup table in register by using an algorithm based on:
30967 // http://wm.ite.pl/articles/sse-popcount.html
30968 //
 30969 // The general idea is that the lower nibble of every byte in the input
 30970 // vector is an index into an in-register pre-computed pop count table. We
 30971 // then split the input vector into two new ones: (1) a vector with only the
 30972 // shifted-right higher nibbles for each byte and (2) a vector with the lower
 30973 // nibbles (and masked-out higher ones) for each byte. PSHUFB is used
 30974 // separately with both to index the in-register table. Next, both are added
 30975 // and the result is an i8 vector where each element holds its byte's pop count.
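 // For example, the byte 0xE5 (0b1110'0101) splits into high nibble 0xE and
 // low nibble 0x5; LUT[0xE] = 3 and LUT[0x5] = 2, which sum to the expected
 // pop count of 5.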
30976 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30977 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30978 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30979 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30980
30981 SmallVector<SDValue, 64> LUTVec;
30982 for (int i = 0; i < NumElts; ++i)
30983 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30984 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30985 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30986
30987 // High nibbles
30988 SDValue FourV = DAG.getConstant(4, DL, VT);
30989 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30990
30991 // Low nibbles
30992 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30993
30994 // The input vector is used as the shuffle mask that index elements into the
30995 // LUT. After counting low and high nibbles, add the vector to obtain the
30996 // final pop count per i8 element.
30997 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30998 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30999 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31000}
31001
31002// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31003// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31004static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
31005 SelectionDAG &DAG) {
31006 MVT VT = Op.getSimpleValueType();
 31007 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
 31008 "Unknown CTPOP type to handle");
31009 SDLoc DL(Op.getNode());
31010 SDValue Op0 = Op.getOperand(0);
31011
31012 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31013 if (Subtarget.hasVPOPCNTDQ()) {
31014 unsigned NumElems = VT.getVectorNumElements();
 31015 assert((VT.getVectorElementType() == MVT::i8 ||
 31016 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31017 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31018 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31019 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31020 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31021 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31022 }
31023 }
31024
31025 // Decompose 256-bit ops into smaller 128-bit ops.
31026 if (VT.is256BitVector() && !Subtarget.hasInt256())
31027 return splitVectorIntUnary(Op, DAG);
31028
31029 // Decompose 512-bit ops into smaller 256-bit ops.
31030 if (VT.is512BitVector() && !Subtarget.hasBWI())
31031 return splitVectorIntUnary(Op, DAG);
31032
31033 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31034 if (VT.getScalarType() != MVT::i8) {
31035 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31036 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31037 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31038 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31039 }
31040
31041 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31042 if (!Subtarget.hasSSSE3())
31043 return SDValue();
31044
31045 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31046}
31047
31048static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
31049 SelectionDAG &DAG) {
 31050 assert(Op.getSimpleValueType().isVector() &&
 31051 "We only do custom lowering for vector population count.");
31052 return LowerVectorCTPOP(Op, Subtarget, DAG);
31053}
31054
31055static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31056 MVT VT = Op.getSimpleValueType();
31057 SDValue In = Op.getOperand(0);
31058 SDLoc DL(Op);
31059
 31060 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31061 // perform the BITREVERSE.
31062 if (!VT.isVector()) {
31063 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31064 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31065 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31066 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31067 DAG.getIntPtrConstant(0, DL));
31068 }
31069
31070 int NumElts = VT.getVectorNumElements();
31071 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31072
31073 // Decompose 256-bit ops into smaller 128-bit ops.
31074 if (VT.is256BitVector())
31075 return splitVectorIntUnary(Op, DAG);
31076
 31077 assert(VT.is128BitVector() &&
 31078 "Only 128-bit vector bitreverse lowering supported.");
31079
31080 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31081 // perform the BSWAP in the shuffle.
 31082 // It's best to shuffle using the second operand as this will implicitly allow
31083 // memory folding for multiple vectors.
31084 SmallVector<SDValue, 16> MaskElts;
31085 for (int i = 0; i != NumElts; ++i) {
31086 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31087 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31088 int PermuteByte = SourceByte | (2 << 5);
31089 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31090 }
31091 }
31092
31093 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31094 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31095 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31096 Res, Mask);
31097 return DAG.getBitcast(VT, Res);
31098}
31099
31100static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31101 SelectionDAG &DAG) {
31102 MVT VT = Op.getSimpleValueType();
31103
31104 if (Subtarget.hasXOP() && !VT.is512BitVector())
31105 return LowerBITREVERSE_XOP(Op, DAG);
31106
 31107 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31108
31109 SDValue In = Op.getOperand(0);
31110 SDLoc DL(Op);
31111
 31112 assert(VT.getScalarType() == MVT::i8 &&
 31113 "Only byte vector BITREVERSE supported");
31114
31115 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
31116 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
31117 return splitVectorIntUnary(Op, DAG);
31118
31119 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31120 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
31121 return splitVectorIntUnary(Op, DAG);
31122
31123 unsigned NumElts = VT.getVectorNumElements();
31124
31125 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31126 if (Subtarget.hasGFNI()) {
31127 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31128 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31129 Matrix = DAG.getBitcast(VT, Matrix);
31130 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31131 DAG.getTargetConstant(0, DL, MVT::i8));
31132 }
31133
31134 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31135 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31136 // 0-15 value (moved to the other nibble).
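 // For example, for the byte 0x1F: LoLUT[0xF] = 0xF0 reverses the low nibble
 // into the high position and HiLUT[0x1] = 0x08 reverses the high nibble into
 // the low position, so the OR yields the bit-reverse 0xF8.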
31137 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31138 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31139 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31140
31141 const int LoLUT[16] = {
31142 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31143 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31144 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31145 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31146 const int HiLUT[16] = {
31147 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31148 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31149 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31150 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31151
31152 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31153 for (unsigned i = 0; i < NumElts; ++i) {
31154 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31155 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31156 }
31157
31158 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31159 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31160 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31161 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31162 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31163}
31164
31165static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31166 SelectionDAG &DAG) {
31167 SDLoc DL(Op);
31168 SDValue X = Op.getOperand(0);
31169 MVT VT = Op.getSimpleValueType();
31170
31171 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31172 if (VT == MVT::i8 ||
31173 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31174 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31175 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31176 DAG.getConstant(0, DL, MVT::i8));
31177 // Copy the inverse of the parity flag into a register with setcc.
31178 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31179 // Extend to the original type.
31180 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31181 }
31182
31183 // If we have POPCNT, use the default expansion.
31184 if (Subtarget.hasPOPCNT())
31185 return SDValue();
31186
31187 if (VT == MVT::i64) {
31188 // Xor the high and low 32-bit halves together using a 32-bit operation.
31189 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31190 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31191 DAG.getConstant(32, DL, MVT::i8)));
31192 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31193 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31194 }
31195
31196 if (VT != MVT::i16) {
31197 // Xor the high and low 16-bits together using a 32-bit operation.
31198 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31199 DAG.getConstant(16, DL, MVT::i8));
31200 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31201 } else {
31202 // If the input is 16-bits, we need to extend to use an i32 shift below.
31203 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31204 }
31205
31206 // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
31207 // This should allow an h-reg to be used to save a shift.
31208 SDValue Hi = DAG.getNode(
31209 ISD::TRUNCATE, DL, MVT::i8,
31210 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31211 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31212 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31213 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31214
31215 // Copy the inverse of the parity flag into a register with setcc.
31216 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31217 // Extend to the original type.
31218 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31219}
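// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the PARITY
// lowering above folds the value in half repeatedly with XOR until a single
// byte remains, then reads the x86 parity flag of that byte (SETNP yields the
// XOR of its bits). __builtin_popcountll assumes a GCC/Clang-style compiler.
#include <cassert>
#include <cstdint>

static unsigned parityByFolding(uint64_t X) {
  uint32_t V = uint32_t(X) ^ uint32_t(X >> 32); // fold 64 bits down to 32
  V ^= V >> 16;                                 // fold 32 bits down to 16
  uint8_t B = uint8_t(V) ^ uint8_t(V >> 8);     // fold 16 bits down to 8
  return unsigned(__builtin_popcount(B) & 1);   // parity of the final byte
}

int main() {
  const uint64_t Tests[] = {0, 1, 3, 0x8000000000000001ULL, 0x123456789ABCDEF0ULL};
  for (uint64_t X : Tests)
    assert(parityByFolding(X) == unsigned(__builtin_popcountll(X) & 1));
  return 0;
}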
31220
31221static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31222 const X86Subtarget &Subtarget) {
31223 unsigned NewOpc = 0;
31224 switch (N->getOpcode()) {
31225 case ISD::ATOMIC_LOAD_ADD:
31226 NewOpc = X86ISD::LADD;
31227 break;
31228 case ISD::ATOMIC_LOAD_SUB:
31229 NewOpc = X86ISD::LSUB;
31230 break;
31231 case ISD::ATOMIC_LOAD_OR:
31232 NewOpc = X86ISD::LOR;
31233 break;
31234 case ISD::ATOMIC_LOAD_XOR:
31235 NewOpc = X86ISD::LXOR;
31236 break;
31237 case ISD::ATOMIC_LOAD_AND:
31238 NewOpc = X86ISD::LAND;
31239 break;
31240 default:
31241 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31242 }
31243
31244 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31245
31246 return DAG.getMemIntrinsicNode(
31247 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31248 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31249 /*MemVT=*/N->getSimpleValueType(0), MMO);
31250}
31251
31252/// Lower atomic_load_ops into LOCK-prefixed operations.
31253static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31254 const X86Subtarget &Subtarget) {
31255 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31256 SDValue Chain = N->getOperand(0);
31257 SDValue LHS = N->getOperand(1);
31258 SDValue RHS = N->getOperand(2);
31259 unsigned Opc = N->getOpcode();
31260 MVT VT = N->getSimpleValueType(0);
31261 SDLoc DL(N);
31262
31263 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31264 // can only be lowered when the result is unused. They should have already
31265 // been transformed into a cmpxchg loop in AtomicExpand.
31266 if (N->hasAnyUseOfValue(0)) {
31267 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31268 // select LXADD if LOCK_SUB can't be selected.
31269 if (Opc == ISD::ATOMIC_LOAD_SUB) {
31270 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
31271 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31272 RHS, AN->getMemOperand());
31273 }
31274 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31275        "Used AtomicRMW ops other than Add should have been expanded!");
31276 return N;
31277 }
31278
31279 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31280 // The core idea here is that since the memory location isn't actually
31281 // changing, all we need is a lowering for the *ordering* impacts of the
31282 // atomicrmw. As such, we can choose a different operation and memory
31283 // location to minimize impact on other code.
31284 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
31285 // On X86, the only ordering which actually requires an instruction is
31286 // seq_cst that isn't SingleThread; everything else just needs to be
31287 // preserved during codegen and then dropped. Note that we expect (but
31288 // don't assume) that orderings other than seq_cst and acq_rel have been
31289 // canonicalized to a store or load.
31290 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31291 AN->getSyncScopeID() == SyncScope::System) {
31292 // Prefer a locked operation against a stack location to minimize cache
31293 // traffic. This assumes that stack locations are very likely to be
31294 // accessed only by the owning thread.
31295 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31296 assert(!N->hasAnyUseOfValue(0));
31297 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31298 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31299 DAG.getUNDEF(VT), NewChain);
31300 }
31301 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31302 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
31303 assert(!N->hasAnyUseOfValue(0));
31304 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31305 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31306 DAG.getUNDEF(VT), NewChain);
31307 }
31308
31309 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31310 // RAUW the chain, but don't worry about the result, as it's unused.
31311 assert(!N->hasAnyUseOfValue(0));
31312 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31313 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31314 DAG.getUNDEF(VT), LockOp.getValue(1));
31315}
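// Illustrative scalar sketch (not part of X86ISelLowering.cpp): two identities
// the lowering above relies on. First, (atomic_load_sub p, v) equals
// (atomic_load_add p, -v) under two's-complement wraparound, which lets the
// backend fall back to LXADD. Second, an idempotent RMW such as fetch_or(0)
// never changes memory, so only its ordering effect has to be preserved.
#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  std::atomic<uint32_t> A{100}, B{100};
  const uint32_t V = 7;
  uint32_t OldA = A.fetch_sub(V);
  uint32_t OldB = B.fetch_add(0u - V); // add the two's-complement negation
  assert(OldA == OldB && A.load() == B.load());

  std::atomic<uint32_t> C{42};
  (void)C.fetch_or(0, std::memory_order_seq_cst); // ordering only, value unchanged
  assert(C.load() == 42);
  return 0;
}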
31316
31317static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31318 const X86Subtarget &Subtarget) {
31319 auto *Node = cast<AtomicSDNode>(Op.getNode());
31320 SDLoc dl(Node);
31321 EVT VT = Node->getMemoryVT();
31322
31323 bool IsSeqCst =
31324 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31325 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31326
31327 // If this store is not sequentially consistent and the type is legal
31328 // we can just keep it.
31329 if (!IsSeqCst && IsTypeLegal)
31330 return Op;
31331
31332 if (VT == MVT::i64 && !IsTypeLegal) {
31333 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31334 // is enabled.
31335 bool NoImplicitFloatOps =
31336 DAG.getMachineFunction().getFunction().hasFnAttribute(
31337 Attribute::NoImplicitFloat);
31338 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31339 SDValue Chain;
31340 if (Subtarget.hasSSE1()) {
31341 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
31342 Node->getOperand(2));
31343 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31344 SclToVec = DAG.getBitcast(StVT, SclToVec);
31345 SDVTList Tys = DAG.getVTList(MVT::Other);
31346 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31347 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31348 MVT::i64, Node->getMemOperand());
31349 } else if (Subtarget.hasX87()) {
31350 // First load this into an 80-bit X87 register using a stack temporary.
31351 // This will put the whole integer into the significand.
31352 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31353 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31354 MachinePointerInfo MPI =
31355 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31356 Chain =
31357 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
31358 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31359 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31360 SDValue LdOps[] = {Chain, StackPtr};
31361 SDValue Value =
31362 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31363 /*Align*/ None, MachineMemOperand::MOLoad);
31364 Chain = Value.getValue(1);
31365
31366 // Now use an FIST to do the atomic store.
31367 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31368 Chain =
31369 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31370 StoreOps, MVT::i64, Node->getMemOperand());
31371 }
31372
31373 if (Chain) {
31374 // If this is a sequentially consistent store, also emit an appropriate
31375 // barrier.
31376 if (IsSeqCst)
31377 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31378
31379 return Chain;
31380 }
31381 }
31382 }
31383
31384 // Convert seq_cst store -> xchg
31385 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31386 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31387 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
31388 Node->getMemoryVT(),
31389 Node->getOperand(0),
31390 Node->getOperand(1), Node->getOperand(2),
31391 Node->getMemOperand());
31392 return Swap.getValue(1);
31393}
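// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the fallback
// above turns a seq_cst (or type-illegal) atomic store into an atomic swap
// whose result is ignored; at the C++ level that corresponds to an exchange
// used purely for its store side effect.
#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  std::atomic<uint64_t> Location{0};
  (void)Location.exchange(42, std::memory_order_seq_cst); // store via swap
  assert(Location.load() == 42);
  return 0;
}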
31394
31395static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
31396 SDNode *N = Op.getNode();
31397 MVT VT = N->getSimpleValueType(0);
31398 unsigned Opc = Op.getOpcode();
31399
31400 // Let legalize expand this if it isn't a legal type yet.
31401 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31402 return SDValue();
31403
31404 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31405 SDLoc DL(N);
31406
31407 // Set the carry flag.
31408 SDValue Carry = Op.getOperand(2);
31409 EVT CarryVT = Carry.getValueType();
31410 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31411 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31412
31413 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
31414 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31415 Op.getOperand(0), Op.getOperand(1),
31416 Carry.getValue(1));
31417
31418 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31419 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31420 Sum.getValue(1), DL, DAG);
31421 if (N->getValueType(1) == MVT::i1)
31422 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31423
31424 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31425}
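// Illustrative scalar sketch (not part of X86ISelLowering.cpp): ADDCARRY/ADC
// semantics as modeled by the lowering above -- the sum takes a carry-in and
// produces a carry-out on unsigned overflow (SBB is the subtraction analogue).
// The helper name is hypothetical.
#include <cassert>
#include <cstdint>

static uint32_t adc32(uint32_t A, uint32_t B, bool CarryIn, bool &CarryOut) {
  uint64_t Wide = uint64_t(A) + uint64_t(B) + (CarryIn ? 1 : 0);
  CarryOut = (Wide >> 32) != 0; // unsigned overflow sets the carry flag
  return uint32_t(Wide);
}

int main() {
  bool Carry = false;
  assert(adc32(0xFFFFFFFFu, 0u, /*CarryIn=*/true, Carry) == 0u && Carry);
  assert(adc32(1u, 2u, /*CarryIn=*/false, Carry) == 3u && !Carry);
  return 0;
}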
31426
31427static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31428 SelectionDAG &DAG) {
31429 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31430
31431 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31432 // which returns the values as { float, float } (in XMM0) or
31433 // { double, double } (which is returned in XMM0, XMM1).
31434 SDLoc dl(Op);
31435 SDValue Arg = Op.getOperand(0);
31436 EVT ArgVT = Arg.getValueType();
31437 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31438
31439 TargetLowering::ArgListTy Args;
31440 TargetLowering::ArgListEntry Entry;
31441
31442 Entry.Node = Arg;
31443 Entry.Ty = ArgTy;
31444 Entry.IsSExt = false;
31445 Entry.IsZExt = false;
31446 Args.push_back(Entry);
31447
31448 bool isF64 = ArgVT == MVT::f64;
31449 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31450 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31451 // the results are returned via SRet in memory.
31452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31453 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31454 const char *LibcallName = TLI.getLibcallName(LC);
31455 SDValue Callee =
31456 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31457
31458 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31459 : (Type *)FixedVectorType::get(ArgTy, 4);
31460
31461 TargetLowering::CallLoweringInfo CLI(DAG);
31462 CLI.setDebugLoc(dl)
31463 .setChain(DAG.getEntryNode())
31464 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31465
31466 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31467
31468 if (isF64)
31469 // Returned in xmm0 and xmm1.
31470 return CallResult.first;
31471
31472 // Returned in bits 0:31 and 32:64 xmm0.
31473 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31474 CallResult.first, DAG.getIntPtrConstant(0, dl));
31475 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31476 CallResult.first, DAG.getIntPtrConstant(1, dl));
31477 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31478 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31479}
31480
31481/// Widen a vector input to a vector of NVT. The
31482/// input vector must have the same element type as NVT.
31483static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31484 bool FillWithZeroes = false) {
31485 // Check if InOp already has the right width.
31486 MVT InVT = InOp.getSimpleValueType();
31487 if (InVT == NVT)
31488 return InOp;
31489
31490 if (InOp.isUndef())
31491 return DAG.getUNDEF(NVT);
31492
31493 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31494        "input and widen element type must match");
31495
31496 unsigned InNumElts = InVT.getVectorNumElements();
31497 unsigned WidenNumElts = NVT.getVectorNumElements();
31498 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31499        "Unexpected request for vector widening");
31500
31501 SDLoc dl(InOp);
31502 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31503 InOp.getNumOperands() == 2) {
31504 SDValue N1 = InOp.getOperand(1);
31505 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31506 N1.isUndef()) {
31507 InOp = InOp.getOperand(0);
31508 InVT = InOp.getSimpleValueType();
31509 InNumElts = InVT.getVectorNumElements();
31510 }
31511 }
31512 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31513 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31514 SmallVector<SDValue, 16> Ops;
31515 for (unsigned i = 0; i < InNumElts; ++i)
31516 Ops.push_back(InOp.getOperand(i));
31517
31518 EVT EltVT = InOp.getOperand(0).getValueType();
31519
31520 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31521 DAG.getUNDEF(EltVT);
31522 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31523 Ops.push_back(FillVal);
31524 return DAG.getBuildVector(NVT, dl, Ops);
31525 }
31526 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31527 DAG.getUNDEF(NVT);
31528 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31529 InOp, DAG.getIntPtrConstant(0, dl));
31530}
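// Illustrative scalar sketch (not part of X86ISelLowering.cpp): ExtendToType
// widens a vector to a larger element count, filling the tail with zeroes when
// FillWithZeroes is set (otherwise the tail is undef in the real lowering).
// The helper name is hypothetical.
#include <cassert>
#include <cstddef>
#include <vector>

static std::vector<int> extendToWidth(std::vector<int> In, std::size_t WidenNumElts,
                                      bool FillWithZeroes) {
  assert(WidenNumElts >= In.size() && WidenNumElts % In.size() == 0);
  // In the real lowering the tail is undef unless FillWithZeroes is set; this
  // sketch always uses zero so the result stays deterministic.
  (void)FillWithZeroes;
  In.resize(WidenNumElts, 0);
  return In;
}

int main() {
  std::vector<int> Wide = extendToWidth({1, 2, 3, 4}, 16, /*FillWithZeroes=*/true);
  assert(Wide.size() == 16 && Wide[3] == 4 && Wide[15] == 0);
  return 0;
}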
31531
31532static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31533 SelectionDAG &DAG) {
31534 assert(Subtarget.hasAVX512() &&
31535        "MGATHER/MSCATTER are supported on AVX-512 arch only");
31536
31537 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31538 SDValue Src = N->getValue();
31539 MVT VT = Src.getSimpleValueType();
31540 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31541 SDLoc dl(Op);
31542
31543 SDValue Scale = N->getScale();
31544 SDValue Index = N->getIndex();
31545 SDValue Mask = N->getMask();
31546 SDValue Chain = N->getChain();
31547 SDValue BasePtr = N->getBasePtr();
31548
31549 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31550 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31551 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31552 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31554 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31555 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31556 SDVTList VTs = DAG.getVTList(MVT::Other);
31557 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31558 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31559 N->getMemoryVT(), N->getMemOperand());
31560 }
31561 return SDValue();
31562 }
31563
31564 MVT IndexVT = Index.getSimpleValueType();
31565
31566 // If the index is v2i32, we're being called by type legalization and we
31567 // should just let the default handling take care of it.
31568 if (IndexVT == MVT::v2i32)
31569 return SDValue();
31570
31571 // If we don't have VLX and neither the passthru nor the index is 512 bits,
31572 // we need to widen until one is.
31573 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31574 !Index.getSimpleValueType().is512BitVector()) {
31575 // Determine how much we need to widen by to get a 512-bit type.
31576 unsigned Factor = std::min(512/VT.getSizeInBits(),
31577 512/IndexVT.getSizeInBits());
31578 unsigned NumElts = VT.getVectorNumElements() * Factor;
31579
31580 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31581 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31582 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31583
31584 Src = ExtendToType(Src, VT, DAG);
31585 Index = ExtendToType(Index, IndexVT, DAG);
31586 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31587 }
31588
31589 SDVTList VTs = DAG.getVTList(MVT::Other);
31590 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31591 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31592 N->getMemoryVT(), N->getMemOperand());
31593}
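// Illustrative scalar sketch (not part of X86ISelLowering.cpp): without VLX the
// scatter is widened so that the data or the index vector reaches 512 bits.
// The factor is min(512 / data-bits, 512 / index-bits); the element counts and
// types below (v4i32 data with a v4i64 index) are just an example.
#include <algorithm>
#include <cassert>

int main() {
  unsigned DataBits = 128, IndexBits = 256, NumElts = 4;       // v4i32 data, v4i64 index
  unsigned Factor = std::min(512 / DataBits, 512 / IndexBits); // min(4, 2) == 2
  unsigned WideNumElts = NumElts * Factor;                     // v8i32 data, v8i64 (512-bit) index
  assert(Factor == 2 && WideNumElts == 8);
  return 0;
}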
31594
31595static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31596 SelectionDAG &DAG) {
31597
31598 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31599 MVT VT = Op.getSimpleValueType();
31600 MVT ScalarVT = VT.getScalarType();
31601 SDValue Mask = N->getMask();
31602 MVT MaskVT = Mask.getSimpleValueType();
31603 SDValue PassThru = N->getPassThru();
31604 SDLoc dl(Op);
31605
31606 // Handle AVX masked loads which don't support passthru other than 0.
31607 if (MaskVT.getVectorElementType() != MVT::i1) {
31608 // We also allow undef in the isel pattern.
31609 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31610 return Op;
31611
31612 SDValue NewLoad = DAG.getMaskedLoad(
31613 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31614 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31615 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31616 N->isExpandingLoad());
31617 // Emit a blend.
31618 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31619 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31620 }
31621
31622 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31623        "Expanding masked load is supported on AVX-512 target only!");
31624
31625 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31626        "Expanding masked load is supported for 32 and 64-bit types only!");
31627
31628 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31629        "Cannot lower masked load op.");
31630
31631 assert((ScalarVT.getSizeInBits() >= 32 ||
31632         (Subtarget.hasBWI() &&
31633          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31634        "Unsupported masked load op.");
31635
31636 // This operation is legal for targets with VLX, but without
31637 // VLX the vector should be widened to 512 bits.
31638 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31639 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31640 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31641
31642 // Mask element has to be i1.
31643 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31644        "Unexpected mask type");
31645
31646 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31647
31648 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31649 SDValue NewLoad = DAG.getMaskedLoad(
31650 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31651 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31652 N->getExtensionType(), N->isExpandingLoad());
31653
31654 SDValue Extract =
31655 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31656 DAG.getIntPtrConstant(0, dl));
31657 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31658 return DAG.getMergeValues(RetOps, dl);
31659}
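// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the blend
// emitted above for AVX (non-i1-mask) masked loads computes, per lane,
// mask ? loaded-value : passthru. The AVX hardware load itself only supports
// a zero passthru, which is why the explicit VSELECT is needed.
#include <array>
#include <cassert>
#include <cstddef>

int main() {
  std::array<int, 4> Loaded = {10, 20, 30, 40}; // lanes produced by the masked load
  std::array<int, 4> PassThru = {1, 2, 3, 4};
  std::array<bool, 4> Mask = {true, false, true, false};
  std::array<int, 4> Result{};
  for (std::size_t I = 0; I < Result.size(); ++I)
    Result[I] = Mask[I] ? Loaded[I] : PassThru[I]; // the VSELECT blend
  assert(Result[0] == 10 && Result[1] == 2 && Result[2] == 30 && Result[3] == 4);
  return 0;
}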
31660
31661static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31662 SelectionDAG &DAG) {
31663 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31664 SDValue DataToStore = N->getValue();
31665 MVT VT = DataToStore.getSimpleValueType();
31666 MVT ScalarVT = VT.getScalarType();
31667 SDValue Mask = N->getMask();
31668 SDLoc dl(Op);
31669
31670 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31671        "Expanding masked load is supported on AVX-512 target only!");
31672
31673 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31674        "Expanding masked load is supported for 32 and 64-bit types only!");
31675
31676 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31677        "Cannot lower masked store op.");
31678
31679 assert((ScalarVT.getSizeInBits() >= 32 ||
31680         (Subtarget.hasBWI() &&
31681          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31682        "Unsupported masked store op.");
31683
31684 // This operation is legal for targets with VLX, but without
31685 // VLX the vector should be widened to 512 bits.
31686 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31687 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31688
31689 // Mask element has to be i1.
31690 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31691        "Unexpected mask type");
31692
31693 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31694
31695 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31696 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31697 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31698 N->getOffset(), Mask, N->getMemoryVT(),
31699 N->getMemOperand(), N->getAddressingMode(),
31700 N->isTruncatingStore(), N->isCompressingStore());
31701}
31702
31703static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31704 SelectionDAG &DAG) {
31705 assert(Subtarget.hasAVX2() &&
31706        "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31707
31708 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31709 SDLoc dl(Op);
31710 MVT VT = Op.getSimpleValueType();
31711 SDValue Index = N->getIndex();
31712 SDValue Mask = N->getMask();
31713 SDValue PassThru = N->getPassThru();
31714 MVT IndexVT = Index.getSimpleValueType();
31715
31716 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
31717
31718 // If the index is v2i32, we're being called by type legalization.
31719 if (IndexVT == MVT::v2i32)
31720 return SDValue();
31721
31722 // If we don't have VLX and neither the passthru nor the index is 512 bits,
31723 // we need to widen until one is.
31724 MVT OrigVT = VT;
31725 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31726 !IndexVT.is512BitVector()) {
31727 // Determine how much we need to widen by to get a 512-bit type.
31728 unsigned Factor = std::min(512/VT.getSizeInBits(),
31729 512/IndexVT.getSizeInBits());
31730
31731 unsigned NumElts = VT.getVectorNumElements() * Factor;
31732
31733 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31734 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31735 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31736
31737 PassThru = ExtendToType(PassThru, VT, DAG);
31738 Index = ExtendToType(Index, IndexVT, DAG);
31739 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31740 }
31741
31742 // Break dependency on the data register.
31743 if (PassThru.isUndef())
31744 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
31745
31746 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
31747 N->getScale() };
31748 SDValue NewGather = DAG.getMemIntrinsicNode(
31749 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
31750 N->getMemOperand());
31751 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
31752 NewGather, DAG.getIntPtrConstant(0, dl));
31753 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
31754}
31755
31756static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
31757 SDLoc dl(Op);
31758 SDValue Src = Op.getOperand(0);
31759 MVT DstVT = Op.getSimpleValueType();
31760
31761 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
31762 unsigned SrcAS = N->getSrcAddressSpace();
31763
31764 assert(SrcAS != N->getDestAddressSpace() &&
31765        "addrspacecast must be between different address spaces");
31766
31767 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
31768 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
31769 } else if (DstVT == MVT::i64) {
31770 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
31771 } else if (DstVT == MVT::i32) {
31772 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
31773 } else {
31774 report_fatal_error("Bad address space in addrspacecast");
31775 }
31776 return Op;
31777}
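// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the
// addrspacecast lowering above zero-extends a 32-bit "unsigned" pointer
// (ptr32_uptr) to 64 bits, sign-extends other 32-bit pointers, and truncates
// when casting a 64-bit pointer down to 32 bits.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Ptr32 = 0x80000000u;
  uint64_t ZExt = uint64_t(Ptr32);                   // ptr32_uptr -> 64-bit
  uint64_t SExt = uint64_t(int64_t(int32_t(Ptr32))); // ptr32_sptr -> 64-bit
  uint32_t Trunc = uint32_t(0x123456789ABCDEF0ULL);  // 64-bit -> 32-bit
  assert(ZExt == 0x0000000080000000ULL);
  assert(SExt == 0xFFFFFFFF80000000ULL);
  assert(Trunc == 0x9ABCDEF0u);
  return 0;
}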
31778
31779SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
31780 SelectionDAG &DAG) const {
31781 // TODO: Eventually, the lowering of these nodes should be informed by or
31782 // deferred to the GC strategy for the function in which they appear. For
31783 // now, however, they must be lowered to something. Since they are logically
31784 // no-ops in the case of a null GC strategy (or a GC strategy which does not
31785 // require special handling for these nodes), lower them as literal NOOPs for
31786 // the time being.
31787 SmallVector<SDValue, 2> Ops;
31788
31789 Ops.push_back(Op.getOperand(0));
31790 if (Op->getGluedNode())
31791 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
31792
31793 SDLoc OpDL(Op);
31794 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
31795 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
31796
31797 return NOOP;
31798}
31799
31800// Custom split CVTPS2PH with wide types.
31801static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
31802 SDLoc dl(Op);
31803 EVT VT = Op.getValueType();
31804 SDValue Lo, Hi;
31805 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
31806 EVT LoVT, HiVT;
31807 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31808 SDValue RC = Op.getOperand(1);
31809 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
31810 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
31811 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31812}
31813
31814/// Provide custom lowering hooks for some operations.
31815SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
31816 switch (Op.getOpcode()) {
31817 default: llvm_unreachable("Should not custom lower this!");
31818 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
31819 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
31820 return LowerCMP_SWAP(Op, Subtarget, DAG);
31821 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
31822 case ISD::ATOMIC_LOAD_ADD:
31823 case ISD::ATOMIC_LOAD_SUB:
31824 case ISD::ATOMIC_LOAD_OR:
31825 case ISD::ATOMIC_LOAD_XOR:
31826 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
31827 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
31828 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
31829 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
31830 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
31831 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
31832 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
31833 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
31834 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
31835 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
31836 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
31837 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
31838 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
31839 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
31840 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
31841 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
31842 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
31843 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
31844 case ISD::SHL_PARTS:
31845 case ISD::SRA_PARTS:
31846 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
31847 case ISD::FSHL:
31848 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
31849 case ISD::STRICT_SINT_TO_FP:
31850 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
31851 case ISD::STRICT_UINT_TO_FP:
31852 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
31853 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
31854 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
31855 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
31856 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
31857 case ISD::ZERO_EXTEND_VECTOR_INREG:
31858 case ISD::SIGN_EXTEND_VECTOR_INREG:
31859 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
31860 case ISD::FP_TO_SINT:
31861 case ISD::STRICT_FP_TO_SINT:
31862 case ISD::FP_TO_UINT:
31863 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
31864 case ISD::FP_TO_SINT_SAT:
31865 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
31866 case ISD::FP_EXTEND:
31867 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
31868 case ISD::FP_ROUND:
31869 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
31870 case ISD::FP16_TO_FP:
31871 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
31872 case ISD::FP_TO_FP16:
31873 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
31874 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
31875 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
31876 case ISD::FADD:
31877 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
31878 case ISD::FROUND: return LowerFROUND(Op, DAG);
31879 case ISD::FABS:
31880 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
31881 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
31882 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
31883 case ISD::LRINT:
31884 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
31885 case ISD::SETCC:
31886 case ISD::STRICT_FSETCC:
31887 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
31888 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
31889 case ISD::SELECT: return LowerSELECT(Op, DAG);
31890 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
31891 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
31892 case ISD::VASTART: return LowerVASTART(Op, DAG);
31893 case ISD::VAARG: return LowerVAARG(Op, DAG);
31894 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
31895 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31896 case ISD::INTRINSIC_VOID:
31897 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31898 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
31899 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
31900 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
31901 case ISD::FRAME_TO_ARGS_OFFSET:
31902 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31903 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31904 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
31905 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
31906 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
31907 case ISD::EH_SJLJ_SETUP_DISPATCH:
31908 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31909 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
31910 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
31911 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
31912 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
31913 case ISD::CTLZ:
31914 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
31915 case ISD::CTTZ:
31916 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
31917 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
31918 case ISD::MULHS:
31919 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
31920 case ISD::ROTL:
31921 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
31922 case ISD::SRA:
31923 case ISD::SRL:
31924 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
31925 case ISD::SADDO:
31926 case ISD::UADDO:
31927 case ISD::SSUBO:
31928 case ISD::USUBO: return LowerXALUO(Op, DAG);
31929 case ISD::SMULO:
31930 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
31931 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31932 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
31933 case ISD::SADDO_CARRY:
31934 case ISD::SSUBO_CARRY:
31935 case ISD::ADDCARRY:
31936 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
31937 case ISD::ADD:
31938 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
31939 case ISD::UADDSAT:
31940 case ISD::SADDSAT:
31941 case ISD::USUBSAT:
31942 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31943 case ISD::SMAX:
31944 case ISD::SMIN:
31945 case ISD::UMAX:
31946 case ISD::UMIN: return LowerMINMAX(Op, DAG);
31947 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
31948 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
31949 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
31950 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
31951 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
31952 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
31953 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
31954 case ISD::GC_TRANSITION_START:
31955 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
31956 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
31957 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
31958 }
31959}
31960
31961/// Replace a node with an illegal result type with a new node built out of
31962/// custom code.
31963void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31964 SmallVectorImpl<SDValue>&Results,
31965 SelectionDAG &DAG) const {
31966 SDLoc dl(N);
31967 switch (N->getOpcode()) {
31968 default:
31969#ifndef NDEBUG
31970 dbgs() << "ReplaceNodeResults: ";
31971 N->dump(&DAG);
31972#endif
31973 llvm_unreachable("Do not know how to custom type legalize this operation!");
31974 case X86ISD::CVTPH2PS: {
31975 EVT VT = N->getValueType(0);
31976 SDValue Lo, Hi;
31977 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31978 EVT LoVT, HiVT;
31979 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31980 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31981 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31982 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31983 Results.push_back(Res);
31984 return;
31985 }
31986 case X86ISD::STRICT_CVTPH2PS: {
31987 EVT VT = N->getValueType(0);
31988 SDValue Lo, Hi;
31989 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31990 EVT LoVT, HiVT;
31991 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31992 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31993 {N->getOperand(0), Lo});
31994 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31995 {N->getOperand(0), Hi});
31996 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31997 Lo.getValue(1), Hi.getValue(1));
31998 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31999 Results.push_back(Res);
32000 Results.push_back(Chain);
32001 return;
32002 }
32003 case X86ISD::CVTPS2PH:
32004 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32005 return;
32006 case ISD::CTPOP: {
32007 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32008 // Use a v2i64 if possible.
32009 bool NoImplicitFloatOps =
32010 DAG.getMachineFunction().getFunction().hasFnAttribute(
32011 Attribute::NoImplicitFloat);
32012 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32013 SDValue Wide =
32014 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32015 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32016 // Bit count should fit in 32-bits, extract it as that and then zero
32017 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32018 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32019 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32020 DAG.getIntPtrConstant(0, dl));
32021 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32022 Results.push_back(Wide);
32023 }
32024 return;
32025 }
32026 case ISD::MUL: {
32027 EVT VT = N->getValueType(0);
32028 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32029        VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32030 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32031 // elements are needed.
32032 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32033 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32034 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32035 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32036 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32037 unsigned NumConcats = 16 / VT.getVectorNumElements();
32038 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32039 ConcatOps[0] = Res;
32040 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32041 Results.push_back(Res);
32042 return;
32043 }
32044 case X86ISD::VPMADDWD: {
32045 // Legalize types for X86ISD::VPMADDWD by widening.
32046 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32047
32048 EVT VT = N->getValueType(0);
32049 EVT InVT = N->getOperand(0).getValueType();
32050 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32051        "Expected a VT that divides into 128 bits.");
32052 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32053        "Unexpected type action!");
32054 unsigned NumConcat = 128 / InVT.getSizeInBits();
32055
32056 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32057 InVT.getVectorElementType(),
32058 NumConcat * InVT.getVectorNumElements());
32059 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32060 VT.getVectorElementType(),
32061 NumConcat * VT.getVectorNumElements());
32062
32063 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32064 Ops[0] = N->getOperand(0);
32065 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32066 Ops[0] = N->getOperand(1);
32067 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32068
32069 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32070 Results.push_back(Res);
32071 return;
32072 }
32073 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32074 case X86ISD::FMINC:
32075 case X86ISD::FMIN:
32076 case X86ISD::FMAXC:
32077 case X86ISD::FMAX: {
32078 EVT VT = N->getValueType(0);
32079 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32080 SDValue UNDEF = DAG.getUNDEF(VT);
32081 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32082 N->getOperand(0), UNDEF);
32083 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32084 N->getOperand(1), UNDEF);
32085 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32086 return;
32087 }
32088 case ISD::SDIV:
32089 case ISD::UDIV:
32090 case ISD::SREM:
32091 case ISD::UREM: {
32092 EVT VT = N->getValueType(0);
32093 if (VT.isVector()) {
32094 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32095        "Unexpected type action!");
32096 // If this RHS is a constant splat vector we can widen this and let
32097 // division/remainder by constant optimize it.
32098 // TODO: Can we do something for non-splat?
32099 APInt SplatVal;
32100 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32101 unsigned NumConcats = 128 / VT.getSizeInBits();
32102 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32103 Ops0[0] = N->getOperand(0);
32104 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32105 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32106 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32107 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32108 Results.push_back(Res);
32109 }
32110 return;
32111 }
32112
32113 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32114 Results.push_back(V);
32115 return;
32116 }
32117 case ISD::TRUNCATE: {
32118 MVT VT = N->getSimpleValueType(0);
32119 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32120 return;
32121
32122 // The generic legalizer will try to widen the input type to the same
32123 // number of elements as the widened result type. But this isn't always
32124 // the best thing so do some custom legalization to avoid some cases.
32125 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32126 SDValue In = N->getOperand(0);
32127 EVT InVT = In.getValueType();
32128
32129 unsigned InBits = InVT.getSizeInBits();
32130 if (128 % InBits == 0) {
32131 // 128-bit and smaller inputs should avoid truncate altogether and
32132 // just use a build_vector that will become a shuffle.
32133 // TODO: Widen and use a shuffle directly?
32134 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
32135 EVT EltVT = VT.getVectorElementType();
32136 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32137 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
32138 // Use the original element count so we don't do more scalar opts than
32139 // necessary.
32140 unsigned MinElts = VT.getVectorNumElements();
32141 for (unsigned i=0; i < MinElts; ++i) {
32142 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
32143 DAG.getIntPtrConstant(i, dl));
32144 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
32145 }
32146 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
32147 return;
32148 }
32149 // With AVX512 there are some cases that can use a target specific
32150 // truncate node to go from 256/512 to less than 128 with zeros in the
32151 // upper elements of the 128 bit result.
32152 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32153 // We can use VTRUNC directly for 256 bits with VLX, or for any 512 bits.
32154 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32155 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32156 return;
32157 }
32158 // There's one case we can widen to 512 bits and use VTRUNC.
32159 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32160 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32161 DAG.getUNDEF(MVT::v4i64));
32162 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32163 return;
32164 }
32165 }
32166 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32167 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32168 isTypeLegal(MVT::v4i64)) {
32169 // Input needs to be split and output needs to be widened. Let's use two
32170 // VTRUNCs, and shuffle their results together into the wider type.
32171 SDValue Lo, Hi;
32172 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32173
32174 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32175 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32176 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32177 { 0, 1, 2, 3, 16, 17, 18, 19,
32178 -1, -1, -1, -1, -1, -1, -1, -1 });
32179 Results.push_back(Res);
32180 return;
32181 }
32182
32183 return;
32184 }
32185 case ISD::ANY_EXTEND:
32186 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32187 // It's intended to custom handle the input type.
32188 assert(N->getValueType(0) == MVT::v8i8 &&
32189        "Do not know how to legalize this Node");
32190 return;
32191 case ISD::SIGN_EXTEND:
32192 case ISD::ZERO_EXTEND: {
32193 EVT VT = N->getValueType(0);
32194 SDValue In = N->getOperand(0);
32195 EVT InVT = In.getValueType();
32196 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32197 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32198 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32199        "Unexpected type action!");
32200 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32201 // Custom split this so we can extend i8/i16->i32 invec. This is better
32202 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32203 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
32204 // we allow the sra from the extend to i32 to be shared by the split.
32205 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32206
32207 // Fill a vector with sign bits for each element.
32208 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32209 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32210
32211 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32212 // to v2i64.
32213 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32214 {0, 4, 1, 5});
32215 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32216 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32217 {2, 6, 3, 7});
32218 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32219
32220 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32221 Results.push_back(Res);
32222 return;
32223 }
32224
32225 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32226 if (!InVT.is128BitVector()) {
32227 // Not a 128 bit vector, but maybe type legalization will promote
32228 // it to 128 bits.
32229 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32230 return;
32231 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32232 if (!InVT.is128BitVector())
32233 return;
32234
32235 // Promote the input to 128 bits. Type legalization will turn this into
32236 // zext_inreg/sext_inreg.
32237 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32238 }
32239
32240 // Perform custom splitting instead of the two stage extend we would get
32241 // by default.
32242 EVT LoVT, HiVT;
32243 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32244 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32245
32246 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32247
32248 // We need to shift the input over by half the number of elements.
32249 unsigned NumElts = InVT.getVectorNumElements();
32250 unsigned HalfNumElts = NumElts / 2;
32251 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32252 for (unsigned i = 0; i != HalfNumElts; ++i)
32253 ShufMask[i] = i + HalfNumElts;
32254
32255 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32256 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32257
32258 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32259 Results.push_back(Res);
32260 }
32261 return;
32262 }
32263 case ISD::FP_TO_SINT:
32264 case ISD::STRICT_FP_TO_SINT:
32265 case ISD::FP_TO_UINT:
32266 case ISD::STRICT_FP_TO_UINT: {
32267 bool IsStrict = N->isStrictFPOpcode();
32268 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32269 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32270 EVT VT = N->getValueType(0);
32271 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32272 EVT SrcVT = Src.getValueType();
32273
32274 if (VT.isVector() && Subtarget.hasFP16() &&
32275 SrcVT.getVectorElementType() == MVT::f16) {
32276 EVT EleVT = VT.getVectorElementType();
32277 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32278
32279 if (SrcVT != MVT::v8f16) {
32280 SDValue Tmp =
32281 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32282 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32283 Ops[0] = Src;
32284 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32285 }
32286
32287 SDValue Res, Chain;
32288 if (IsStrict) {
32289 unsigned Opc =
32290 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32291 Res =
32292 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32293 Chain = Res.getValue(1);
32294 } else {
32295 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32296 Res = DAG.getNode(Opc, dl, ResVT, Src);
32297 }
32298
32299 // TODO: Need to add exception check code for strict FP.
32300 if (EleVT.getSizeInBits() < 16) {
32301 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32302 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32303
32304 // Now widen to 128 bits.
32305 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32306 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32307 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32308 ConcatOps[0] = Res;
32309 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32310 }
32311
32312 Results.push_back(Res);
32313 if (IsStrict)
32314 Results.push_back(Chain);
32315
32316 return;
32317 }
32318
32319 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32320 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32321        "Unexpected type action!");
32322
32323 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32324 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32325 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32326 VT.getVectorNumElements());
32327 SDValue Res;
32328 SDValue Chain;
32329 if (IsStrict) {
32330 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32331 {N->getOperand(0), Src});
32332 Chain = Res.getValue(1);
32333 } else
32334 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32335
32336 // Preserve what we know about the size of the original result. If the
32337 // result is v2i32, we have to manually widen the assert.
32338 if (PromoteVT == MVT::v2i32)
32339 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32340 DAG.getUNDEF(MVT::v2i32));
32341
32342 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32343 Res.getValueType(), Res,
32344 DAG.getValueType(VT.getVectorElementType()));
32345
32346 if (PromoteVT == MVT::v2i32)
32347 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32348 DAG.getIntPtrConstant(0, dl));
32349
32350 // Truncate back to the original width.
32351 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32352
32353 // Now widen to 128 bits.
32354 unsigned NumConcats = 128 / VT.getSizeInBits();
32355 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32356 VT.getVectorNumElements() * NumConcats);
32357 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32358 ConcatOps[0] = Res;
32359 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32360 Results.push_back(Res);
32361 if (IsStrict)
32362 Results.push_back(Chain);
32363 return;
32364 }
32365
32366
32367 if (VT == MVT::v2i32) {
32368 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32369        "Strict unsigned conversion requires AVX512");
32370 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32371 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32372        "Unexpected type action!");
32373 if (Src.getValueType() == MVT::v2f64) {
32374 if (!IsSigned && !Subtarget.hasAVX512()) {
32375 SDValue Res =
32376 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32377 Results.push_back(Res);
32378 return;
32379 }
32380
32381 unsigned Opc;
32382 if (IsStrict)
32383 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32384 else
32385 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32386
32387 // If we have VLX we can emit a target specific FP_TO_UINT node.
32388 if (!IsSigned && !Subtarget.hasVLX()) {
32389 // Otherwise we can defer to the generic legalizer which will widen
32390 // the input as well. This will be further widened during op
32391 // legalization to v8i32<-v8f64.
32392 // For strict nodes we'll need to widen ourselves.
32393 // FIXME: Fix the type legalizer to safely widen strict nodes?
32394 if (!IsStrict)
32395 return;
32396 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32397 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32398 Opc = N->getOpcode();
32399 }
32400 SDValue Res;
32401 SDValue Chain;
32402 if (IsStrict) {
32403 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32404 {N->getOperand(0), Src});
32405 Chain = Res.getValue(1);
32406 } else {
32407 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32408 }
32409 Results.push_back(Res);
32410 if (IsStrict)
32411 Results.push_back(Chain);
32412 return;
32413 }
32414
32415 // Custom widen strict v2f32->v2i32 by padding with zeros.
32416 // FIXME: Should generic type legalizer do this?
32417 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32419 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32420 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32421 {N->getOperand(0), Src});
32422 Results.push_back(Res);
32423 Results.push_back(Res.getValue(1));
32424 return;
32425 }
32426
32427 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32428 // so early out here.
32429 return;
32430 }
32431
32432 assert(!VT.isVector() && "Vectors should have been handled above!");
32433
32434 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32435 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32436 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32437 assert(!Subtarget.is64Bit() && "i64 should be legal");
32438 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32439 // If we use a 128-bit result we might need to use a target specific node.
32440 unsigned SrcElts =
32441 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32442 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32443 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32444 unsigned Opc = N->getOpcode();
32445 if (NumElts != SrcElts) {
32446 if (IsStrict)
32447 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32448 else
32449 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32450 }
32451
32452 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32453 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32454 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32455 ZeroIdx);
32456 SDValue Chain;
32457 if (IsStrict) {
32458 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32459 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32460 Chain = Res.getValue(1);
32461 } else
32462 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32463 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32464 Results.push_back(Res);
32465 if (IsStrict)
32466 Results.push_back(Chain);
32467 return;
32468 }
32469
32470 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32471 SDValue Chain;
32472 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32473 Results.push_back(V);
32474 if (IsStrict)
32475 Results.push_back(Chain);
32476 return;
32477 }
32478
32479 SDValue Chain;
32480 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32481 Results.push_back(V);
32482 if (IsStrict)
32483 Results.push_back(Chain);
32484 }
32485 return;
32486 }
32487 case ISD::LRINT:
32488 case ISD::LLRINT: {
32489 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32490 Results.push_back(V);
32491 return;
32492 }
32493
32494 case ISD::SINT_TO_FP:
32495 case ISD::STRICT_SINT_TO_FP:
32496 case ISD::UINT_TO_FP:
32497 case ISD::STRICT_UINT_TO_FP: {
32498 bool IsStrict = N->isStrictFPOpcode();
32499 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32500 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32501 EVT VT = N->getValueType(0);
32502 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32503 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32504 Subtarget.hasVLX()) {
32505 if (Src.getValueType().getVectorElementType() == MVT::i16)
32506 return;
32507
32508 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32509 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32510 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32511 : DAG.getUNDEF(MVT::v2i32));
32512 if (IsStrict) {
32513 unsigned Opc =
32514 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32515 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32516 {N->getOperand(0), Src});
32517 Results.push_back(Res);
32518 Results.push_back(Res.getValue(1));
32519 } else {
32520 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32521 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32522 }
32523 return;
32524 }
32525 if (VT != MVT::v2f32)
32526 return;
32527 EVT SrcVT = Src.getValueType();
32528 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32529 if (IsStrict) {
32530 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32531 : X86ISD::STRICT_CVTUI2P;
32532 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32533 {N->getOperand(0), Src});
32534 Results.push_back(Res);
32535 Results.push_back(Res.getValue(1));
32536 } else {
32537 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32538 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
32539 }
32540 return;
32541 }
32542 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
32543 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
32544 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
32545 SDValue One = DAG.getConstant(1, dl, SrcVT);
32546 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
32547 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
32548 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
32549 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
32550 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
32551 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
32552 for (int i = 0; i != 2; ++i) {
32553 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
32554 SignSrc, DAG.getIntPtrConstant(i, dl));
32555 if (IsStrict)
32556 SignCvts[i] =
32557 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
32558 {N->getOperand(0), Elt});
32559 else
32560 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
32561 }
32562 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
32563 SDValue Slow, Chain;
32564 if (IsStrict) {
32565 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32566 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
32567 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
32568 {Chain, SignCvt, SignCvt});
32569 Chain = Slow.getValue(1);
32570 } else {
32571 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
32572 }
32573 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
32574 IsNeg =
32575 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
32576 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
32577 Results.push_back(Cvt);
32578 if (IsStrict)
32579 Results.push_back(Chain);
32580 return;
32581 }
32582
32583 if (SrcVT != MVT::v2i32)
32584 return;
32585
32586 if (IsSigned || Subtarget.hasAVX512()) {
32587 if (!IsStrict)
32588 return;
32589
32590 // Custom widen strict v2i32->v2f32 to avoid scalarization.
32591 // FIXME: Should generic type legalizer do this?
32592 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32593 DAG.getConstant(0, dl, MVT::v2i32));
32594 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
32595 {N->getOperand(0), Src});
32596 Results.push_back(Res);
32597 Results.push_back(Res.getValue(1));
32598 return;
32599 }
32600
32601 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32602 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
32603 SDValue VBias =
32604 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
32605 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
32606 DAG.getBitcast(MVT::v2i64, VBias));
32607 Or = DAG.getBitcast(MVT::v2f64, Or);
32608 if (IsStrict) {
32609 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
32610 {N->getOperand(0), Or, VBias});
32611 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
32612 {MVT::v4f32, MVT::Other},
32613 {Sub.getValue(1), Sub});
32614 Results.push_back(Res);
32615 Results.push_back(Res.getValue(1));
32616 } else {
32617 // TODO: Are there any fast-math-flags to propagate here?
32618 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
32619 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
32620 }
32621 return;
32622 }
32623 case ISD::STRICT_FP_ROUND:
32624 case ISD::FP_ROUND: {
32625 bool IsStrict = N->isStrictFPOpcode();
32626 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32627 EVT VT = N->getValueType(0);
32628 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
32629 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
32630 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
32631 : DAG.getUNDEF(MVT::v2f32);
32632 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
32633 }
32634 if (!isTypeLegal(Src.getValueType()))
32635 return;
32636 SDValue V;
32637 if (IsStrict)
32638 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
32639 {N->getOperand(0), Src});
32640 else
32641 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
32642 Results.push_back(V);
32643 if (IsStrict)
32644 Results.push_back(V.getValue(1));
32645 return;
32646 }
32647 case ISD::FP_EXTEND:
32648 case ISD::STRICT_FP_EXTEND: {
32649 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
32650 // No other ValueType for FP_EXTEND should reach this point.
32651 assert(N->getValueType(0) == MVT::v2f32 &&
32652        "Do not know how to legalize this Node");
32653 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
32654 return;
32655 bool IsStrict = N->isStrictFPOpcode();
32656 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32657 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
32658 : DAG.getUNDEF(MVT::v2f16);
32659 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
32660 if (IsStrict)
32661 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
32662 {N->getOperand(0), V});
32663 else
32664 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
32665 Results.push_back(V);
32666 if (IsStrict)
32667 Results.push_back(V.getValue(1));
32668 return;
32669 }
32670 case ISD::INTRINSIC_W_CHAIN: {
32671 unsigned IntNo = N->getConstantOperandVal(1);
32672 switch (IntNo) {
32673 default : llvm_unreachable("Do not know how to custom type "
32674                            "legalize this intrinsic operation!");
32675 case Intrinsic::x86_rdtsc:
32676 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
32677 Results);
32678 case Intrinsic::x86_rdtscp:
32679 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
32680 Results);
32681 case Intrinsic::x86_rdpmc:
32682 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
32683 Results);
32684 return;
32685 case Intrinsic::x86_xgetbv:
32686 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
32687 Results);
32688 return;
32689 }
32690 }
32691 case ISD::READCYCLECOUNTER: {
32692 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
32693 }
32694 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
32695 EVT T = N->getValueType(0);
32696 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
32697 bool Regs64bit = T == MVT::i128;
32698 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
32699        "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
32700 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
32701 SDValue cpInL, cpInH;
32702 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32703 DAG.getConstant(0, dl, HalfT));
32704 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32705 DAG.getConstant(1, dl, HalfT));
32706 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
32707 Regs64bit ? X86::RAX : X86::EAX,
32708 cpInL, SDValue());
32709 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
32710 Regs64bit ? X86::RDX : X86::EDX,
32711 cpInH, cpInL.getValue(1));
32712 SDValue swapInL, swapInH;
32713 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32714 DAG.getConstant(0, dl, HalfT));
32715 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32716 DAG.getConstant(1, dl, HalfT));
32717 swapInH =
32718 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
32719 swapInH, cpInH.getValue(1));
32720
32721 // In 64-bit mode we might need the base pointer in RBX, but we can't know
32722 // until later. So we keep the RBX input in a vreg and use a custom
32723 // inserter.
32724 // Since RBX will be a reserved register the register allocator will not
32725 // make sure its value will be properly saved and restored around this
32726 // live-range.
32727 SDValue Result;
32728 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32729 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
32730 if (Regs64bit) {
32731 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
32732 swapInH.getValue(1)};
32733 Result =
32734 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
32735 } else {
32736 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
32737 swapInH.getValue(1));
32738 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
32739 swapInL.getValue(1)};
32740 Result =
32741 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
32742 }
32743
32744 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
32745 Regs64bit ? X86::RAX : X86::EAX,
32746 HalfT, Result.getValue(1));
32747 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
32748 Regs64bit ? X86::RDX : X86::EDX,
32749 HalfT, cpOutL.getValue(2));
32750 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
32751
32752 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
32753 MVT::i32, cpOutH.getValue(2));
32754 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
32755 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
32756
32757 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
32758 Results.push_back(Success);
32759 Results.push_back(EFLAGS.getValue(1));
32760 return;
32761 }
32762 case ISD::ATOMIC_LOAD: {
32763 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32764 bool NoImplicitFloatOps =
32765 DAG.getMachineFunction().getFunction().hasFnAttribute(
32766 Attribute::NoImplicitFloat);
32767 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32768 auto *Node = cast<AtomicSDNode>(N);
32769 if (Subtarget.hasSSE1()) {
32770 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
32771 // Then extract the lower 64-bits.
32772 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32773 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
32774 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32775 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32776 MVT::i64, Node->getMemOperand());
32777 if (Subtarget.hasSSE2()) {
32778 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
32779 DAG.getIntPtrConstant(0, dl));
32780 Results.push_back(Res);
32781 Results.push_back(Ld.getValue(1));
32782 return;
32783 }
32784 // We use an alternative sequence for SSE1 that extracts as v2f32 and
32785 // then casts to i64. This avoids a 128-bit stack temporary being
32786 // created by type legalization if we were to cast v4f32->v2i64.
32787 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
32788 DAG.getIntPtrConstant(0, dl));
32789 Res = DAG.getBitcast(MVT::i64, Res);
32790 Results.push_back(Res);
32791 Results.push_back(Ld.getValue(1));
32792 return;
32793 }
32794 if (Subtarget.hasX87()) {
32795 // First load this into an 80-bit X87 register. This will put the whole
32796 // integer into the significand.
32797 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32798 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32799 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
32800 dl, Tys, Ops, MVT::i64,
32801 Node->getMemOperand());
32802 SDValue Chain = Result.getValue(1);
32803
32804 // Now store the X87 register to a stack temporary and convert to i64.
32805 // This store is not atomic and doesn't need to be.
32806 // FIXME: We don't need a stack temporary if the result of the load
32807 // is already being stored. We could just directly store there.
32808 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32809 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32810 MachinePointerInfo MPI =
32811 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32812 SDValue StoreOps[] = { Chain, Result, StackPtr };
32813 Chain = DAG.getMemIntrinsicNode(
32814 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
32815 MPI, None /*Align*/, MachineMemOperand::MOStore);
32816
32817 // Finally load the value back from the stack temporary and return it.
32818 // This load is not atomic and doesn't need to be.
32819 // This load will be further type legalized.
32820 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
32821 Results.push_back(Result);
32822 Results.push_back(Result.getValue(1));
32823 return;
32824 }
32825 }
32826 // TODO: Use MOVLPS when SSE1 is available?
32827 // Delegate to generic TypeLegalization. Situations we can really handle
32828 // should have already been dealt with by AtomicExpandPass.cpp.
32829 break;
32830 }
32831 case ISD::ATOMIC_SWAP:
32832 case ISD::ATOMIC_LOAD_ADD:
32833 case ISD::ATOMIC_LOAD_SUB:
32834 case ISD::ATOMIC_LOAD_AND:
32835 case ISD::ATOMIC_LOAD_OR:
32836 case ISD::ATOMIC_LOAD_XOR:
32837 case ISD::ATOMIC_LOAD_NAND:
32838 case ISD::ATOMIC_LOAD_MIN:
32839 case ISD::ATOMIC_LOAD_MAX:
32840 case ISD::ATOMIC_LOAD_UMIN:
32841 case ISD::ATOMIC_LOAD_UMAX:
32842 // Delegate to generic TypeLegalization. Situations we can really handle
32843 // should have already been dealt with by AtomicExpandPass.cpp.
32844 break;
32845
32846 case ISD::BITCAST: {
32847 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32848 EVT DstVT = N->getValueType(0);
32849 EVT SrcVT = N->getOperand(0).getValueType();
32850
32851 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
32852 // we can split using the k-register rather than memory.
32853 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
32854 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32855 SDValue Lo, Hi;
32856 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32857 Lo = DAG.getBitcast(MVT::i32, Lo);
32858 Hi = DAG.getBitcast(MVT::i32, Hi);
32859 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
32860 Results.push_back(Res);
32861 return;
32862 }
32863
32864 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
32865 // FIXME: Use v4f32 for SSE1?
32866 assert(Subtarget.hasSSE2() && "Requires SSE2");
32867 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
32868        "Unexpected type action!");
32869 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
32870 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
32871 N->getOperand(0));
32872 Res = DAG.getBitcast(WideVT, Res);
32873 Results.push_back(Res);
32874 return;
32875 }
32876
32877 return;
32878 }
32879 case ISD::MGATHER: {
32880 EVT VT = N->getValueType(0);
32881 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32882 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32883 auto *Gather = cast<MaskedGatherSDNode>(N);
32884 SDValue Index = Gather->getIndex();
32885 if (Index.getValueType() != MVT::v2i64)
32886 return;
32887 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32888        "Unexpected type action!");
32889 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32890 SDValue Mask = Gather->getMask();
32891 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32892 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32893 Gather->getPassThru(),
32894 DAG.getUNDEF(VT));
32895 if (!Subtarget.hasVLX()) {
32896 // We need to widen the mask, but the instruction will only use 2
32897 // of its elements. So we can use undef.
32898 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32899 DAG.getUNDEF(MVT::v2i1));
32900 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32901 }
32902 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32903 Gather->getBasePtr(), Index, Gather->getScale() };
32904 SDValue Res = DAG.getMemIntrinsicNode(
32905 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32906 Gather->getMemoryVT(), Gather->getMemOperand());
32907 Results.push_back(Res);
32908 Results.push_back(Res.getValue(1));
32909 return;
32910 }
32911 return;
32912 }
32913 case ISD::LOAD: {
32914 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
32915 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
32916 // cast since type legalization will try to use an i64 load.
32917 MVT VT = N->getSimpleValueType(0);
32918 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32919 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32920        "Unexpected type action!");
32921 if (!ISD::isNON_EXTLoad(N))
32922 return;
32923 auto *Ld = cast<LoadSDNode>(N);
32924 if (Subtarget.hasSSE2()) {
32925 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32926 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32927 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32928 Ld->getMemOperand()->getFlags());
32929 SDValue Chain = Res.getValue(1);
32930 MVT VecVT = MVT::getVectorVT(LdVT, 2);
32931 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32932 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32933 Res = DAG.getBitcast(WideVT, Res);
32934 Results.push_back(Res);
32935 Results.push_back(Chain);
32936 return;
32937 }
32938 assert(Subtarget.hasSSE1() && "Expected SSE");
32939 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32940 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32941 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32942 MVT::i64, Ld->getMemOperand());
32943 Results.push_back(Res);
32944 Results.push_back(Res.getValue(1));
32945 return;
32946 }
32947 case ISD::ADDRSPACECAST: {
32948 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32949 Results.push_back(V);
32950 return;
32951 }
32952 case ISD::BITREVERSE:
32953 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32954 assert(Subtarget.hasXOP() && "Expected XOP");
32955 // We can use VPPERM by copying to a vector register and back. We'll need
32956 // to move the scalar in two i32 pieces.
32957 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32958 return;
32959 }
32960}
32961
32962const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
32963 switch ((X86ISD::NodeType)Opcode) {
32964 case X86ISD::FIRST_NUMBER: break;
32965#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
32966 NODE_NAME_CASE(BSF)
32967 NODE_NAME_CASE(BSR)
32968 NODE_NAME_CASE(FSHL)
32969 NODE_NAME_CASE(FSHR)
32970 NODE_NAME_CASE(FAND)
32971 NODE_NAME_CASE(FANDN)
32972 NODE_NAME_CASE(FOR)
32973 NODE_NAME_CASE(FXOR)
32974 NODE_NAME_CASE(FILD)
32975 NODE_NAME_CASE(FIST)
32976 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
32977 NODE_NAME_CASE(FLD)
32978 NODE_NAME_CASE(FST)
32979 NODE_NAME_CASE(CALL)
32980 NODE_NAME_CASE(CALL_RVMARKER)
32981 NODE_NAME_CASE(BT)
32982 NODE_NAME_CASE(CMP)
32983 NODE_NAME_CASE(FCMP)
32984 NODE_NAME_CASE(STRICT_FCMP)
32985 NODE_NAME_CASE(STRICT_FCMPS)
32986 NODE_NAME_CASE(COMI)
32987 NODE_NAME_CASE(UCOMI)
32988 NODE_NAME_CASE(CMPM)
32989 NODE_NAME_CASE(CMPMM)
32990 NODE_NAME_CASE(STRICT_CMPM)
32991 NODE_NAME_CASE(CMPMM_SAE)
32992 NODE_NAME_CASE(SETCC)
32993 NODE_NAME_CASE(SETCC_CARRY)
32994 NODE_NAME_CASE(FSETCC)
32995 NODE_NAME_CASE(FSETCCM)
32996 NODE_NAME_CASE(FSETCCM_SAE)
32997 NODE_NAME_CASE(CMOV)
32998 NODE_NAME_CASE(BRCOND)
32999 NODE_NAME_CASE(RET_FLAG)
33000 NODE_NAME_CASE(IRET)
33001 NODE_NAME_CASE(REP_STOS)
33002 NODE_NAME_CASE(REP_MOVS)
33003 NODE_NAME_CASE(GlobalBaseReg)
33004 NODE_NAME_CASE(Wrapper)
33005 NODE_NAME_CASE(WrapperRIP)
33006 NODE_NAME_CASE(MOVQ2DQ)
33007 NODE_NAME_CASE(MOVDQ2Q)
33008 NODE_NAME_CASE(MMX_MOVD2W)
33009 NODE_NAME_CASE(MMX_MOVW2D)
33010 NODE_NAME_CASE(PEXTRB)
33011 NODE_NAME_CASE(PEXTRW)
33012 NODE_NAME_CASE(INSERTPS)
33013 NODE_NAME_CASE(PINSRB)
33014 NODE_NAME_CASE(PINSRW)
33015 NODE_NAME_CASE(PSHUFB)
33016 NODE_NAME_CASE(ANDNP)
33017 NODE_NAME_CASE(BLENDI)
33018 NODE_NAME_CASE(BLENDV)
33019 NODE_NAME_CASE(HADD)
33020 NODE_NAME_CASE(HSUB)
33021 NODE_NAME_CASE(FHADD)
33022 NODE_NAME_CASE(FHSUB)
33023 NODE_NAME_CASE(CONFLICT)
33024 NODE_NAME_CASE(FMAX)
33025 NODE_NAME_CASE(FMAXS)
33026 NODE_NAME_CASE(FMAX_SAE)
33027 NODE_NAME_CASE(FMAXS_SAE)
33028 NODE_NAME_CASE(FMIN)
33029 NODE_NAME_CASE(FMINS)
33030 NODE_NAME_CASE(FMIN_SAE)
33031 NODE_NAME_CASE(FMINS_SAE)
33032 NODE_NAME_CASE(FMAXC)
33033 NODE_NAME_CASE(FMINC)
33034 NODE_NAME_CASE(FRSQRT)
33035 NODE_NAME_CASE(FRCP)
33036 NODE_NAME_CASE(EXTRQI)
33037 NODE_NAME_CASE(INSERTQI)
33038 NODE_NAME_CASE(TLSADDR)
33039 NODE_NAME_CASE(TLSBASEADDR)
33040 NODE_NAME_CASE(TLSCALL)
33041 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33042 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33043 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33044 NODE_NAME_CASE(EH_RETURN)
33045 NODE_NAME_CASE(TC_RETURN)
33046 NODE_NAME_CASE(FNSTCW16m)
33047 NODE_NAME_CASE(FLDCW16m)
33048 NODE_NAME_CASE(LCMPXCHG_DAG)
33049 NODE_NAME_CASE(LCMPXCHG8_DAG)
33050 NODE_NAME_CASE(LCMPXCHG16_DAG)
33051 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33052 NODE_NAME_CASE(LADD)
33053 NODE_NAME_CASE(LSUB)
33054 NODE_NAME_CASE(LOR)
33055 NODE_NAME_CASE(LXOR)
33056 NODE_NAME_CASE(LAND)
33057 NODE_NAME_CASE(LBTS)
33058 NODE_NAME_CASE(LBTC)
33059 NODE_NAME_CASE(LBTR)
33060 NODE_NAME_CASE(VZEXT_MOVL)
33061 NODE_NAME_CASE(VZEXT_LOAD)
33062 NODE_NAME_CASE(VEXTRACT_STORE)
33063 NODE_NAME_CASE(VTRUNC)
33064 NODE_NAME_CASE(VTRUNCS)
33065 NODE_NAME_CASE(VTRUNCUS)
33066 NODE_NAME_CASE(VMTRUNC)
33067 NODE_NAME_CASE(VMTRUNCS)
33068 NODE_NAME_CASE(VMTRUNCUS)
33069 NODE_NAME_CASE(VTRUNCSTORES)
33070 NODE_NAME_CASE(VTRUNCSTOREUS)
33071 NODE_NAME_CASE(VMTRUNCSTORES)
33072 NODE_NAME_CASE(VMTRUNCSTOREUS)
33073 NODE_NAME_CASE(VFPEXT)
33074 NODE_NAME_CASE(STRICT_VFPEXT)
33075 NODE_NAME_CASE(VFPEXT_SAE)
33076 NODE_NAME_CASE(VFPEXTS)
33077 NODE_NAME_CASE(VFPEXTS_SAE)
33078 NODE_NAME_CASE(VFPROUND)
33079 NODE_NAME_CASE(STRICT_VFPROUND)
33080 NODE_NAME_CASE(VMFPROUND)
33081 NODE_NAME_CASE(VFPROUND_RND)
33082 NODE_NAME_CASE(VFPROUNDS)
33083 NODE_NAME_CASE(VFPROUNDS_RND)
33084 NODE_NAME_CASE(VSHLDQ)
33085 NODE_NAME_CASE(VSRLDQ)
33086 NODE_NAME_CASE(VSHL)
33087 NODE_NAME_CASE(VSRL)
33088 NODE_NAME_CASE(VSRA)
33089 NODE_NAME_CASE(VSHLI)
33090 NODE_NAME_CASE(VSRLI)
33091 NODE_NAME_CASE(VSRAI)
33092 NODE_NAME_CASE(VSHLV)
33093 NODE_NAME_CASE(VSRLV)
33094 NODE_NAME_CASE(VSRAV)
33095 NODE_NAME_CASE(VROTLI)
33096 NODE_NAME_CASE(VROTRI)
33097 NODE_NAME_CASE(VPPERM)
33098 NODE_NAME_CASE(CMPP)
33099 NODE_NAME_CASE(STRICT_CMPP)
33100 NODE_NAME_CASE(PCMPEQ)
33101 NODE_NAME_CASE(PCMPGT)
33102 NODE_NAME_CASE(PHMINPOS)
33103 NODE_NAME_CASE(ADD)
33104 NODE_NAME_CASE(SUB)
33105 NODE_NAME_CASE(ADC)
33106 NODE_NAME_CASE(SBB)
33107 NODE_NAME_CASE(SMUL)
33108 NODE_NAME_CASE(UMUL)
33109 NODE_NAME_CASE(OR)
33110 NODE_NAME_CASE(XOR)
33111 NODE_NAME_CASE(AND)
33112 NODE_NAME_CASE(BEXTR)
33113 NODE_NAME_CASE(BEXTRI)
33114 NODE_NAME_CASE(BZHI)
33115 NODE_NAME_CASE(PDEP)
33116 NODE_NAME_CASE(PEXT)
33117 NODE_NAME_CASE(MUL_IMM)
33118 NODE_NAME_CASE(MOVMSK)
33119 NODE_NAME_CASE(PTEST)
33120 NODE_NAME_CASE(TESTP)
33121 NODE_NAME_CASE(KORTEST)
33122 NODE_NAME_CASE(KTEST)
33123 NODE_NAME_CASE(KADD)
33124 NODE_NAME_CASE(KSHIFTL)
33125 NODE_NAME_CASE(KSHIFTR)
33126 NODE_NAME_CASE(PACKSS)
33127 NODE_NAME_CASE(PACKUS)
33128 NODE_NAME_CASE(PALIGNR)
33129 NODE_NAME_CASE(VALIGN)
33130 NODE_NAME_CASE(VSHLD)
33131 NODE_NAME_CASE(VSHRD)
33132 NODE_NAME_CASE(VSHLDV)
33133 NODE_NAME_CASE(VSHRDV)
33134 NODE_NAME_CASE(PSHUFD)
33135 NODE_NAME_CASE(PSHUFHW)
33136 NODE_NAME_CASE(PSHUFLW)
33137 NODE_NAME_CASE(SHUFP)
33138 NODE_NAME_CASE(SHUF128)
33139 NODE_NAME_CASE(MOVLHPS)
33140 NODE_NAME_CASE(MOVHLPS)
33141 NODE_NAME_CASE(MOVDDUP)
33142 NODE_NAME_CASE(MOVSHDUP)
33143 NODE_NAME_CASE(MOVSLDUP)
33144 NODE_NAME_CASE(MOVSD)
33145 NODE_NAME_CASE(MOVSS)
33146 NODE_NAME_CASE(MOVSH)
33147 NODE_NAME_CASE(UNPCKL)
33148 NODE_NAME_CASE(UNPCKH)
33149 NODE_NAME_CASE(VBROADCAST)
33150 NODE_NAME_CASE(VBROADCAST_LOAD)
33151 NODE_NAME_CASE(VBROADCASTM)
33152 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33153 NODE_NAME_CASE(VPERMILPV)
33154 NODE_NAME_CASE(VPERMILPI)
33155 NODE_NAME_CASE(VPERM2X128)
33156 NODE_NAME_CASE(VPERMV)
33157 NODE_NAME_CASE(VPERMV3)
33158 NODE_NAME_CASE(VPERMI)
33159 NODE_NAME_CASE(VPTERNLOG)
33160 NODE_NAME_CASE(VFIXUPIMM)
33161 NODE_NAME_CASE(VFIXUPIMM_SAE)
33162 NODE_NAME_CASE(VFIXUPIMMS)
33163 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33164 NODE_NAME_CASE(VRANGE)
33165 NODE_NAME_CASE(VRANGE_SAE)
33166 NODE_NAME_CASE(VRANGES)
33167 NODE_NAME_CASE(VRANGES_SAE)
33168 NODE_NAME_CASE(PMULUDQ)
33169 NODE_NAME_CASE(PMULDQ)
33170 NODE_NAME_CASE(PSADBW)
33171 NODE_NAME_CASE(DBPSADBW)
33172 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33173 NODE_NAME_CASE(VAARG_64)
33174 NODE_NAME_CASE(VAARG_X32)
33175 NODE_NAME_CASE(DYN_ALLOCA)
33176 NODE_NAME_CASE(MEMBARRIER)
33177 NODE_NAME_CASE(MFENCE)
33178 NODE_NAME_CASE(SEG_ALLOCA)
33179 NODE_NAME_CASE(PROBED_ALLOCA)
33180 NODE_NAME_CASE(RDRAND)
33181 NODE_NAME_CASE(RDSEED)
33182 NODE_NAME_CASE(RDPKRU)
33183 NODE_NAME_CASE(WRPKRU)
33184 NODE_NAME_CASE(VPMADDUBSW)
33185 NODE_NAME_CASE(VPMADDWD)
33186 NODE_NAME_CASE(VPSHA)
33187 NODE_NAME_CASE(VPSHL)
33188 NODE_NAME_CASE(VPCOM)
33189 NODE_NAME_CASE(VPCOMU)
33190 NODE_NAME_CASE(VPERMIL2)
33191 NODE_NAME_CASE(FMSUB)
33192 NODE_NAME_CASE(STRICT_FMSUB)
33193 NODE_NAME_CASE(FNMADD)
33194 NODE_NAME_CASE(STRICT_FNMADD)
33195 NODE_NAME_CASE(FNMSUB)
33196 NODE_NAME_CASE(STRICT_FNMSUB)
33197 NODE_NAME_CASE(FMADDSUB)
33198 NODE_NAME_CASE(FMSUBADD)
33199 NODE_NAME_CASE(FMADD_RND)
33200 NODE_NAME_CASE(FNMADD_RND)
33201 NODE_NAME_CASE(FMSUB_RND)
33202 NODE_NAME_CASE(FNMSUB_RND)
33203 NODE_NAME_CASE(FMADDSUB_RND)
33204 NODE_NAME_CASE(FMSUBADD_RND)
33205 NODE_NAME_CASE(VFMADDC)
33206 NODE_NAME_CASE(VFMADDC_RND)
33207 NODE_NAME_CASE(VFCMADDC)
33208 NODE_NAME_CASE(VFCMADDC_RND)
33209 NODE_NAME_CASE(VFMULC)
33210 NODE_NAME_CASE(VFMULC_RND)
33211 NODE_NAME_CASE(VFCMULC)
33212 NODE_NAME_CASE(VFCMULC_RND)
33213 NODE_NAME_CASE(VFMULCSH)
33214 NODE_NAME_CASE(VFMULCSH_RND)
33215 NODE_NAME_CASE(VFCMULCSH)
33216 NODE_NAME_CASE(VFCMULCSH_RND)
33217 NODE_NAME_CASE(VFMADDCSH)
33218 NODE_NAME_CASE(VFMADDCSH_RND)
33219 NODE_NAME_CASE(VFCMADDCSH)
33220 NODE_NAME_CASE(VFCMADDCSH_RND)
33221 NODE_NAME_CASE(VPMADD52H)
33222 NODE_NAME_CASE(VPMADD52L)
33223 NODE_NAME_CASE(VRNDSCALE)
33224 NODE_NAME_CASE(STRICT_VRNDSCALE)
33225 NODE_NAME_CASE(VRNDSCALE_SAE)
33226 NODE_NAME_CASE(VRNDSCALES)
33227 NODE_NAME_CASE(VRNDSCALES_SAE)
33228 NODE_NAME_CASE(VREDUCE)
33229 NODE_NAME_CASE(VREDUCE_SAE)
33230 NODE_NAME_CASE(VREDUCES)
33231 NODE_NAME_CASE(VREDUCES_SAE)
33232 NODE_NAME_CASE(VGETMANT)
33233 NODE_NAME_CASE(VGETMANT_SAE)
33234 NODE_NAME_CASE(VGETMANTS)
33235 NODE_NAME_CASE(VGETMANTS_SAE)
33236 NODE_NAME_CASE(PCMPESTR)
33237 NODE_NAME_CASE(PCMPISTR)
33238 NODE_NAME_CASE(XTEST)
33239 NODE_NAME_CASE(COMPRESS)
33240 NODE_NAME_CASE(EXPAND)
33241 NODE_NAME_CASE(SELECTS)
33242 NODE_NAME_CASE(ADDSUB)
33243 NODE_NAME_CASE(RCP14)
33244 NODE_NAME_CASE(RCP14S)
33245 NODE_NAME_CASE(RCP28)
33246 NODE_NAME_CASE(RCP28_SAE)
33247 NODE_NAME_CASE(RCP28S)
33248 NODE_NAME_CASE(RCP28S_SAE)
33249 NODE_NAME_CASE(EXP2)
33250 NODE_NAME_CASE(EXP2_SAE)
33251 NODE_NAME_CASE(RSQRT14)
33252 NODE_NAME_CASE(RSQRT14S)
33253 NODE_NAME_CASE(RSQRT28)
33254 NODE_NAME_CASE(RSQRT28_SAE)
33255 NODE_NAME_CASE(RSQRT28S)
33256 NODE_NAME_CASE(RSQRT28S_SAE)
33257 NODE_NAME_CASE(FADD_RND)
33258 NODE_NAME_CASE(FADDS)
33259 NODE_NAME_CASE(FADDS_RND)
33260 NODE_NAME_CASE(FSUB_RND)
33261 NODE_NAME_CASE(FSUBS)
33262 NODE_NAME_CASE(FSUBS_RND)
33263 NODE_NAME_CASE(FMUL_RND)
33264 NODE_NAME_CASE(FMULS)
33265 NODE_NAME_CASE(FMULS_RND)
33266 NODE_NAME_CASE(FDIV_RND)
33267 NODE_NAME_CASE(FDIVS)
33268 NODE_NAME_CASE(FDIVS_RND)
33269 NODE_NAME_CASE(FSQRT_RND)
33270 NODE_NAME_CASE(FSQRTS)
33271 NODE_NAME_CASE(FSQRTS_RND)
33272 NODE_NAME_CASE(FGETEXP)
33273 NODE_NAME_CASE(FGETEXP_SAE)
33274 NODE_NAME_CASE(FGETEXPS)
33275 NODE_NAME_CASE(FGETEXPS_SAE)
33276 NODE_NAME_CASE(SCALEF)
33277 NODE_NAME_CASE(SCALEF_RND)
33278 NODE_NAME_CASE(SCALEFS)
33279 NODE_NAME_CASE(SCALEFS_RND)
33280 NODE_NAME_CASE(MULHRS)
33281 NODE_NAME_CASE(SINT_TO_FP_RND)
33282 NODE_NAME_CASE(UINT_TO_FP_RND)
33283 NODE_NAME_CASE(CVTTP2SI)
33284 NODE_NAME_CASE(CVTTP2UI)
33285 NODE_NAME_CASE(STRICT_CVTTP2SI)
33286 NODE_NAME_CASE(STRICT_CVTTP2UI)
33287 NODE_NAME_CASE(MCVTTP2SI)
33288 NODE_NAME_CASE(MCVTTP2UI)
33289 NODE_NAME_CASE(CVTTP2SI_SAE)
33290 NODE_NAME_CASE(CVTTP2UI_SAE)
33291 NODE_NAME_CASE(CVTTS2SI)
33292 NODE_NAME_CASE(CVTTS2UI)
33293 NODE_NAME_CASE(CVTTS2SI_SAE)
33294 NODE_NAME_CASE(CVTTS2UI_SAE)
33295 NODE_NAME_CASE(CVTSI2P)
33296 NODE_NAME_CASE(CVTUI2P)
33297 NODE_NAME_CASE(STRICT_CVTSI2P)
33298 NODE_NAME_CASE(STRICT_CVTUI2P)
33299 NODE_NAME_CASE(MCVTSI2P)
33300 NODE_NAME_CASE(MCVTUI2P)
33301 NODE_NAME_CASE(VFPCLASS)
33302 NODE_NAME_CASE(VFPCLASSS)
33303 NODE_NAME_CASE(MULTISHIFT)
33304 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33305 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33306 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33307 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33308 NODE_NAME_CASE(CVTPS2PH)
33309 NODE_NAME_CASE(STRICT_CVTPS2PH)
33310 NODE_NAME_CASE(MCVTPS2PH)
33311 NODE_NAME_CASE(CVTPH2PS)
33312 NODE_NAME_CASE(STRICT_CVTPH2PS)
33313 NODE_NAME_CASE(CVTPH2PS_SAE)
33314 NODE_NAME_CASE(CVTP2SI)
33315 NODE_NAME_CASE(CVTP2UI)
33316 NODE_NAME_CASE(MCVTP2SI)
33317 NODE_NAME_CASE(MCVTP2UI)
33318 NODE_NAME_CASE(CVTP2SI_RND)
33319 NODE_NAME_CASE(CVTP2UI_RND)
33320 NODE_NAME_CASE(CVTS2SI)
33321 NODE_NAME_CASE(CVTS2UI)
33322 NODE_NAME_CASE(CVTS2SI_RND)
33323 NODE_NAME_CASE(CVTS2UI_RND)
33324 NODE_NAME_CASE(CVTNE2PS2BF16)
33325 NODE_NAME_CASE(CVTNEPS2BF16)
33326 NODE_NAME_CASE(MCVTNEPS2BF16)
33327 NODE_NAME_CASE(DPBF16PS)
33328 NODE_NAME_CASE(LWPINS)
33329 NODE_NAME_CASE(MGATHER)
33330 NODE_NAME_CASE(MSCATTER)
33331 NODE_NAME_CASE(VPDPBUSD)
33332 NODE_NAME_CASE(VPDPBUSDS)
33333 NODE_NAME_CASE(VPDPWSSD)
33334 NODE_NAME_CASE(VPDPWSSDS)
33335 NODE_NAME_CASE(VPSHUFBITQMB)
33336 NODE_NAME_CASE(GF2P8MULB)
33337 NODE_NAME_CASE(GF2P8AFFINEQB)
33338 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33339 NODE_NAME_CASE(NT_CALL)
33340 NODE_NAME_CASE(NT_BRIND)
33341 NODE_NAME_CASE(UMWAIT)
33342 NODE_NAME_CASE(TPAUSE)
33343 NODE_NAME_CASE(ENQCMD)
33344 NODE_NAME_CASE(ENQCMDS)
33345 NODE_NAME_CASE(VP2INTERSECT)
33346 NODE_NAME_CASE(AESENC128KL)
33347 NODE_NAME_CASE(AESDEC128KL)
33348 NODE_NAME_CASE(AESENC256KL)
33349 NODE_NAME_CASE(AESDEC256KL)
33350 NODE_NAME_CASE(AESENCWIDE128KL)
33351 NODE_NAME_CASE(AESDECWIDE128KL)
33352 NODE_NAME_CASE(AESENCWIDE256KL)
33353 NODE_NAME_CASE(AESDECWIDE256KL)
33354 NODE_NAME_CASE(TESTUI)
33355 }
33356 return nullptr;
33357#undef NODE_NAME_CASE
33358}
33359
33360/// Return true if the addressing mode represented by AM is legal for this
33361/// target, for a load/store of the specified type.
33362bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33363 const AddrMode &AM, Type *Ty,
33364 unsigned AS,
33365 Instruction *I) const {
33366 // X86 supports extremely general addressing modes.
33367 CodeModel::Model M = getTargetMachine().getCodeModel();
33368
33369 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33370 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33371 return false;
33372
33373 if (AM.BaseGV) {
33374 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33375
33376 // If a reference to this global requires an extra load, we can't fold it.
33377 if (isGlobalStubReference(GVFlags))
33378 return false;
33379
33380 // If BaseGV requires a register for the PIC base, we cannot also have a
33381 // BaseReg specified.
33382 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33383 return false;
33384
33385 // If lower 4G is not available, then we must use rip-relative addressing.
33386 if ((M != CodeModel::Small || isPositionIndependent()) &&
33387 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33388 return false;
33389 }
33390
33391 switch (AM.Scale) {
33392 case 0:
33393 case 1:
33394 case 2:
33395 case 4:
33396 case 8:
33397 // These scales always work.
33398 break;
33399 case 3:
33400 case 5:
33401 case 9:
33402 // These scales are formed with basereg+scalereg. Only accept if there is
33403 // no basereg yet.
33404 if (AM.HasBaseReg)
33405 return false;
33406 break;
33407 default: // Other stuff never works.
33408 return false;
33409 }
33410
33411 return true;
33412}
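// The scale rule above mirrors what an x86 SIB byte can encode: base +
// index*scale + disp with scale in {1,2,4,8}, while 3, 5 and 9 are only
// reachable as base + index*scale (e.g. lea (%rax,%rax,2), %rcx) and so
// consume the base-register slot. A minimal standalone sketch of just that
// rule; isLegalX86Scale is an illustrative helper, not part of this file:

static bool isLegalX86Scale(int Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;            // directly encodable in the SIB byte
  case 3: case 5: case 9:
    return !HasBaseReg;     // needs the base register to hold the extra index
  default:
    return false;
  }
}

// e.g. isLegalX86Scale(3, /*HasBaseReg=*/false) == true  (index*2 + index)
//      isLegalX86Scale(3, /*HasBaseReg=*/true)  == false (base slot in use)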
33413
33414bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33415 unsigned Bits = Ty->getScalarSizeInBits();
33416
33417 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33418 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33419 if (Subtarget.hasXOP() &&
33420 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33421 return false;
33422
33423 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33424 // shifts just as cheap as scalar ones.
33425 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33426 return false;
33427
33428 // AVX512BW has shifts such as vpsllvw.
33429 if (Subtarget.hasBWI() && Bits == 16)
33430 return false;
33431
33432 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33433 // fully general vector.
33434 return true;
33435}
33436
33437bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33438 switch (Opcode) {
33439 // These are non-commutative binops.
33440 // TODO: Add more X86ISD opcodes once we have test coverage.
33441 case X86ISD::ANDNP:
33442 case X86ISD::PCMPGT:
33443 case X86ISD::FMAX:
33444 case X86ISD::FMIN:
33445 case X86ISD::FANDN:
33446 case X86ISD::VPSHA:
33447 case X86ISD::VPSHL:
33448 case X86ISD::VSHLV:
33449 case X86ISD::VSRLV:
33450 case X86ISD::VSRAV:
33451 return true;
33452 }
33453
33454 return TargetLoweringBase::isBinOp(Opcode);
33455}
33456
33457bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33458 switch (Opcode) {
33459 // TODO: Add more X86ISD opcodes once we have test coverage.
33460 case X86ISD::PCMPEQ:
33461 case X86ISD::PMULDQ:
33462 case X86ISD::PMULUDQ:
33463 case X86ISD::FMAXC:
33464 case X86ISD::FMINC:
33465 case X86ISD::FAND:
33466 case X86ISD::FOR:
33467 case X86ISD::FXOR:
33468 return true;
33469 }
33470
33471 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33472}
33473
33474bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33475 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33476 return false;
33477 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33478 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33479 return NumBits1 > NumBits2;
33480}
33481
33482bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
33483 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33484 return false;
33485
33486 if (!isTypeLegal(EVT::getEVT(Ty1)))
33487 return false;
33488
33489 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
33490
33491 // Assuming the caller doesn't have a zeroext or signext return parameter,
33492 // truncation all the way down to i1 is valid.
33493 return true;
33494}
33495
33496bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
33497 return isInt<32>(Imm);
33498}
33499
33500bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
33501 // Can also use sub to handle negated immediates.
33502 return isInt<32>(Imm);
33503}
33504
33505bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
33506 return isInt<32>(Imm);
33507}
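// All three hooks above accept exactly the immediates that fit x86's
// sign-extended 32-bit immediate field. A standalone sketch of that check;
// fitsInSExt32 is an illustrative stand-in for llvm::isInt<32>:

#include <cstdint>

static bool fitsInSExt32(int64_t Imm) {
  return Imm >= INT32_MIN && Imm <= INT32_MAX;
}

// e.g. fitsInSExt32(-1)           == true   (also usable negated via sub)
//      fitsInSExt32(0x7fffffff)   == true
//      fitsInSExt32(0x80000000LL) == false  (needs a register/movabs instead)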
33508
33509bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
33510 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
33511 return false;
33512 unsigned NumBits1 = VT1.getSizeInBits();
33513 unsigned NumBits2 = VT2.getSizeInBits();
33514 return NumBits1 > NumBits2;
33515}
33516
33517bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
33518 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33519 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
33520}
33521
33522bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
33523 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33524 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
33525}
33526
33527bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
33528 EVT VT1 = Val.getValueType();
33529 if (isZExtFree(VT1, VT2))
33530 return true;
33531
33532 if (Val.getOpcode() != ISD::LOAD)
33533 return false;
33534
33535 if (!VT1.isSimple() || !VT1.isInteger() ||
33536 !VT2.isSimple() || !VT2.isInteger())
33537 return false;
33538
33539 switch (VT1.getSimpleVT().SimpleTy) {
33540 default: break;
33541 case MVT::i8:
33542 case MVT::i16:
33543 case MVT::i32:
33544 // X86 has 8, 16, and 32-bit zero-extending loads.
33545 return true;
33546 }
33547
33548 return false;
33549}
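// For example, "movl (%rdi), %eax" already clears bits 63:32 of %rax, and
// i8/i16 loads are available as movzbl/movzwl, so the later zext to i64 that
// the switch above recognizes costs no extra instruction.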
33550
33551bool X86TargetLowering::shouldSinkOperands(Instruction *I,
33552 SmallVectorImpl<Use *> &Ops) const {
33553 using namespace llvm::PatternMatch;
33554
33555 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
33556 if (!VTy)
33557 return false;
33558
33559 if (I->getOpcode() == Instruction::Mul &&
33560 VTy->getElementType()->isIntegerTy(64)) {
33561 for (auto &Op : I->operands()) {
33562 // Make sure we are not already sinking this operand
33563 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
33564 continue;
33565
33566 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
33567 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
33568 if (Subtarget.hasSSE41() &&
33569 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
33570 m_SpecificInt(32)))) {
33571 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
33572 Ops.push_back(&Op);
33573 } else if (Subtarget.hasSSE2() &&
33574 match(Op.get(),
33575 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
33576 Ops.push_back(&Op);
33577 }
33578 }
33579
33580 return !Ops.empty();
33581 }
33582
33583 // A uniform shift amount in a vector shift or funnel shift may be much
33584 // cheaper than a generic variable vector shift, so make that pattern visible
33585 // to SDAG by sinking the shuffle instruction next to the shift.
33586 int ShiftAmountOpNum = -1;
33587 if (I->isShift())
33588 ShiftAmountOpNum = 1;
33589 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
33590 if (II->getIntrinsicID() == Intrinsic::fshl ||
33591 II->getIntrinsicID() == Intrinsic::fshr)
33592 ShiftAmountOpNum = 2;
33593 }
33594
33595 if (ShiftAmountOpNum == -1)
33596 return false;
33597
33598 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
33599 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
33600 isVectorShiftByScalarCheap(I->getType())) {
33601 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
33602 return true;
33603 }
33604
33605 return false;
33606}
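// The two operand shapes matched above are the scalar identities behind
// PMULDQ/PMULUDQ, which multiply only the low 32 bits of each 64-bit lane.
// A standalone sketch of those identities in plain C++ (illustrative only;
// the matcher above works on vector IR, not on scalars like this):

#include <cassert>
#include <cstdint>

static int64_t sextInReg32(uint64_t X) {
  // m_AShr(m_Shl(x, 32), 32): keep the low 32 bits, sign-extended (PMULDQ input).
  return (int64_t)(int32_t)(uint32_t)X;
}

static uint64_t zextInReg32(uint64_t X) {
  // m_And(x, 0xffffffff): keep the low 32 bits, zero-extended (PMULUDQ input).
  return X & UINT64_C(0xffffffff);
}

int main() {
  assert(sextInReg32(UINT64_C(0x00000000fffffffe)) == -2);
  assert(zextInReg32(UINT64_C(0xdeadbeef00000005)) == 5);
  return 0;
}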
33607
33608bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
33609 if (!Subtarget.is64Bit())
33610 return false;
33611 return TargetLowering::shouldConvertPhiType(From, To);
33612}
33613
33614bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
33615 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
33616 return false;
33617
33618 EVT SrcVT = ExtVal.getOperand(0).getValueType();
33619
33620 // There is no extending load for vXi1.
33621 if (SrcVT.getScalarType() == MVT::i1)
33622 return false;
33623
33624 return true;
33625}
33626
33627bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
33628 EVT VT) const {
33629 if (!Subtarget.hasAnyFMA())
33630 return false;
33631
33632 VT = VT.getScalarType();
33633
33634 if (!VT.isSimple())
33635 return false;
33636
33637 switch (VT.getSimpleVT().SimpleTy) {
33638 case MVT::f16:
33639 return Subtarget.hasFP16();
33640 case MVT::f32:
33641 case MVT::f64:
33642 return true;
33643 default:
33644 break;
33645 }
33646
33647 return false;
33648}
33649
33650bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
33651 // i16 instructions are longer (0x66 prefix) and potentially slower.
33652 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
33653}
33654
33655bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
33656 EVT VT) const {
33657 // TODO: This is too general. There are cases where pre-AVX512 codegen would
33658 // benefit. The transform may also be profitable for scalar code.
33659 if (!Subtarget.hasAVX512())
33660 return false;
33661 if (!Subtarget.hasVLX() && !VT.is512BitVector())
33662 return false;
33663 if (!VT.isVector())
33664 return false;
33665
33666 return true;
33667}
33668
33669/// Targets can use this to indicate that they only support *some*
33670/// VECTOR_SHUFFLE operations, those with specific masks.
33671/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
33672/// are assumed to be legal.
33673bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
33674 if (!VT.isSimple())
33675 return false;
33676
33677 // Not for i1 vectors
33678 if (VT.getSimpleVT().getScalarType() == MVT::i1)
33679 return false;
33680
33681 // Very little shuffling can be done for 64-bit vectors right now.
33682 if (VT.getSimpleVT().getSizeInBits() == 64)
33683 return false;
33684
33685 // We only care that the types being shuffled are legal. The lowering can
33686 // handle any possible shuffle mask that results.
33687 return isTypeLegal(VT.getSimpleVT());
33688}
33689
33690bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
33691 EVT VT) const {
33692 // Don't convert an 'and' into a shuffle that we don't directly support.
33693 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
33694 if (!Subtarget.hasAVX2())
33695 if (VT == MVT::v32i8 || VT == MVT::v16i16)
33696 return false;
33697
33698 // Just delegate to the generic legality, clear masks aren't special.
33699 return isShuffleMaskLegal(Mask, VT);
33700}
33701
33702bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
33703 // If the subtarget is using thunks, we need to not generate jump tables.
33704 if (Subtarget.useIndirectThunkBranches())
33705 return false;
33706
33707 // Otherwise, fallback on the generic logic.
33708 return TargetLowering::areJTsAllowed(Fn);
33709}
33710
33711//===----------------------------------------------------------------------===//
33712// X86 Scheduler Hooks
33713//===----------------------------------------------------------------------===//
33714
33715// Returns true if EFLAG is consumed after this iterator in the rest of the
33716// basic block or any successors of the basic block.
33717static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
33718 MachineBasicBlock *BB) {
33719 // Scan forward through BB for a use/def of EFLAGS.
33720 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
33721 if (mi.readsRegister(X86::EFLAGS))
33722 return true;
33723 // If we found a def, we can stop searching.
33724 if (mi.definesRegister(X86::EFLAGS))
33725 return false;
33726 }
33727
33728 // If we hit the end of the block, check whether EFLAGS is live into a
33729 // successor.
33730 for (MachineBasicBlock *Succ : BB->successors())
33731 if (Succ->isLiveIn(X86::EFLAGS))
33732 return true;
33733
33734 return false;
33735}
33736
33737/// Utility function to emit xbegin specifying the start of an RTM region.
33738static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
33739 const TargetInstrInfo *TII) {
33740 const DebugLoc &DL = MI.getDebugLoc();
33741
33742 const BasicBlock *BB = MBB->getBasicBlock();
33743 MachineFunction::iterator I = ++MBB->getIterator();
33744
33745 // For the v = xbegin(), we generate
33746 //
33747 // thisMBB:
33748 // xbegin sinkMBB
33749 //
33750 // mainMBB:
33751 // s0 = -1
33752 //
33753 // fallBB:
33754 // eax = # XABORT_DEF
33755 // s1 = eax
33756 //
33757 // sinkMBB:
33758 // v = phi(s0/mainBB, s1/fallBB)
33759
33760 MachineBasicBlock *thisMBB = MBB;
33761 MachineFunction *MF = MBB->getParent();
33762 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33763 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33764 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33765 MF->insert(I, mainMBB);
33766 MF->insert(I, fallMBB);
33767 MF->insert(I, sinkMBB);
33768
33769 if (isEFLAGSLiveAfter(MI, MBB)) {
33770 mainMBB->addLiveIn(X86::EFLAGS);
33771 fallMBB->addLiveIn(X86::EFLAGS);
33772 sinkMBB->addLiveIn(X86::EFLAGS);
33773 }
33774
33775 // Transfer the remainder of BB and its successor edges to sinkMBB.
33776 sinkMBB->splice(sinkMBB->begin(), MBB,
33777 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33778 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33779
33780 MachineRegisterInfo &MRI = MF->getRegInfo();
33781 Register DstReg = MI.getOperand(0).getReg();
33782 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33783 Register mainDstReg = MRI.createVirtualRegister(RC);
33784 Register fallDstReg = MRI.createVirtualRegister(RC);
33785
33786 // thisMBB:
33787 // xbegin fallMBB
33788 // # fallthrough to mainMBB
33789 // # abortion to fallMBB
33790 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
33791 thisMBB->addSuccessor(mainMBB);
33792 thisMBB->addSuccessor(fallMBB);
33793
33794 // mainMBB:
33795 // mainDstReg := -1
33796 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
33797 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33798 mainMBB->addSuccessor(sinkMBB);
33799
33800 // fallMBB:
33801 // ; pseudo instruction to model hardware's definition from XABORT
33802 // EAX := XABORT_DEF
33803 // fallDstReg := EAX
33804 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
33805 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
33806 .addReg(X86::EAX);
33807 fallMBB->addSuccessor(sinkMBB);
33808
33809 // sinkMBB:
33810 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
33811 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
33812 .addReg(mainDstReg).addMBB(mainMBB)
33813 .addReg(fallDstReg).addMBB(fallMBB);
33814
33815 MI.eraseFromParent();
33816 return sinkMBB;
33817}
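// At the source level, the pseudo expanded above corresponds to the RTM
// intrinsics from <immintrin.h>; roughly (assuming a target compiled with
// -mrtm, shown only to illustrate where the two PHI inputs come from):
//
//   unsigned Status = _xbegin();      // XBEGIN_4: fall through on start,
//                                     // branch to the abort path otherwise
//   if (Status == _XBEGIN_STARTED) {  // mainMBB input: -1 (all ones)
//     /* transactional region */
//     _xend();
//   } else {
//     /* fallMBB input: the abort status the hardware left in EAX */
//   }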
33818
33819MachineBasicBlock *
33820X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
33821 MachineBasicBlock *MBB) const {
33822 // Emit va_arg instruction on X86-64.
33823
33824 // Operands to this pseudo-instruction:
33825 // 0 ) Output : destination address (reg)
33826 // 1-5) Input : va_list address (addr, i64mem)
33827 // 6 ) ArgSize : Size (in bytes) of vararg type
33828 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
33829 // 8 ) Align : Alignment of type
33830 // 9 ) EFLAGS (implicit-def)
33831
33832 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
33833 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
33834
33835 Register DestReg = MI.getOperand(0).getReg();
33836 MachineOperand &Base = MI.getOperand(1);
33837 MachineOperand &Scale = MI.getOperand(2);
33838 MachineOperand &Index = MI.getOperand(3);
33839 MachineOperand &Disp = MI.getOperand(4);
33840 MachineOperand &Segment = MI.getOperand(5);
33841 unsigned ArgSize = MI.getOperand(6).getImm();
33842 unsigned ArgMode = MI.getOperand(7).getImm();
33843 Align Alignment = Align(MI.getOperand(8).getImm());
33844
33845 MachineFunction *MF = MBB->getParent();
33846
33847 // Memory Reference
33848 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
33849
33850 MachineMemOperand *OldMMO = MI.memoperands().front();
33851
33852 // Clone the MMO into two separate MMOs for loading and storing
33853 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
33854 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
33855 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
33856 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
33857
33858 // Machine Information
33859 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33860 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
33861 const TargetRegisterClass *AddrRegClass =
33862 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
33863 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
33864 const DebugLoc &DL = MI.getDebugLoc();
33865
33866 // struct va_list {
33867 // i32 gp_offset
33868 // i32 fp_offset
33869 // i64 overflow_area (address)
33870 // i64 reg_save_area (address)
33871 // }
33872 // sizeof(va_list) = 24
33873 // alignment(va_list) = 8
33874
33875 unsigned TotalNumIntRegs = 6;
33876 unsigned TotalNumXMMRegs = 8;
33877 bool UseGPOffset = (ArgMode == 1);
33878 bool UseFPOffset = (ArgMode == 2);
33879 unsigned MaxOffset = TotalNumIntRegs * 8 +
33880 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
33881
33882 /* Align ArgSize to a multiple of 8 */
33883 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
33884 bool NeedsAlign = (Alignment > 8);
33885
33886 MachineBasicBlock *thisMBB = MBB;
33887 MachineBasicBlock *overflowMBB;
33888 MachineBasicBlock *offsetMBB;
33889 MachineBasicBlock *endMBB;
33890
33891 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
33892 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
33893 unsigned OffsetReg = 0;
33894
33895 if (!UseGPOffset && !UseFPOffset) {
33896 // If we only pull from the overflow region, we don't create a branch.
33897 // We don't need to alter control flow.
33898 OffsetDestReg = 0; // unused
33899 OverflowDestReg = DestReg;
33900
33901 offsetMBB = nullptr;
33902 overflowMBB = thisMBB;
33903 endMBB = thisMBB;
33904 } else {
33905 // First emit code to check if gp_offset (or fp_offset) is below the bound.
33906 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33907 // If not, pull from overflow_area. (branch to overflowMBB)
33908 //
33909 // thisMBB
33910 // | .
33911 // | .
33912 // offsetMBB overflowMBB
33913 // | .
33914 // | .
33915 // endMBB
33916
33917 // Registers for the PHI in endMBB
33918 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33919 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33920
33921 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33922 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33923 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33924 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33925
33926 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33927
33928 // Insert the new basic blocks
33929 MF->insert(MBBIter, offsetMBB);
33930 MF->insert(MBBIter, overflowMBB);
33931 MF->insert(MBBIter, endMBB);
33932
33933 // Transfer the remainder of MBB and its successor edges to endMBB.
33934 endMBB->splice(endMBB->begin(), thisMBB,
33935 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
33936 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
33937
33938 // Make offsetMBB and overflowMBB successors of thisMBB
33939 thisMBB->addSuccessor(offsetMBB);
33940 thisMBB->addSuccessor(overflowMBB);
33941
33942 // endMBB is a successor of both offsetMBB and overflowMBB
33943 offsetMBB->addSuccessor(endMBB);
33944 overflowMBB->addSuccessor(endMBB);
33945
33946 // Load the offset value into a register
33947 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33948 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
33949 .add(Base)
33950 .add(Scale)
33951 .add(Index)
33952 .addDisp(Disp, UseFPOffset ? 4 : 0)
33953 .add(Segment)
33954 .setMemRefs(LoadOnlyMMO);
33955
33956 // Check if there is enough room left to pull this argument.
33957 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
33958 .addReg(OffsetReg)
33959 .addImm(MaxOffset + 8 - ArgSizeA8);
33960
33961 // Branch to "overflowMBB" if offset >= max
33962 // Fall through to "offsetMBB" otherwise
33963 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
33964 .addMBB(overflowMBB).addImm(X86::COND_AE);
33965 }
33966
33967 // In offsetMBB, emit code to use the reg_save_area.
33968 if (offsetMBB) {
33969 assert(OffsetReg != 0);
33970
33971 // Read the reg_save_area address.
33972 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
33973 BuildMI(
33974 offsetMBB, DL,
33975 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33976 RegSaveReg)
33977 .add(Base)
33978 .add(Scale)
33979 .add(Index)
33980 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
33981 .add(Segment)
33982 .setMemRefs(LoadOnlyMMO);
33983
33984 if (Subtarget.isTarget64BitLP64()) {
33985 // Zero-extend the offset
33986 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
33987 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
33988 .addImm(0)
33989 .addReg(OffsetReg)
33990 .addImm(X86::sub_32bit);
33991
33992 // Add the offset to the reg_save_area to get the final address.
33993 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
33994 .addReg(OffsetReg64)
33995 .addReg(RegSaveReg);
33996 } else {
33997 // Add the offset to the reg_save_area to get the final address.
33998 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
33999 .addReg(OffsetReg)
34000 .addReg(RegSaveReg);
34001 }
34002
34003 // Compute the offset for the next argument
34004 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34005 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
34006 .addReg(OffsetReg)
34007 .addImm(UseFPOffset ? 16 : 8);
34008
34009 // Store it back into the va_list.
34010 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
34011 .add(Base)
34012 .add(Scale)
34013 .add(Index)
34014 .addDisp(Disp, UseFPOffset ? 4 : 0)
34015 .add(Segment)
34016 .addReg(NextOffsetReg)
34017 .setMemRefs(StoreOnlyMMO);
34018
34019 // Jump to endMBB
34020 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
34021 .addMBB(endMBB);
34022 }
34023
34024 //
34025 // Emit code to use overflow area
34026 //
34027
34028 // Load the overflow_area address into a register.
34029 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34030 BuildMI(overflowMBB, DL,
34031 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34032 OverflowAddrReg)
34033 .add(Base)
34034 .add(Scale)
34035 .add(Index)
34036 .addDisp(Disp, 8)
34037 .add(Segment)
34038 .setMemRefs(LoadOnlyMMO);
34039
34040 // If we need to align it, do so. Otherwise, just copy the address
34041 // to OverflowDestReg.
34042 if (NeedsAlign) {
34043 // Align the overflow address
34044 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34045
34046 // aligned_addr = (addr + (align-1)) & ~(align-1)
34047 BuildMI(
34048 overflowMBB, DL,
34049 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34050 TmpReg)
34051 .addReg(OverflowAddrReg)
34052 .addImm(Alignment.value() - 1);
34053
34054 BuildMI(
34055 overflowMBB, DL,
34056 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34057 OverflowDestReg)
34058 .addReg(TmpReg)
34059 .addImm(~(uint64_t)(Alignment.value() - 1));
34060 } else {
34061 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
34062 .addReg(OverflowAddrReg);
34063 }
34064
34065 // Compute the next overflow address after this argument.
34066 // (the overflow address should be kept 8-byte aligned)
34067 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34068 BuildMI(
34069 overflowMBB, DL,
34070 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34071 NextAddrReg)
34072 .addReg(OverflowDestReg)
34073 .addImm(ArgSizeA8);
34074
34075 // Store the new overflow address.
34076 BuildMI(overflowMBB, DL,
34077 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34078 .add(Base)
34079 .add(Scale)
34080 .add(Index)
34081 .addDisp(Disp, 8)
34082 .add(Segment)
34083 .addReg(NextAddrReg)
34084 .setMemRefs(StoreOnlyMMO);
34085
34086 // If we branched, emit the PHI to the front of endMBB.
34087 if (offsetMBB) {
34088 BuildMI(*endMBB, endMBB->begin(), DL,
34089 TII->get(X86::PHI), DestReg)
34090 .addReg(OffsetDestReg).addMBB(offsetMBB)
34091 .addReg(OverflowDestReg).addMBB(overflowMBB);
34092 }
34093
34094 // Erase the pseudo instruction
34095 MI.eraseFromParent();
34096
34097 return endMBB;
34098}
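// The control flow built above is the usual SysV x86-64 va_arg algorithm:
// pull from the register save area while gp_offset/fp_offset is still in
// range, otherwise fall back to the 8-byte-aligned overflow area. A
// standalone model of the ArgMode == 1 (gp_offset) case; VAListModel and
// takeGPArg are illustrative names, not the real va_list or this file's API:

#include <cstdint>

struct VAListModel {           // mirrors the 24-byte layout described above
  uint32_t gp_offset;          // 0..48, six 8-byte integer register slots
  uint32_t fp_offset;          // 48..176, eight 16-byte XMM register slots
  uint64_t overflow_arg_area;  // address, kept 8-byte aligned
  uint64_t reg_save_area;      // address of the spilled argument registers
};

static uint64_t takeGPArg(VAListModel &VL, unsigned ArgSize) {
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7u;   // same rounding as ArgSizeA8 above
  if (VL.gp_offset + ArgSizeA8 <= 6 * 8) {    // offsetMBB path
    uint64_t Addr = VL.reg_save_area + VL.gp_offset;
    VL.gp_offset += 8;                        // NextOffsetReg = OffsetReg + 8
    return Addr;
  }
  uint64_t Addr = VL.overflow_arg_area;       // overflowMBB path (alignment of
  VL.overflow_arg_area += ArgSizeA8;          // over-aligned types omitted here)
  return Addr;
}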
34099
34100// The EFLAGS operand of SelectItr might be missing a kill marker
34101// because there were multiple uses of EFLAGS, and ISel didn't know
34102// which to mark. Figure out whether SelectItr should have had a
34103// kill marker, and set it if it should. Returns the correct kill
34104// marker value.
34105static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34106 MachineBasicBlock* BB,
34107 const TargetRegisterInfo* TRI) {
34108 if (isEFLAGSLiveAfter(SelectItr, BB))
34109 return false;
34110
34111 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34112 // out. SelectMI should have a kill flag on EFLAGS.
34113 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34114 return true;
34115}
34116
34117// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34118// together with other CMOV pseudo-opcodes into a single basic-block with
34119// conditional jump around it.
34120static bool isCMOVPseudo(MachineInstr &MI) {
34121 switch (MI.getOpcode()) {
34122 case X86::CMOV_FR16X:
34123 case X86::CMOV_FR32:
34124 case X86::CMOV_FR32X:
34125 case X86::CMOV_FR64:
34126 case X86::CMOV_FR64X:
34127 case X86::CMOV_GR8:
34128 case X86::CMOV_GR16:
34129 case X86::CMOV_GR32:
34130 case X86::CMOV_RFP32:
34131 case X86::CMOV_RFP64:
34132 case X86::CMOV_RFP80:
34133 case X86::CMOV_VR64:
34134 case X86::CMOV_VR128:
34135 case X86::CMOV_VR128X:
34136 case X86::CMOV_VR256:
34137 case X86::CMOV_VR256X:
34138 case X86::CMOV_VR512:
34139 case X86::CMOV_VK1:
34140 case X86::CMOV_VK2:
34141 case X86::CMOV_VK4:
34142 case X86::CMOV_VK8:
34143 case X86::CMOV_VK16:
34144 case X86::CMOV_VK32:
34145 case X86::CMOV_VK64:
34146 return true;
34147
34148 default:
34149 return false;
34150 }
34151}
34152
34153// Helper function, which inserts PHI functions into SinkMBB:
34154// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34155// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
34156// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34157// the last PHI function inserted.
34158static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34159 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34160 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34161 MachineBasicBlock *SinkMBB) {
34162 MachineFunction *MF = TrueMBB->getParent();
34163 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34164 const DebugLoc &DL = MIItBegin->getDebugLoc();
34165
34166 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34167 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34168
34169 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34170
34171 // As we are creating the PHIs, we have to be careful if there is more than
34172 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34173 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34174 // That also means that PHI construction must work forward from earlier to
34175 // later, and that the code must maintain a mapping from earlier PHI's
34176 // destination registers, and the registers that went into the PHI.
34177 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34178 MachineInstrBuilder MIB;
34179
34180 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34181 Register DestReg = MIIt->getOperand(0).getReg();
34182 Register Op1Reg = MIIt->getOperand(1).getReg();
34183 Register Op2Reg = MIIt->getOperand(2).getReg();
34184
34185 // If this CMOV we are generating is the opposite condition from
34186 // the jump we generated, then we have to swap the operands for the
34187 // PHI that is going to be generated.
34188 if (MIIt->getOperand(3).getImm() == OppCC)
34189 std::swap(Op1Reg, Op2Reg);
34190
34191 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
34192 Op1Reg = RegRewriteTable[Op1Reg].first;
34193
34194 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
34195 Op2Reg = RegRewriteTable[Op2Reg].second;
34196
34197 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
34198 .addReg(Op1Reg)
34199 .addMBB(FalseMBB)
34200 .addReg(Op2Reg)
34201 .addMBB(TrueMBB);
34202
34203 // Add this PHI to the rewrite table.
34204 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34205 }
34206
34207 return MIB;
34208}
34209
34210// Lower cascaded selects in the form of (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
34211MachineBasicBlock *
34212X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34213 MachineInstr &SecondCascadedCMOV,
34214 MachineBasicBlock *ThisMBB) const {
34215 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34216 const DebugLoc &DL = FirstCMOV.getDebugLoc();
34217
34218 // We lower cascaded CMOVs such as
34219 //
34220 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34221 //
34222 // to two successive branches.
34223 //
34224 // Without this, we would add a PHI between the two jumps, which ends up
34225 // creating a few copies all around. For instance, for
34226 //
34227 // (sitofp (zext (fcmp une)))
34228 //
34229 // we would generate:
34230 //
34231 // ucomiss %xmm1, %xmm0
34232 // movss <1.0f>, %xmm0
34233 // movaps %xmm0, %xmm1
34234 // jne .LBB5_2
34235 // xorps %xmm1, %xmm1
34236 // .LBB5_2:
34237 // jp .LBB5_4
34238 // movaps %xmm1, %xmm0
34239 // .LBB5_4:
34240 // retq
34241 //
34242 // because this custom-inserter would have generated:
34243 //
34244 // A
34245 // | \
34246 // | B
34247 // | /
34248 // C
34249 // | \
34250 // | D
34251 // | /
34252 // E
34253 //
34254 // A: X = ...; Y = ...
34255 // B: empty
34256 // C: Z = PHI [X, A], [Y, B]
34257 // D: empty
34258 // E: PHI [X, C], [Z, D]
34259 //
34260 // If we lower both CMOVs in a single step, we can instead generate:
34261 //
34262 // A
34263 // | \
34264 // | C
34265 // | /|
34266 // |/ |
34267 // | |
34268 // | D
34269 // | /
34270 // E
34271 //
34272 // A: X = ...; Y = ...
34273 // D: empty
34274 // E: PHI [X, A], [X, C], [Y, D]
34275 //
34276 // Which, in our sitofp/fcmp example, gives us something like:
34277 //
34278 // ucomiss %xmm1, %xmm0
34279 // movss <1.0f>, %xmm0
34280 // jne .LBB5_4
34281 // jp .LBB5_4
34282 // xorps %xmm0, %xmm0
34283 // .LBB5_4:
34284 // retq
34285 //
34286
34287 // We lower cascaded CMOV into two successive branches to the same block.
34288 // EFLAGS is used by both, so mark it as live in the second.
34289 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34290 MachineFunction *F = ThisMBB->getParent();
34291 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34292 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34293 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34294
34295 MachineFunction::iterator It = ++ThisMBB->getIterator();
34296 F->insert(It, FirstInsertedMBB);
34297 F->insert(It, SecondInsertedMBB);
34298 F->insert(It, SinkMBB);
34299
34300 // For a cascaded CMOV, we lower it to two successive branches to
34301 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34302 // the FirstInsertedMBB.
34303 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34304
34305 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34306 // live into the sink and copy blocks.
34307 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34308 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
34309 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34310 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34311 SinkMBB->addLiveIn(X86::EFLAGS);
34312 }
34313
34314 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34315 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34316 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34317 ThisMBB->end());
34318 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34319
34320 // Fallthrough block for ThisMBB.
34321 ThisMBB->addSuccessor(FirstInsertedMBB);
34322 // The true block target of the first branch is always SinkMBB.
34323 ThisMBB->addSuccessor(SinkMBB);
34324 // Fallthrough block for FirstInsertedMBB.
34325 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34326 // The true block for the branch of FirstInsertedMBB.
34327 FirstInsertedMBB->addSuccessor(SinkMBB);
34328 // This is fallthrough.
34329 SecondInsertedMBB->addSuccessor(SinkMBB);
34330
34331 // Create the conditional branch instructions.
34332 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34333 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34334
34335 X86::CondCode SecondCC =
34336 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34337 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
34338
34339 // SinkMBB:
34340 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34341 Register DestReg = FirstCMOV.getOperand(0).getReg();
34342 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34343 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34344 MachineInstrBuilder MIB =
34345 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
34346 .addReg(Op1Reg)
34347 .addMBB(SecondInsertedMBB)
34348 .addReg(Op2Reg)
34349 .addMBB(ThisMBB);
34350
34351 // The SecondInsertedMBB provides the same incoming value as the
34352 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
34353 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34354 // Copy the PHI result to the register defined by the second CMOV.
34355 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
34356 TII->get(TargetOpcode::COPY),
34357 SecondCascadedCMOV.getOperand(0).getReg())
34358 .addReg(FirstCMOV.getOperand(0).getReg());
34359
34360 // Now remove the CMOVs.
34361 FirstCMOV.eraseFromParent();
34362 SecondCascadedCMOV.eraseFromParent();
34363
34364 return SinkMBB;
34365}
34366
34367MachineBasicBlock *
34368X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34369 MachineBasicBlock *ThisMBB) const {
34370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34371 const DebugLoc &DL = MI.getDebugLoc();
34372
34373 // To "insert" a SELECT_CC instruction, we actually have to insert the
34374 // diamond control-flow pattern. The incoming instruction knows the
34375 // destination vreg to set, the condition code register to branch on, the
34376 // true/false values to select between and a branch opcode to use.
34377
34378 // ThisMBB:
34379 // ...
34380 // TrueVal = ...
34381 // cmpTY ccX, r1, r2
34382 // bCC copy1MBB
34383 // fallthrough --> FalseMBB
34384
34385 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34386 // as described above, by inserting a BB, and then making a PHI at the join
34387 // point to select the true and false operands of the CMOV in the PHI.
34388 //
34389 // The code also handles two different cases of multiple CMOV opcodes
34390 // in a row.
34391 //
34392 // Case 1:
34393 // In this case, there are multiple CMOVs in a row, all of which are based on
34394 // the same condition setting (or the exact opposite condition setting).
34395 // In this case we can lower all the CMOVs using a single inserted BB, and
34396 // then make a number of PHIs at the join point to model the CMOVs. The only
34397 // trickiness here, is that in a case like:
34398 //
34399 // t2 = CMOV cond1 t1, f1
34400 // t3 = CMOV cond1 t2, f2
34401 //
34402 // when rewriting this into PHIs, we have to perform some renaming on the
34403 // temps since you cannot have a PHI operand refer to a PHI result earlier
34404 // in the same block. The "simple" but wrong lowering would be:
34405 //
34406 // t2 = PHI t1(BB1), f1(BB2)
34407 // t3 = PHI t2(BB1), f2(BB2)
34408 //
34409 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34410 // renaming is to note that on the path through BB1, t2 is really just a
34411 // copy of t1, and do that renaming, properly generating:
34412 //
34413 // t2 = PHI t1(BB1), f1(BB2)
34414 // t3 = PHI t1(BB1), f2(BB2)
34415 //
34416 // Case 2:
34417 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34418 // function - EmitLoweredCascadedSelect.
34419
34420 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34421 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34422 MachineInstr *LastCMOV = &MI;
34423 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
34424
34425 // Check for case 1, where there are multiple CMOVs with the same condition
34426 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34427 // number of jumps the most.
34428
34429 if (isCMOVPseudo(MI)) {
34430 // See if we have a string of CMOVS with the same condition. Skip over
34431 // intervening debug insts.
34432 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34433 (NextMIIt->getOperand(3).getImm() == CC ||
34434 NextMIIt->getOperand(3).getImm() == OppCC)) {
34435 LastCMOV = &*NextMIIt;
34436 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34437 }
34438 }
34439
34440 // This checks for case 2, but only if we didn't already find
34441 // case 1, as indicated by LastCMOV == &MI.
34442 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34443 NextMIIt->getOpcode() == MI.getOpcode() &&
34444 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34445 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34446 NextMIIt->getOperand(1).isKill()) {
34447 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34448 }
34449
34450 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34451 MachineFunction *F = ThisMBB->getParent();
34452 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34453 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34454
34455 MachineFunction::iterator It = ++ThisMBB->getIterator();
34456 F->insert(It, FalseMBB);
34457 F->insert(It, SinkMBB);
34458
34459 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34460 // live into the sink and copy blocks.
34461 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34462 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
34463 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34464 FalseMBB->addLiveIn(X86::EFLAGS);
34465 SinkMBB->addLiveIn(X86::EFLAGS);
34466 }
34467
34468 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
34469 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
34470 MachineBasicBlock::iterator(LastCMOV));
34471 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
34472 if (MI.isDebugInstr())
34473 SinkMBB->push_back(MI.removeFromParent());
34474
34475 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34476 SinkMBB->splice(SinkMBB->end(), ThisMBB,
34477 std::next(MachineBasicBlock::iterator(LastCMOV)),
34478 ThisMBB->end());
34479 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34480
34481 // Fallthrough block for ThisMBB.
34482 ThisMBB->addSuccessor(FalseMBB);
34483 // The true block target of the first (or only) branch is always a SinkMBB.
34484 ThisMBB->addSuccessor(SinkMBB);
34485 // Fallthrough block for FalseMBB.
34486 FalseMBB->addSuccessor(SinkMBB);
34487
34488 // Create the conditional branch instruction.
34489 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
34490
34491 // SinkMBB:
34492 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
34493 // ...
34494 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
34495 MachineBasicBlock::iterator MIItEnd =
34496 std::next(MachineBasicBlock::iterator(LastCMOV));
34497 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
34498
34499 // Now remove the CMOV(s).
34500 ThisMBB->erase(MIItBegin, MIItEnd);
34501
34502 return SinkMBB;
34503}
34504
34505static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
34506 if (IsLP64) {
34507 if (isInt<8>(Imm))
34508 return X86::SUB64ri8;
34509 return X86::SUB64ri32;
34510 } else {
34511 if (isInt<8>(Imm))
34512 return X86::SUB32ri8;
34513 return X86::SUB32ri;
34514 }
34515}
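// For example, getSUBriOpcode(/*IsLP64=*/true, 8) yields SUB64ri8 (the short
// sign-extended 8-bit immediate encoding), while a typical 4096-byte probe
// step, as used below, needs the 32-bit form SUB64ri32; the choice only
// affects encoding size, not behavior.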
34516
34517MachineBasicBlock *
34518X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
34519 MachineBasicBlock *MBB) const {
34520 MachineFunction *MF = MBB->getParent();
34521 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34522 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
34523 const DebugLoc &DL = MI.getDebugLoc();
34524 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34525
34526 const unsigned ProbeSize = getStackProbeSize(*MF);
34527
34528 MachineRegisterInfo &MRI = MF->getRegInfo();
34529 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34530 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34531 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34532
34533 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34534 MF->insert(MBBIter, testMBB);
34535 MF->insert(MBBIter, blockMBB);
34536 MF->insert(MBBIter, tailMBB);
34537
34538 Register sizeVReg = MI.getOperand(1).getReg();
34539
34540 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
34541
34542 Register TmpStackPtr = MRI.createVirtualRegister(
34543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34544 Register FinalStackPtr = MRI.createVirtualRegister(
34545 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34546
34547 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
34548 .addReg(physSPReg);
34549 {
34550 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
34551 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
34552 .addReg(TmpStackPtr)
34553 .addReg(sizeVReg);
34554 }
34555
34556 // test rsp size
34557
34558 BuildMI(testMBB, DL,
34559 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
34560 .addReg(FinalStackPtr)
34561 .addReg(physSPReg);
34562
34563 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
34564 .addMBB(tailMBB)
34565 .addImm(X86::COND_GE);
34566 testMBB->addSuccessor(blockMBB);
34567 testMBB->addSuccessor(tailMBB);
34568
34569 // Touch the block then extend it. This is done on the opposite side of
34570 // static probe where we allocate then touch, to avoid the need of probing the
34571 // tail of the static alloca. Possible scenarios are:
34572 //
34573 // + ---- <- ------------ <- ------------- <- ------------ +
34574 // | |
34575 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
34576 // | |
34577 // + <- ----------- <- ------------ <- ----------- <- ------------ +
34578 //
34579 // The property we want to enforce is to never have more than [page alloc] between two probes.
34580
34581 const unsigned XORMIOpc =
34582 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
34583 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
34584 .addImm(0);
34585
34586 BuildMI(blockMBB, DL,
34587 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
34588 .addReg(physSPReg)
34589 .addImm(ProbeSize);
34590
34591
34592 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
34593 blockMBB->addSuccessor(testMBB);
34594
34595 // Replace original instruction by the expected stack ptr
34596 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
34597 .addReg(FinalStackPtr);
34598
34599 tailMBB->splice(tailMBB->end(), MBB,
34600 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34601 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
34602 MBB->addSuccessor(testMBB);
34603
34604 // Delete the original pseudo instruction.
34605 MI.eraseFromParent();
34606
34607 // And we're done.
34608 return tailMBB;
34609}
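// The three blocks created above form a simple probe loop over the newly
// allocated region. A standalone model of that loop using plain integers in
// place of the stack pointer; probedAllocaModel and touchPage are
// illustrative names only:

#include <cstdint>

static void touchPage(uint64_t /*Addr*/) {} // stands in for "xor dword ptr [addr], 0"

static uint64_t probedAllocaModel(uint64_t SP, uint64_t Size, uint64_t ProbeSize) {
  uint64_t Final = SP - Size;   // FinalStackPtr = SP - sizeVReg
  while (Final < SP) {          // testMBB: CMP Final, SP; jge tailMBB
    touchPage(SP);              // blockMBB: touch first, then extend, so that
    SP -= ProbeSize;            // no two probes are more than a page apart
  }
  return Final;                 // tailMBB: the value the DYN_ALLOCA produces
}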
34610
34611MachineBasicBlock *
34612X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
34613 MachineBasicBlock *BB) const {
34614 MachineFunction *MF = BB->getParent();
34615 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34616 const DebugLoc &DL = MI.getDebugLoc();
34617 const BasicBlock *LLVM_BB = BB->getBasicBlock();
34618
34619 assert(MF->shouldSplitStack());
34620
34621 const bool Is64Bit = Subtarget.is64Bit();
34622 const bool IsLP64 = Subtarget.isTarget64BitLP64();
34623
34624 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
34625 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
34626
34627 // BB:
34628 // ... [Till the alloca]
34629 // If stacklet is not large enough, jump to mallocMBB
34630 //
34631 // bumpMBB:
34632 // Allocate by subtracting from RSP
34633 // Jump to continueMBB
34634 //
34635 // mallocMBB:
34636 // Allocate by call to runtime
34637 //
34638 // continueMBB:
34639 // ...
34640 // [rest of original BB]
34641 //
34642
34643 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34644 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34645 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34646
34647 MachineRegisterInfo &MRI = MF->getRegInfo();
34648 const TargetRegisterClass *AddrRegClass =
34649 getRegClassFor(getPointerTy(MF->getDataLayout()));
34650
34651 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34652 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34653 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
34654 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
34655 sizeVReg = MI.getOperand(1).getReg(),
34656 physSPReg =
34657 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
34658
34659 MachineFunction::iterator MBBIter = ++BB->getIterator();
34660
34661 MF->insert(MBBIter, bumpMBB);
34662 MF->insert(MBBIter, mallocMBB);
34663 MF->insert(MBBIter, continueMBB);
34664
34665 continueMBB->splice(continueMBB->begin(), BB,
34666 std::next(MachineBasicBlock::iterator(MI)), BB->end());
34667 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
34668
34669 // Add code to the main basic block to check if the stack limit has been hit,
34670 // and if so, jump to mallocMBB otherwise to bumpMBB.
34671 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
34672 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
34673 .addReg(tmpSPVReg).addReg(sizeVReg);
34674 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
34675 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
34676 .addReg(SPLimitVReg);
34677 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
34678
34679 // bumpMBB simply decreases the stack pointer, since we know the current
34680 // stacklet has enough space.
34681 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
34682 .addReg(SPLimitVReg);
34683 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
34684 .addReg(SPLimitVReg);
34685 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34686
34687 // Calls into a routine in libgcc to allocate more space from the heap.
34688 const uint32_t *RegMask =
34689 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
34690 if (IsLP64) {
34691 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
34692 .addReg(sizeVReg);
34693 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34694 .addExternalSymbol("__morestack_allocate_stack_space")
34695 .addRegMask(RegMask)
34696 .addReg(X86::RDI, RegState::Implicit)
34697 .addReg(X86::RAX, RegState::ImplicitDefine);
34698 } else if (Is64Bit) {
34699 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
34700 .addReg(sizeVReg);
34701 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34702 .addExternalSymbol("__morestack_allocate_stack_space")
34703 .addRegMask(RegMask)
34704 .addReg(X86::EDI, RegState::Implicit)
34705 .addReg(X86::EAX, RegState::ImplicitDefine);
34706 } else {
34707 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
34708 .addImm(12);
34709 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
34710 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
34711 .addExternalSymbol("__morestack_allocate_stack_space")
34712 .addRegMask(RegMask)
34713 .addReg(X86::EAX, RegState::ImplicitDefine);
34714 }
34715
34716 if (!Is64Bit)
34717 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
34718 .addImm(16);
34719
34720 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
34721 .addReg(IsLP64 ? X86::RAX : X86::EAX);
34722 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34723
34724 // Set up the CFG correctly.
34725 BB->addSuccessor(bumpMBB);
34726 BB->addSuccessor(mallocMBB);
34727 mallocMBB->addSuccessor(continueMBB);
34728 bumpMBB->addSuccessor(continueMBB);
34729
34730 // Take care of the PHI nodes.
34731 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
34732 MI.getOperand(0).getReg())
34733 .addReg(mallocPtrVReg)
34734 .addMBB(mallocMBB)
34735 .addReg(bumpSPPtrVReg)
34736 .addMBB(bumpMBB);
34737
34738 // Delete the original pseudo instruction.
34739 MI.eraseFromParent();
34740
34741 // And we're done.
34742 return continueMBB;
34743}
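// In pseudo-C, the split-stack check emitted above is roughly:
//
//   newSP = SP - size;                                   // SUB64rr / SUB32rr
//   if (*(tls_base + TlsOffset) > newSP)                 // CMP against stacklet limit
//     result = __morestack_allocate_stack_space(size);   // mallocMBB
//   else
//     result = SP = newSP;                               // bumpMBB
//
// where tls_base is %fs on 64-bit and %gs on 32-bit targets, and TlsOffset
// (0x70 for LP64, 0x40 for x32, 0x30 for 32-bit) is the split-stack limit
// slot maintained by the morestack runtime.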
34744
34745MachineBasicBlock *
34746X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
34747 MachineBasicBlock *BB) const {
34748 MachineFunction *MF = BB->getParent();
34749 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34750 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
34751 const DebugLoc &DL = MI.getDebugLoc();
34752
34753 assert(!isAsynchronousEHPersonality(
34754 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
34755 "SEH does not use catchret!");
34756
34757 // Only 32-bit EH needs to worry about manually restoring stack pointers.
34758 if (!Subtarget.is32Bit())
34759 return BB;
34760
34761 // C++ EH creates a new target block to hold the restore code, and wires up
34762 // the new block to the return destination with a normal JMP_4.
34763 MachineBasicBlock *RestoreMBB =
34764 MF->CreateMachineBasicBlock(BB->getBasicBlock());
34765 assert(BB->succ_size() == 1);
34766 MF->insert(std::next(BB->getIterator()), RestoreMBB);
34767 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
34768 BB->addSuccessor(RestoreMBB);
34769 MI.getOperand(0).setMBB(RestoreMBB);
34770
34771 // Marking this as an EH pad but not a funclet entry block causes PEI to
34772 // restore stack pointers in the block.
34773 RestoreMBB->setIsEHPad(true);
34774
34775 auto RestoreMBBI = RestoreMBB->begin();
34776 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
34777 return BB;
34778}
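// A minimal sketch of the 32-bit C++ EH shape built above: the CATCHRET is
// retargeted at a fresh block that is marked as an EH pad (but not a funclet
// entry), so PEI restores the stack pointer there before the real jump:
//
//   BB:          CATCHRET %RestoreMBB, ...   ; operand 0 rewritten above
//   RestoreMBB:  ; EH pad -> stack pointer restore inserted by PEI
//                JMP_4 %TargetMBB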
34779
34780MachineBasicBlock *
34781X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
34782 MachineBasicBlock *BB) const {
34783 // So, here we replace TLSADDR with the sequence:
34784 // adjust_stackdown -> TLSADDR -> adjust_stackup.
34785 // We need this because TLSADDR is lowered into calls
34786 // inside MC, therefore without the two markers shrink-wrapping
34787 // may push the prologue/epilogue past them.
34788 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34789 const DebugLoc &DL = MI.getDebugLoc();
34790 MachineFunction &MF = *BB->getParent();
34791
34792 // Emit CALLSEQ_START right before the instruction.
34793 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
34794 MachineInstrBuilder CallseqStart =
34795 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
34796 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
34797
34798 // Emit CALLSEQ_END right after the instruction.
34799 // We don't call erase from parent because we want to keep the
34800 // original instruction around.
34801 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
34802 MachineInstrBuilder CallseqEnd =
34803 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
34804 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
34805
34806 return BB;
34807}
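// A minimal sketch of the result: the TLSADDR pseudo is left in place and
// bracketed by call-frame markers, roughly
//
//   ADJCALLSTACKDOWN 0, 0, 0     ; CALLSEQ_START
//   TLSADDR ...                  ; becomes a real call during MC lowering
//   ADJCALLSTACKUP 0, 0          ; CALLSEQ_END
//
// which is what keeps shrink-wrapping from moving the prologue/epilogue past
// the implicit call.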
34808
34809MachineBasicBlock *
34810X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
34811 MachineBasicBlock *BB) const {
34812 // This is pretty easy. We're taking the value that we received from
34813 // our load from the relocation, sticking it in either RDI (x86-64)
34814 // or EAX and doing an indirect call. The return value will then
34815 // be in the normal return register.
34816 MachineFunction *F = BB->getParent();
34817 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34818 const DebugLoc &DL = MI.getDebugLoc();
34819
34820 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
34821 assert(MI.getOperand(3).isGlobal() && "This should be a global");
34822
34823 // Get a register mask for the lowered call.
34824 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
34825 // proper register mask.
34826 const uint32_t *RegMask =
34827 Subtarget.is64Bit() ?
34828 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
34829 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
34830 if (Subtarget.is64Bit()) {
34831 MachineInstrBuilder MIB =
34832 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
34833 .addReg(X86::RIP)
34834 .addImm(0)
34835 .addReg(0)
34836 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34837 MI.getOperand(3).getTargetFlags())
34838 .addReg(0);
34839 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
34840 addDirectMem(MIB, X86::RDI);
34841 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
34842 } else if (!isPositionIndependent()) {
34843 MachineInstrBuilder MIB =
34844 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34845 .addReg(0)
34846 .addImm(0)
34847 .addReg(0)
34848 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34849 MI.getOperand(3).getTargetFlags())
34850 .addReg(0);
34851 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34852 addDirectMem(MIB, X86::EAX);
34853 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34854 } else {
34855 MachineInstrBuilder MIB =
34856 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34857 .addReg(TII->getGlobalBaseReg(F))
34858 .addImm(0)
34859 .addReg(0)
34860 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34861 MI.getOperand(3).getTargetFlags())
34862 .addReg(0);
34863 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34864 addDirectMem(MIB, X86::EAX);
34865 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34866 }
34867
34868 MI.eraseFromParent(); // The pseudo instruction is gone now.
34869 return BB;
34870}
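// A minimal sketch of the 64-bit sequence emitted above (the assembly syntax
// and relocation spelling are assumptions; the operands mirror the
// MOV64rm/CALL64m built in the code):
//
//   movq  _var@TLVP(%rip), %rdi    ; load the TLV descriptor address
//   callq *(%rdi)                  ; indirect call through the descriptor
//                                  ; result comes back implicitly in %rax
//
// The 32-bit paths use %eax and either an absolute or a GlobalBaseReg-relative
// load instead.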
34871
34872static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
34873 switch (RPOpc) {
34874 case X86::INDIRECT_THUNK_CALL32:
34875 return X86::CALLpcrel32;
34876 case X86::INDIRECT_THUNK_CALL64:
34877 return X86::CALL64pcrel32;
34878 case X86::INDIRECT_THUNK_TCRETURN32:
34879 return X86::TCRETURNdi;
34880 case X86::INDIRECT_THUNK_TCRETURN64:
34881 return X86::TCRETURNdi64;
34882 }
34883 llvm_unreachable("not indirect thunk opcode");
34884}
34885
34886static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34887 unsigned Reg) {
34888 if (Subtarget.useRetpolineExternalThunk()) {
34889 // When using an external thunk for retpolines, we pick names that match the
34890 // names GCC happens to use as well. This helps simplify the implementation
34891 // of the thunks for kernels where they have no easy ability to create
34892 // aliases and are doing non-trivial configuration of the thunk's body. For
34893 // example, the Linux kernel will do boot-time hot patching of the thunk
34894 // bodies and cannot easily export aliases of these to loaded modules.
34895 //
34896 // Note that at any point in the future, we may need to change the semantics
34897 // of how we implement retpolines and at that time will likely change the
34898 // name of the called thunk. Essentially, there is no hard guarantee that
34899 // LLVM will generate calls to specific thunks, we merely make a best-effort
34900 // attempt to help out kernels and other systems where duplicating the
34901 // thunks is costly.
34902 switch (Reg) {
34903 case X86::EAX:
34904 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34905 return "__x86_indirect_thunk_eax";
34906 case X86::ECX:
34907 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34908 return "__x86_indirect_thunk_ecx";
34909 case X86::EDX:
34910 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34911 return "__x86_indirect_thunk_edx";
34912 case X86::EDI:
34913 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34914 return "__x86_indirect_thunk_edi";
34915 case X86::R11:
34916 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34917 return "__x86_indirect_thunk_r11";
34918 }
34919 llvm_unreachable("unexpected reg for external indirect thunk");
34920 }
34921
34922 if (Subtarget.useRetpolineIndirectCalls() ||
34923 Subtarget.useRetpolineIndirectBranches()) {
34924 // When targeting an internal COMDAT thunk use an LLVM-specific name.
34925 switch (Reg) {
34926 case X86::EAX:
34927 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34928 return "__llvm_retpoline_eax";
34929 case X86::ECX:
34930 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34931 return "__llvm_retpoline_ecx";
34932 case X86::EDX:
34933 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34934 return "__llvm_retpoline_edx";
34935 case X86::EDI:
34936 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34937 return "__llvm_retpoline_edi";
34938 case X86::R11:
34939 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34940 return "__llvm_retpoline_r11";
34941 }
34942 llvm_unreachable("unexpected reg for retpoline");
34943 }
34944
34945 if (Subtarget.useLVIControlFlowIntegrity()) {
34946 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34947 return "__llvm_lvi_thunk_r11";
34948 }
34949 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
34950}
34951
34952MachineBasicBlock *
34953X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
34954 MachineBasicBlock *BB) const {
34955 // Copy the virtual register into the R11 physical register and
34956 // call the retpoline thunk.
34957 const DebugLoc &DL = MI.getDebugLoc();
34958 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34959 Register CalleeVReg = MI.getOperand(0).getReg();
34960 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
34961
34962 // Find an available scratch register to hold the callee. On 64-bit, we can
34963 // just use R11, but we scan for uses anyway to ensure we don't generate
34964 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
34965 // already a register use operand to the call to hold the callee. If none
34966 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
34967 // register and ESI is the base pointer to realigned stack frames with VLAs.
34968 SmallVector<unsigned, 3> AvailableRegs;
34969 if (Subtarget.is64Bit())
34970 AvailableRegs.push_back(X86::R11);
34971 else
34972 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
34973
34974 // Zero out any registers that are already used.
34975 for (const auto &MO : MI.operands()) {
34976 if (MO.isReg() && MO.isUse())
34977 for (unsigned &Reg : AvailableRegs)
34978 if (Reg == MO.getReg())
34979 Reg = 0;
34980 }
34981
34982 // Choose the first remaining non-zero available register.
34983 unsigned AvailableReg = 0;
34984 for (unsigned MaybeReg : AvailableRegs) {
34985 if (MaybeReg) {
34986 AvailableReg = MaybeReg;
34987 break;
34988 }
34989 }
34990 if (!AvailableReg)
34991 report_fatal_error("calling convention incompatible with retpoline, no "
34992 "available registers");
34993
34994 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
34995
34996 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
34997 .addReg(CalleeVReg);
34998 MI.getOperand(0).ChangeToES(Symbol);
34999 MI.setDesc(TII->get(Opc));
35000 MachineInstrBuilder(*BB->getParent(), &MI)
35001 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35002 return BB;
35003}
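// A minimal sketch, assuming the 64-bit INDIRECT_THUNK_CALL64 case: the
// rewrite above turns the pseudo into
//
//   COPY $r11, %callee_vreg
//   CALL64pcrel32 @__llvm_retpoline_r11, implicit killed $r11
//
// (or the matching __x86_indirect_thunk_* / __llvm_lvi_thunk_r11 symbol,
// depending on which thunk feature is enabled). The 32-bit variants pick a
// free register among EAX/ECX/EDX/EDI instead of R11.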
35004
35005/// SetJmp implies future control flow change upon calling the corresponding
35006/// LongJmp.
35007/// Instead of using the 'return' instruction, the long jump fixes the stack and
35008/// performs an indirect branch. To do so it uses the registers that were stored
35009/// in the jump buffer (when calling SetJmp).
35010/// In case the shadow stack is enabled we need to fix it as well, because some
35011/// return addresses will be skipped.
35012/// The function will save the SSP for future fixing in the function
35013/// emitLongJmpShadowStackFix.
35014/// \sa emitLongJmpShadowStackFix
35015/// \param [in] MI The temporary Machine Instruction for the builtin.
35016/// \param [in] MBB The Machine Basic Block that will be modified.
35017void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35018 MachineBasicBlock *MBB) const {
35019 const DebugLoc &DL = MI.getDebugLoc();
35020 MachineFunction *MF = MBB->getParent();
35021 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35022 MachineRegisterInfo &MRI = MF->getRegInfo();
35023 MachineInstrBuilder MIB;
35024
35025 // Memory Reference.
35026 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35027 MI.memoperands_end());
35028
35029 // Initialize a register with zero.
35030 MVT PVT = getPointerTy(MF->getDataLayout());
35031 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35032 Register ZReg = MRI.createVirtualRegister(PtrRC);
35033 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35034 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
35035 .addDef(ZReg)
35036 .addReg(ZReg, RegState::Undef)
35037 .addReg(ZReg, RegState::Undef);
35038
35039 // Read the current SSP Register value to the zeroed register.
35040 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35041 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35042 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35043
35044 // Write the SSP register value to offset 3 in input memory buffer.
35045 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35046 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
35047 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35048 const unsigned MemOpndSlot = 1;
35049 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35050 if (i == X86::AddrDisp)
35051 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35052 else
35053 MIB.add(MI.getOperand(MemOpndSlot + i));
35054 }
35055 MIB.addReg(SSPCopyReg);
35056 MIB.setMemRefs(MMOs);
35057}
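// A minimal sketch, assuming a 64-bit target: the code above stores the
// current shadow-stack pointer into the fourth pointer slot of the setjmp
// buffer (SSPOffset = 3 * 8 = 24):
//
//   xorq   %r, %r        ; zeroed input for RDSSP
//   rdsspq %r            ; stays zero if CET is not active
//   movq   %r, 24(buf)   ; consumed later by emitLongJmpShadowStackFix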
35058
35059MachineBasicBlock *
35060X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35061 MachineBasicBlock *MBB) const {
35062 const DebugLoc &DL = MI.getDebugLoc();
35063 MachineFunction *MF = MBB->getParent();
35064 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35065 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35066 MachineRegisterInfo &MRI = MF->getRegInfo();
35067
35068 const BasicBlock *BB = MBB->getBasicBlock();
35069 MachineFunction::iterator I = ++MBB->getIterator();
35070
35071 // Memory Reference
35072 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35073 MI.memoperands_end());
35074
35075 unsigned DstReg;
35076 unsigned MemOpndSlot = 0;
35077
35078 unsigned CurOp = 0;
35079
35080 DstReg = MI.getOperand(CurOp++).getReg();
35081 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35082 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35083 (void)TRI;
35084 Register mainDstReg = MRI.createVirtualRegister(RC);
35085 Register restoreDstReg = MRI.createVirtualRegister(RC);
35086
35087 MemOpndSlot = CurOp;
35088
35089 MVT PVT = getPointerTy(MF->getDataLayout());
35090 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35091 "Invalid Pointer Size!");
35092
35093 // For v = setjmp(buf), we generate
35094 //
35095 // thisMBB:
35096 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35097 // SjLjSetup restoreMBB
35098 //
35099 // mainMBB:
35100 // v_main = 0
35101 //
35102 // sinkMBB:
35103 // v = phi(main, restore)
35104 //
35105 // restoreMBB:
35106 // if base pointer being used, load it from frame
35107 // v_restore = 1
35108
35109 MachineBasicBlock *thisMBB = MBB;
35110 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35111 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35112 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35113 MF->insert(I, mainMBB);
35114 MF->insert(I, sinkMBB);
35115 MF->push_back(restoreMBB);
35116 restoreMBB->setHasAddressTaken();
35117
35118 MachineInstrBuilder MIB;
35119
35120 // Transfer the remainder of BB and its successor edges to sinkMBB.
35121 sinkMBB->splice(sinkMBB->begin(), MBB,
35122 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35123 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35124
35125 // thisMBB:
35126 unsigned PtrStoreOpc = 0;
35127 unsigned LabelReg = 0;
35128 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35129 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35130 !isPositionIndependent();
35131
35132 // Prepare IP either in reg or imm.
35133 if (!UseImmLabel) {
35134 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35135 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35136 LabelReg = MRI.createVirtualRegister(PtrRC);
35137 if (Subtarget.is64Bit()) {
35138 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
35139 .addReg(X86::RIP)
35140 .addImm(0)
35141 .addReg(0)
35142 .addMBB(restoreMBB)
35143 .addReg(0);
35144 } else {
35145 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35146 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
35147 .addReg(XII->getGlobalBaseReg(MF))
35148 .addImm(0)
35149 .addReg(0)
35150 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35151 .addReg(0);
35152 }
35153 } else
35154 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35155 // Store IP
35156 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
35157 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35158 if (i == X86::AddrDisp)
35159 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35160 else
35161 MIB.add(MI.getOperand(MemOpndSlot + i));
35162 }
35163 if (!UseImmLabel)
35164 MIB.addReg(LabelReg);
35165 else
35166 MIB.addMBB(restoreMBB);
35167 MIB.setMemRefs(MMOs);
35168
35169 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35170 emitSetJmpShadowStackFix(MI, thisMBB);
35171 }
35172
35173 // Setup
35174 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
35175 .addMBB(restoreMBB);
35176
35177 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35178 MIB.addRegMask(RegInfo->getNoPreservedMask());
35179 thisMBB->addSuccessor(mainMBB);
35180 thisMBB->addSuccessor(restoreMBB);
35181
35182 // mainMBB:
35183 // EAX = 0
35184 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
35185 mainMBB->addSuccessor(sinkMBB);
35186
35187 // sinkMBB:
35188 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
35189 TII->get(X86::PHI), DstReg)
35190 .addReg(mainDstReg).addMBB(mainMBB)
35191 .addReg(restoreDstReg).addMBB(restoreMBB);
35192
35193 // restoreMBB:
35194 if (RegInfo->hasBasePointer(*MF)) {
35195 const bool Uses64BitFramePtr =
35196 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35197 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35198 X86FI->setRestoreBasePointer(MF);
35199 Register FramePtr = RegInfo->getFrameRegister(*MF);
35200 Register BasePtr = RegInfo->getBaseRegister();
35201 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35202 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
35203 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35204 .setMIFlag(MachineInstr::FrameSetup);
35205 }
35206 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35207 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35208 restoreMBB->addSuccessor(sinkMBB);
35209
35210 MI.eraseFromParent();
35211 return sinkMBB;
35212}
35213
35214/// Fix the shadow stack using the previously saved SSP pointer.
35215/// \sa emitSetJmpShadowStackFix
35216/// \param [in] MI The temporary Machine Instruction for the builtin.
35217/// \param [in] MBB The Machine Basic Block that will be modified.
35218/// \return The sink MBB that will perform the future indirect branch.
35219MachineBasicBlock *
35220X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35221 MachineBasicBlock *MBB) const {
35222 const DebugLoc &DL = MI.getDebugLoc();
35223 MachineFunction *MF = MBB->getParent();
35224 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35225 MachineRegisterInfo &MRI = MF->getRegInfo();
35226
35227 // Memory Reference
35228 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35229 MI.memoperands_end());
35230
35231 MVT PVT = getPointerTy(MF->getDataLayout());
35232 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35233
35234 // checkSspMBB:
35235 // xor vreg1, vreg1
35236 // rdssp vreg1
35237 // test vreg1, vreg1
35238 // je sinkMBB # Jump if Shadow Stack is not supported
35239 // fallMBB:
35240 // mov buf+24/12(%rip), vreg2
35241 // sub vreg1, vreg2
35242 // jbe sinkMBB # No need to fix the Shadow Stack
35243 // fixShadowMBB:
35244 // shr 3/2, vreg2
35245 // incssp vreg2 # fix the SSP according to the lower 8 bits
35246 // shr 8, vreg2
35247 // je sinkMBB
35248 // fixShadowLoopPrepareMBB:
35249 // shl vreg2
35250 // mov 128, vreg3
35251 // fixShadowLoopMBB:
35252 // incssp vreg3
35253 // dec vreg2
35254 // jne fixShadowLoopMBB # Iterate until you finish fixing
35255 // # the Shadow Stack
35256 // sinkMBB:
35257
35258 MachineFunction::iterator I = ++MBB->getIterator();
35259 const BasicBlock *BB = MBB->getBasicBlock();
35260
35261 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35262 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35263 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35264 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35265 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35266 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35267 MF->insert(I, checkSspMBB);
35268 MF->insert(I, fallMBB);
35269 MF->insert(I, fixShadowMBB);
35270 MF->insert(I, fixShadowLoopPrepareMBB);
35271 MF->insert(I, fixShadowLoopMBB);
35272 MF->insert(I, sinkMBB);
35273
35274 // Transfer the remainder of BB and its successor edges to sinkMBB.
35275 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35276 MBB->end());
35277 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35278
35279 MBB->addSuccessor(checkSspMBB);
35280
35281 // Initialize a register with zero.
35282 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35283 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
35284
35285 if (PVT == MVT::i64) {
35286 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35287 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35288 .addImm(0)
35289 .addReg(ZReg)
35290 .addImm(X86::sub_32bit);
35291 ZReg = TmpZReg;
35292 }
35293
35294 // Read the current SSP Register value to the zeroed register.
35295 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35296 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35297 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35298
35299 // Check whether the value read from the SSP register is zero and, if so,
35300 // jump directly to the sink.
35301 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35302 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
35303 .addReg(SSPCopyReg)
35304 .addReg(SSPCopyReg);
35305 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35306 checkSspMBB->addSuccessor(sinkMBB);
35307 checkSspMBB->addSuccessor(fallMBB);
35308
35309 // Reload the previously saved SSP register value.
35310 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35311 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35312 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35313 MachineInstrBuilder MIB =
35314 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
35315 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35316 const MachineOperand &MO = MI.getOperand(i);
35317 if (i == X86::AddrDisp)
35318 MIB.addDisp(MO, SPPOffset);
35319 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35320 // preserve kill flags.
35321 MIB.addReg(MO.getReg());
35322 else
35323 MIB.add(MO);
35324 }
35325 MIB.setMemRefs(MMOs);
35326
35327 // Subtract the current SSP from the previous SSP.
35328 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35329 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35330 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
35331 .addReg(PrevSSPReg)
35332 .addReg(SSPCopyReg);
35333
35334 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35335 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
35336 fallMBB->addSuccessor(sinkMBB);
35337 fallMBB->addSuccessor(fixShadowMBB);
35338
35339 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35340 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35341 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35342 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35343 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
35344 .addReg(SspSubReg)
35345 .addImm(Offset);
35346
35347 // Increase the SSP using only the lower 8 bits of the delta.
35348 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35349 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35350
35351 // Reset the lower 8 bits.
35352 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35353 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
35354 .addReg(SspFirstShrReg)
35355 .addImm(8);
35356
35357 // Jump if the result of the shift is zero.
35358 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35359 fixShadowMBB->addSuccessor(sinkMBB);
35360 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35361
35362 // Do a single shift left.
35363 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
35364 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35365 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
35366 .addReg(SspSecondShrReg);
35367
35368 // Save the value 128 to a register (will be used next with incssp).
35369 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35370 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35371 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
35372 .addImm(128);
35373 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35374
35375 // Since incssp only looks at the lower 8 bits, we might need to do several
35376 // iterations of incssp until we finish fixing the shadow stack.
35377 Register DecReg = MRI.createVirtualRegister(PtrRC);
35378 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35379 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
35380 .addReg(SspAfterShlReg)
35381 .addMBB(fixShadowLoopPrepareMBB)
35382 .addReg(DecReg)
35383 .addMBB(fixShadowLoopMBB);
35384
35385 // Every iteration we increase the SSP by 128.
35386 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
35387
35388 // Every iteration we decrement the counter by 1.
35389 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35390 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
35391
35392 // Jump if the counter is not zero yet.
35393 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
35394 fixShadowLoopMBB->addSuccessor(sinkMBB);
35395 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35396
35397 return sinkMBB;
35398}
35399
35400MachineBasicBlock *
35401X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35402 MachineBasicBlock *MBB) const {
35403 const DebugLoc &DL = MI.getDebugLoc();
35404 MachineFunction *MF = MBB->getParent();
35405 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35406 MachineRegisterInfo &MRI = MF->getRegInfo();
35407
35408 // Memory Reference
35409 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35410 MI.memoperands_end());
35411
35412 MVT PVT = getPointerTy(MF->getDataLayout());
35413 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35414 "Invalid Pointer Size!");
35415
35416 const TargetRegisterClass *RC =
35417 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35418 Register Tmp = MRI.createVirtualRegister(RC);
35419 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35420 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35421 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35422 Register SP = RegInfo->getStackRegister();
35423
35424 MachineInstrBuilder MIB;
35425
35426 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35427 const int64_t SPOffset = 2 * PVT.getStoreSize();
35428
35429 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35430 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
35431
35432 MachineBasicBlock *thisMBB = MBB;
35433
35434 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
35435 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35436 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35437 }
35438
35439 // Reload FP
35440 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
35441 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35442 const MachineOperand &MO = MI.getOperand(i);
35443 if (MO.isReg()) // Don't add the whole operand, we don't want to
35444 // preserve kill flags.
35445 MIB.addReg(MO.getReg());
35446 else
35447 MIB.add(MO);
35448 }
35449 MIB.setMemRefs(MMOs);
35450
35451 // Reload IP
35452 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
35453 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35454 const MachineOperand &MO = MI.getOperand(i);
35455 if (i == X86::AddrDisp)
35456 MIB.addDisp(MO, LabelOffset);
35457 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35458 // preserve kill flags.
35459 MIB.addReg(MO.getReg());
35460 else
35461 MIB.add(MO);
35462 }
35463 MIB.setMemRefs(MMOs);
35464
35465 // Reload SP
35466 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
35467 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35468 if (i == X86::AddrDisp)
35469 MIB.addDisp(MI.getOperand(i), SPOffset);
35470 else
35471 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
35472 // the last instruction of the expansion.
35473 }
35474 MIB.setMemRefs(MMOs);
35475
35476 // Jump
35477 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
35478
35479 MI.eraseFromParent();
35480 return thisMBB;
35481}
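// A minimal sketch, assuming a 64-bit target: the expansion above reads the
// buffer laid out by setjmp and branches through it:
//
//   movq  0(buf),  %rbp   ; reload FP
//   movq  8(buf),  %tmp   ; reload IP   (LabelOffset = 1 * 8)
//   movq  16(buf), %rsp   ; reload SP   (SPOffset   = 2 * 8)
//   jmpq  *%tmp
//
// with the shadow-stack fix-up inserted first when the module carries the
// "cf-protection-return" flag.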
35482
35483void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
35484 MachineBasicBlock *MBB,
35485 MachineBasicBlock *DispatchBB,
35486 int FI) const {
35487 const DebugLoc &DL = MI.getDebugLoc();
35488 MachineFunction *MF = MBB->getParent();
35489 MachineRegisterInfo *MRI = &MF->getRegInfo();
35490 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35491
35492 MVT PVT = getPointerTy(MF->getDataLayout());
35493 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
35494
35495 unsigned Op = 0;
35496 unsigned VR = 0;
35497
35498 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35499 !isPositionIndependent();
35500
35501 if (UseImmLabel) {
35502 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35503 } else {
35504 const TargetRegisterClass *TRC =
35505 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35506 VR = MRI->createVirtualRegister(TRC);
35507 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35508
35509 if (Subtarget.is64Bit())
35510 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
35511 .addReg(X86::RIP)
35512 .addImm(1)
35513 .addReg(0)
35514 .addMBB(DispatchBB)
35515 .addReg(0);
35516 else
35517 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
35518 .addReg(0) /* TII->getGlobalBaseReg(MF) */
35519 .addImm(1)
35520 .addReg(0)
35521 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
35522 .addReg(0);
35523 }
35524
35525 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
35526 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
35527 if (UseImmLabel)
35528 MIB.addMBB(DispatchBB);
35529 else
35530 MIB.addReg(VR);
35531}
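// A minimal sketch of what this helper plants in the SjLj function context,
// assuming a 64-bit target that cannot use the immediate-label shortcut:
//
//   leaq  DispatchBB(%rip), %vr
//   movq  %vr, 56(FI)            ; 36(FI) on 32-bit targets
//
// With the small, non-PIC code model the address is stored directly as an
// immediate (MOV64mi32 / MOV32mi) instead of going through a vreg.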
35532
35533MachineBasicBlock *
35534X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
35535 MachineBasicBlock *BB) const {
35536 const DebugLoc &DL = MI.getDebugLoc();
35537 MachineFunction *MF = BB->getParent();
35538 MachineRegisterInfo *MRI = &MF->getRegInfo();
35539 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35540 int FI = MF->getFrameInfo().getFunctionContextIndex();
35541
35542 // Get a mapping of the call site numbers to all of the landing pads they're
35543 // associated with.
35544 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
35545 unsigned MaxCSNum = 0;
35546 for (auto &MBB : *MF) {
35547 if (!MBB.isEHPad())
35548 continue;
35549
35550 MCSymbol *Sym = nullptr;
35551 for (const auto &MI : MBB) {
35552 if (MI.isDebugInstr())
35553 continue;
35554
35555 assert(MI.isEHLabel() && "expected EH_LABEL");
35556 Sym = MI.getOperand(0).getMCSymbol();
35557 break;
35558 }
35559
35560 if (!MF->hasCallSiteLandingPad(Sym))
35561 continue;
35562
35563 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
35564 CallSiteNumToLPad[CSI].push_back(&MBB);
35565 MaxCSNum = std::max(MaxCSNum, CSI);
35566 }
35567 }
35568
35569 // Get an ordered list of the machine basic blocks for the jump table.
35570 std::vector<MachineBasicBlock *> LPadList;
35571 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
35572 LPadList.reserve(CallSiteNumToLPad.size());
35573
35574 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
35575 for (auto &LP : CallSiteNumToLPad[CSI]) {
35576 LPadList.push_back(LP);
35577 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
35578 }
35579 }
35580
35581 assert(!LPadList.empty() &&
35582 "No landing pad destinations for the dispatch jump table!");
35583
35584 // Create the MBBs for the dispatch code.
35585
35586 // Shove the dispatch's address into the return slot in the function context.
35587 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
35588 DispatchBB->setIsEHPad(true);
35589
35590 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
35591 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
35592 DispatchBB->addSuccessor(TrapBB);
35593
35594 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
35595 DispatchBB->addSuccessor(DispContBB);
35596
35597 // Insert MBBs.
35598 MF->push_back(DispatchBB);
35599 MF->push_back(DispContBB);
35600 MF->push_back(TrapBB);
35601
35602 // Insert code into the entry block that creates and registers the function
35603 // context.
35604 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
35605
35606 // Create the jump table and associated information
35607 unsigned JTE = getJumpTableEncoding();
35608 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
35609 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
35610
35611 const X86RegisterInfo &RI = TII->getRegisterInfo();
35612 // Add a register mask with no preserved registers. This results in all
35613 // registers being marked as clobbered.
35614 if (RI.hasBasePointer(*MF)) {
35615 const bool FPIs64Bit =
35616 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35617 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
35618 MFI->setRestoreBasePointer(MF);
35619
35620 Register FP = RI.getFrameRegister(*MF);
35621 Register BP = RI.getBaseRegister();
35622 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
35623 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
35624 MFI->getRestoreBasePointerOffset())
35625 .addRegMask(RI.getNoPreservedMask());
35626 } else {
35627 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
35628 .addRegMask(RI.getNoPreservedMask());
35629 }
35630
35631 // IReg is used as an index in a memory operand and therefore can't be SP
35632 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
35633 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
35634 Subtarget.is64Bit() ? 8 : 4);
35635 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
35636 .addReg(IReg)
35637 .addImm(LPadList.size());
35638 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
35639
35640 if (Subtarget.is64Bit()) {
35641 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35642 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
35643
35644 // leaq .LJTI0_0(%rip), BReg
35645 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
35646 .addReg(X86::RIP)
35647 .addImm(1)
35648 .addReg(0)
35649 .addJumpTableIndex(MJTI)
35650 .addReg(0);
35651 // movzx IReg64, IReg
35652 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
35653 .addImm(0)
35654 .addReg(IReg)
35655 .addImm(X86::sub_32bit);
35656
35657 switch (JTE) {
35658 case MachineJumpTableInfo::EK_BlockAddress:
35659 // jmpq *(BReg,IReg64,8)
35660 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
35661 .addReg(BReg)
35662 .addImm(8)
35663 .addReg(IReg64)
35664 .addImm(0)
35665 .addReg(0);
35666 break;
35667 case MachineJumpTableInfo::EK_LabelDifference32: {
35668 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
35669 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
35670 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35671
35672 // movl (BReg,IReg64,4), OReg
35673 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
35674 .addReg(BReg)
35675 .addImm(4)
35676 .addReg(IReg64)
35677 .addImm(0)
35678 .addReg(0);
35679 // movsx OReg64, OReg
35680 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
35681 // addq BReg, OReg64, TReg
35682 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
35683 .addReg(OReg64)
35684 .addReg(BReg);
35685 // jmpq *TReg
35686 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
35687 break;
35688 }
35689 default:
35690 llvm_unreachable("Unexpected jump table encoding");
35691 }
35692 } else {
35693 // jmpl *.LJTI0_0(,IReg,4)
35694 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
35695 .addReg(0)
35696 .addImm(4)
35697 .addReg(IReg)
35698 .addJumpTableIndex(MJTI)
35699 .addReg(0);
35700 }
35701
35702 // Add the jump table entries as successors to the MBB.
35703 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
35704 for (auto &LP : LPadList)
35705 if (SeenMBBs.insert(LP).second)
35706 DispContBB->addSuccessor(LP);
35707
35708 // N.B. the order the invoke BBs are processed in doesn't matter here.
35709 SmallVector<MachineBasicBlock *, 64> MBBLPads;
35710 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
35711 for (MachineBasicBlock *MBB : InvokeBBs) {
35712 // Remove the landing pad successor from the invoke block and replace it
35713 // with the new dispatch block.
35714 // Keep a copy of Successors since it's modified inside the loop.
35715 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
35716 MBB->succ_rend());
35717 // FIXME: Avoid quadratic complexity.
35718 for (auto MBBS : Successors) {
35719 if (MBBS->isEHPad()) {
35720 MBB->removeSuccessor(MBBS);
35721 MBBLPads.push_back(MBBS);
35722 }
35723 }
35724
35725 MBB->addSuccessor(DispatchBB);
35726
35727 // Find the invoke call and mark all of the callee-saved registers as
35728 // 'implicit defined' so that they're spilled. This prevents later passes
35729 // from moving instructions to before the EH block, where they would never
35730 // be executed.
35731 for (auto &II : reverse(*MBB)) {
35732 if (!II.isCall())
35733 continue;
35734
35735 DenseMap<unsigned, bool> DefRegs;
35736 for (auto &MOp : II.operands())
35737 if (MOp.isReg())
35738 DefRegs[MOp.getReg()] = true;
35739
35740 MachineInstrBuilder MIB(*MF, &II);
35741 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
35742 unsigned Reg = SavedRegs[RegIdx];
35743 if (!DefRegs[Reg])
35744 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
35745 }
35746
35747 break;
35748 }
35749 }
35750
35751 // Mark all former landing pads as non-landing pads. The dispatch is the only
35752 // landing pad now.
35753 for (auto &LP : MBBLPads)
35754 LP->setIsEHPad(false);
35755
35756 // The instruction is gone now.
35757 MI.eraseFromParent();
35758 return BB;
35759}
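// A minimal sketch of the dispatch built above, assuming 64-bit and the
// EK_BlockAddress jump-table encoding:
//
//   movl  8(FI), %idx          ; call-site index (4(FI) on 32-bit)
//   cmpl  $NumLPads, %idx
//   jae   TrapBB               ; out of range -> trap
//   leaq  .LJTI(%rip), %base
//   jmpq  *(%base,%idx,8)      ; indexed jump to the selected landing pad
//
// Every former invoke block is rewired to branch here instead of to its
// original landing pads, which are no longer EH pads themselves.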
35760
35761MachineBasicBlock *
35762X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
35763 MachineBasicBlock *BB) const {
35764 MachineFunction *MF = BB->getParent();
35765 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35766 const DebugLoc &DL = MI.getDebugLoc();
35767
35768 auto TMMImmToTMMReg = [](unsigned Imm) {
35769 assert(Imm < 8 && "Illegal tmm index");
35770 return X86::TMM0 + Imm;
35771 };
35772 switch (MI.getOpcode()) {
35773 default: llvm_unreachable("Unexpected instr type to insert");
35774 case X86::TLS_addr32:
35775 case X86::TLS_addr64:
35776 case X86::TLS_addrX32:
35777 case X86::TLS_base_addr32:
35778 case X86::TLS_base_addr64:
35779 case X86::TLS_base_addrX32:
35780 return EmitLoweredTLSAddr(MI, BB);
35781 case X86::INDIRECT_THUNK_CALL32:
35782 case X86::INDIRECT_THUNK_CALL64:
35783 case X86::INDIRECT_THUNK_TCRETURN32:
35784 case X86::INDIRECT_THUNK_TCRETURN64:
35785 return EmitLoweredIndirectThunk(MI, BB);
35786 case X86::CATCHRET:
35787 return EmitLoweredCatchRet(MI, BB);
35788 case X86::SEG_ALLOCA_32:
35789 case X86::SEG_ALLOCA_64:
35790 return EmitLoweredSegAlloca(MI, BB);
35791 case X86::PROBED_ALLOCA_32:
35792 case X86::PROBED_ALLOCA_64:
35793 return EmitLoweredProbedAlloca(MI, BB);
35794 case X86::TLSCall_32:
35795 case X86::TLSCall_64:
35796 return EmitLoweredTLSCall(MI, BB);
35797 case X86::CMOV_FR32:
35798 case X86::CMOV_FR32X:
35799 case X86::CMOV_FR64:
35800 case X86::CMOV_FR64X:
35801 case X86::CMOV_GR8:
35802 case X86::CMOV_GR16:
35803 case X86::CMOV_GR32:
35804 case X86::CMOV_RFP32:
35805 case X86::CMOV_RFP64:
35806 case X86::CMOV_RFP80:
35807 case X86::CMOV_VR64:
35808 case X86::CMOV_VR128:
35809 case X86::CMOV_VR128X:
35810 case X86::CMOV_VR256:
35811 case X86::CMOV_VR256X:
35812 case X86::CMOV_VR512:
35813 case X86::CMOV_VK1:
35814 case X86::CMOV_VK2:
35815 case X86::CMOV_VK4:
35816 case X86::CMOV_VK8:
35817 case X86::CMOV_VK16:
35818 case X86::CMOV_VK32:
35819 case X86::CMOV_VK64:
35820 return EmitLoweredSelect(MI, BB);
35821
35822 case X86::RDFLAGS32:
35823 case X86::RDFLAGS64: {
35824 unsigned PushF =
35825 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
35826 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
35827 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
35828 // Permit reads of the EFLAGS and DF registers without them being defined.
35829 // This intrinsic exists to read external processor state in flags, such as
35830 // the trap flag, interrupt flag, and direction flag, none of which are
35831 // modeled by the backend.
35832 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
35833 "Unexpected register in operand!");
35834 Push->getOperand(2).setIsUndef();
35835 assert(Push->getOperand(3).getReg() == X86::DF &&
35836 "Unexpected register in operand!");
35837 Push->getOperand(3).setIsUndef();
35838 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
35839
35840 MI.eraseFromParent(); // The pseudo is gone now.
35841 return BB;
35842 }
35843
35844 case X86::WRFLAGS32:
35845 case X86::WRFLAGS64: {
35846 unsigned Push =
35847 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
35848 unsigned PopF =
35849 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
35850 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
35851 BuildMI(*BB, MI, DL, TII->get(PopF));
35852
35853 MI.eraseFromParent(); // The pseudo is gone now.
35854 return BB;
35855 }
35856
35857 case X86::FP32_TO_INT16_IN_MEM:
35858 case X86::FP32_TO_INT32_IN_MEM:
35859 case X86::FP32_TO_INT64_IN_MEM:
35860 case X86::FP64_TO_INT16_IN_MEM:
35861 case X86::FP64_TO_INT32_IN_MEM:
35862 case X86::FP64_TO_INT64_IN_MEM:
35863 case X86::FP80_TO_INT16_IN_MEM:
35864 case X86::FP80_TO_INT32_IN_MEM:
35865 case X86::FP80_TO_INT64_IN_MEM: {
35866 // Change the floating point control register to use "round towards zero"
35867 // mode when truncating to an integer value.
35868 int OrigCWFrameIdx =
35869 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35870 addFrameReference(BuildMI(*BB, MI, DL,
35871 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
35872
35873 // Load the old value of the control word...
35874 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35875 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
35876 OrigCWFrameIdx);
35877
35878 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
35879 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35880 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
35881 .addReg(OldCW, RegState::Kill).addImm(0xC00);
35882
35883 // Extract to 16 bits.
35884 Register NewCW16 =
35885 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35886 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
35887 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35888
35889 // Prepare memory for FLDCW.
35890 int NewCWFrameIdx =
35891 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35892 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
35893 NewCWFrameIdx)
35894 .addReg(NewCW16, RegState::Kill);
35895
35896 // Reload the modified control word now...
35897 addFrameReference(BuildMI(*BB, MI, DL,
35898 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
35899
35900 // Get the X86 opcode to use.
35901 unsigned Opc;
35902 switch (MI.getOpcode()) {
35903 default: llvm_unreachable("illegal opcode!");
35904 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
35905 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
35906 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
35907 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
35908 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
35909 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
35910 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
35911 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
35912 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
35913 }
35914
35915 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35916 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
35917 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
35918
35919 // Reload the original control word now.
35920 addFrameReference(BuildMI(*BB, MI, DL,
35921 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
35922
35923 MI.eraseFromParent(); // The pseudo instruction is gone now.
35924 return BB;
35925 }
35926
35927 // xbegin
35928 case X86::XBEGIN:
35929 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
35930
35931 case X86::VAARG_64:
35932 case X86::VAARG_X32:
35933 return EmitVAARGWithCustomInserter(MI, BB);
35934
35935 case X86::EH_SjLj_SetJmp32:
35936 case X86::EH_SjLj_SetJmp64:
35937 return emitEHSjLjSetJmp(MI, BB);
35938
35939 case X86::EH_SjLj_LongJmp32:
35940 case X86::EH_SjLj_LongJmp64:
35941 return emitEHSjLjLongJmp(MI, BB);
35942
35943 case X86::Int_eh_sjlj_setup_dispatch:
35944 return EmitSjLjDispatchBlock(MI, BB);
35945
35946 case TargetOpcode::STATEPOINT:
35947 // As an implementation detail, STATEPOINT shares the STACKMAP format at
35948 // this point in the process. We diverge later.
35949 return emitPatchPoint(MI, BB);
35950
35951 case TargetOpcode::STACKMAP:
35952 case TargetOpcode::PATCHPOINT:
35953 return emitPatchPoint(MI, BB);
35954
35955 case TargetOpcode::PATCHABLE_EVENT_CALL:
35956 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
35957 return BB;
35958
35959 case X86::LCMPXCHG8B: {
35960 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35961 // In addition to the four E[ABCD] registers implied by its encoding,
35962 // CMPXCHG8B requires a memory operand. If the current target is i686 and
35963 // the current function needs a base pointer - which is ESI on i686 - the
35964 // register allocator would not be able to allocate registers for an
35965 // address of the form X(%reg, %reg, Y): there would never be enough
35966 // unreserved registers during regalloc (without the base pointer the only
35967 // option would be X(%edi, %esi, Y)).
35968 // We give the register allocator a hand by precomputing the address in
35969 // a new vreg using LEA.
35970
35971 // If it is not i686 or there is no base pointer - nothing to do here.
35972 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
35973 return BB;
35974
35975 // Even though this code does not necessarily need the base pointer to
35976 // be ESI, we check for that. The reason: if this assert fails, something
35977 // has changed in the compiler's base pointer handling, which most
35978 // probably has to be addressed somehow here.
35979 assert(TRI->getBaseRegister() == X86::ESI &&
35980 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
35981 "base pointer in mind");
35982
35983 MachineRegisterInfo &MRI = MF->getRegInfo();
35984 MVT SPTy = getPointerTy(MF->getDataLayout());
35985 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
35986 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
35987
35988 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35989 // Regalloc does not need any help when the memory operand of CMPXCHG8B
35990 // does not use an index register.
35991 if (AM.IndexReg == X86::NoRegister)
35992 return BB;
35993
35994 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
35995 // four operand definitions that are E[ABCD] registers. We skip them and
35996 // then insert the LEA.
35997 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
35998 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
35999 RMBBI->definesRegister(X86::EBX) ||
36000 RMBBI->definesRegister(X86::ECX) ||
36001 RMBBI->definesRegister(X86::EDX))) {
36002 ++RMBBI;
36003 }
36004 MachineBasicBlock::iterator MBBI(RMBBI);
36005 addFullAddress(
36006 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
36007
36008 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36009
36010 return BB;
36011 }
36012 case X86::LCMPXCHG16B_NO_RBX: {
36013 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36014 Register BasePtr = TRI->getBaseRegister();
36015 if (TRI->hasBasePointer(*MF) &&
36016 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
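// CMPXCHG16B implicitly uses RBX for the low half of the replacement value,
// so when RBX doubles as the base pointer it has to be preserved around the
// operation via the LCMPXCHG16B_SAVE_RBX pseudo below.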
36017 if (!BB->isLiveIn(BasePtr))
36018 BB->addLiveIn(BasePtr);
36019 // Save RBX into a virtual register.
36020 Register SaveRBX =
36021 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36022 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
36023 .addReg(X86::RBX);
36024 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36025 MachineInstrBuilder MIB =
36026 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36027 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36028 MIB.add(MI.getOperand(Idx));
36029 MIB.add(MI.getOperand(X86::AddrNumOperands));
36030 MIB.addReg(SaveRBX);
36031 } else {
36032 // Simple case, just copy the virtual register to RBX.
36033 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
36034 .add(MI.getOperand(X86::AddrNumOperands));
36035 MachineInstrBuilder MIB =
36036 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
36037 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36038 MIB.add(MI.getOperand(Idx));
36039 }
36040 MI.eraseFromParent();
36041 return BB;
36042 }
36043 case X86::MWAITX: {
36044 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36045 Register BasePtr = TRI->getBaseRegister();
36046 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36047 // If there is no need to save the base pointer, we generate MWAITXrrr;
36048 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
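// (MWAITX takes its timer value in EBX, which conflicts with RBX/EBX being
// reserved as the base pointer, hence the save/restore pseudo.)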
36049 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36050 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
36051 .addReg(MI.getOperand(0).getReg());
36052 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
36053 .addReg(MI.getOperand(1).getReg());
36054 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
36055 .addReg(MI.getOperand(2).getReg());
36056 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
36057 MI.eraseFromParent();
36058 } else {
36059 if (!BB->isLiveIn(BasePtr)) {
36060 BB->addLiveIn(BasePtr);
36061 }
36062 // Parameters can be copied into ECX and EAX but not EBX yet.
36063 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
36064 .addReg(MI.getOperand(0).getReg());
36065 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
36066 .addReg(MI.getOperand(1).getReg());
36067 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36068 // Save RBX into a virtual register.
36069 Register SaveRBX =
36070 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36071 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
36072 .addReg(X86::RBX);
36073 // Generate mwaitx pseudo.
36074 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36075 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
36076 .addDef(Dst) // Destination tied in with SaveRBX.
36077 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36078 .addUse(SaveRBX); // Save of base pointer.
36079 MI.eraseFromParent();
36080 }
36081 return BB;
36082 }
36083 case TargetOpcode::PREALLOCATED_SETUP: {
36084 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36085 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36086 MFI->setHasPreallocatedCall(true);
36087 int64_t PreallocatedId = MI.getOperand(0).getImm();
36088 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36089 assert(StackAdjustment != 0 && "0 stack adjustment");
36090 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36091 << StackAdjustment << "\n");
36092 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
36093 .addReg(X86::ESP)
36094 .addImm(StackAdjustment);
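// i.e. emit "subl $StackAdjustment, %esp" to reserve the preallocated
// argument area.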
36095 MI.eraseFromParent();
36096 return BB;
36097 }
36098 case TargetOpcode::PREALLOCATED_ARG: {
36099 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36100 int64_t PreallocatedId = MI.getOperand(1).getImm();
36101 int64_t ArgIdx = MI.getOperand(2).getImm();
36102 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36103 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36104 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36105 << ", arg offset " << ArgOffset << "\n");
36106 // stack pointer + offset
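// i.e. emit "leal ArgOffset(%esp), %dst" to materialize the argument address.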
36107 addRegOffset(
36108 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
36109 X86::ESP, false, ArgOffset);
36110 MI.eraseFromParent();
36111 return BB;
36112 }
36113 case X86::PTDPBSSD:
36114 case X86::PTDPBSUD:
36115 case X86::PTDPBUSD:
36116 case X86::PTDPBUUD:
36117 case X86::PTDPBF16PS: {
36118 unsigned Opc;
36119 switch (MI.getOpcode()) {
36120 default: llvm_unreachable("illegal opcode!");
36121 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36122 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36123 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36124 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36125 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36126 }
36127
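// The PTDP* pseudos carry AMX tile numbers as immediates; TMMImmToTMMReg maps
// them to the physical TMM registers. The destination tile is added both as a
// def and as an input since the real TDP* instructions tie the destination to
// a source operand.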
36128 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
36129 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36130 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36131 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36132 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36133
36134 MI.eraseFromParent(); // The pseudo is gone now.
36135 return BB;
36136 }
36137 case X86::PTILEZERO: {
36138 unsigned Imm = MI.getOperand(0).getImm();
36139 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36140 MI.eraseFromParent(); // The pseudo is gone now.
36141 return BB;
36142 }
36143 case X86::PTILELOADD:
36144 case X86::PTILELOADDT1:
36145 case X86::PTILESTORED: {
36146 unsigned Opc;
36147 switch (MI.getOpcode()) {
36148 default: llvm_unreachable("illegal opcode!");
36149 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
36150 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
36151 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
36152 }
36153
36154 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
36155 unsigned CurOp = 0;
36156 if (Opc != X86::TILESTORED)
36157 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36158 RegState::Define);
36159
36160 MIB.add(MI.getOperand(CurOp++)); // base
36161 MIB.add(MI.getOperand(CurOp++)); // scale
36162 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36163 MIB.add(MI.getOperand(CurOp++)); // displacement
36164 MIB.add(MI.getOperand(CurOp++)); // segment
36165
36166 if (Opc == X86::TILESTORED)
36167 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36168 RegState::Undef);
36169
36170 MI.eraseFromParent(); // The pseudo is gone now.
36171 return BB;
36172 }
36173 }
36174}
36175
36176//===----------------------------------------------------------------------===//
36177// X86 Optimization Hooks
36178//===----------------------------------------------------------------------===//
36179
36180bool
36181X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36182 const APInt &DemandedBits,
36183 const APInt &DemandedElts,
36184 TargetLoweringOpt &TLO) const {
36185 EVT VT = Op.getValueType();
36186 unsigned Opcode = Op.getOpcode();
36187 unsigned EltSize = VT.getScalarSizeInBits();
36188
36189 if (VT.isVector()) {
36190 // If the constant is all sign bits within the active bits, then we should
36191 // sign-extend it to the entire constant to allow it to act as a boolean
36192 // constant vector.
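// For example, with only bit 0 of each element demanded, a v4i32 constant of
// <1,1,1,1> can be sign-extended from i1 to <-1,-1,-1,-1>, the canonical
// all-ones boolean vector.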
36193 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36194 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36195 return false;
36196 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36197 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36198 continue;
36199 const APInt &Val = V.getConstantOperandAPInt(i);
36200 if (Val.getBitWidth() > Val.getNumSignBits() &&
36201 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36202 return true;
36203 }
36204 return false;
36205 };
36206 // For vectors - if we have a constant, then try to sign extend.
36207 // TODO: Handle AND/ANDN cases.
36208 unsigned ActiveBits = DemandedBits.getActiveBits();
36209 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36210 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
36211 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36212 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36213 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36214 VT.getVectorNumElements());
36215 SDValue NewC =
36216 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36217 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36218 SDValue NewOp =
36219 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36220 return TLO.CombineTo(Op, NewOp);
36221 }
36222 return false;
36223 }
36224
36225 // Only optimize Ands to prevent shrinking a constant that could be
36226 // matched by movzx.
36227 if (Opcode != ISD::AND)
36228 return false;
36229
36230 // Make sure the RHS really is a constant.
36231 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36232 if (!C)
36233 return false;
36234
36235 const APInt &Mask = C->getAPIntValue();
36236
36237 // Clear all non-demanded bits initially.
36238 APInt ShrunkMask = Mask & DemandedBits;
36239
36240 // Find the width of the shrunk mask.
36241 unsigned Width = ShrunkMask.getActiveBits();
36242
36243 // If the mask is all 0s there's nothing to do here.
36244 if (Width == 0)
36245 return false;
36246
36247 // Find the next power of 2 width, rounding up to a byte.
36248 Width = PowerOf2Ceil(std::max(Width, 8U));
36249 // Truncate the width to size to handle illegal types.
36250 Width = std::min(Width, EltSize);
36251
36252 // Calculate a possible zero extend mask for this constant.
36253 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
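// For example, with EltSize = 32, Mask = 0x1FF and DemandedBits = 0xFF:
// ShrunkMask = 0xFF, Width = 8, ZeroExtendMask = 0xFF. The new mask differs
// from Mask but is covered by Mask | ~DemandedBits, so the AND constant is
// rewritten to 0xFF, which can be matched by movzx.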
36254
36255 // If we aren't changing the mask, just return true to keep it and prevent
36256 // the caller from optimizing.
36257 if (ZeroExtendMask == Mask)
36258 return true;
36259
36260 // Make sure the new mask can be represented by a combination of mask bits
36261 // and non-demanded bits.
36262 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36263 return false;
36264
36265 // Replace the constant with the zero extend mask.
36266 SDLoc DL(Op);
36267 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36268 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36269 return TLO.CombineTo(Op, NewOp);
36270}
36271
36272void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36273 KnownBits &Known,
36274 const APInt &DemandedElts,
36275 const SelectionDAG &DAG,
36276 unsigned Depth) const {
36277 unsigned BitWidth = Known.getBitWidth();
36278 unsigned NumElts = DemandedElts.getBitWidth();
36279 unsigned Opc = Op.getOpcode();
36280 EVT VT = Op.getValueType();
36281 assert((Opc >= ISD::BUILTIN_OP_END ||
36282 Opc == ISD::INTRINSIC_WO_CHAIN ||
36283 Opc == ISD::INTRINSIC_W_CHAIN ||
36284 Opc == ISD::INTRINSIC_VOID) &&
36285 "Should use MaskedValueIsZero if you don't know whether Op"
36286 " is a target node!");
36287
36288 Known.resetAll();
36289 switch (Opc) {
36290 default: break;
36291 case X86ISD::SETCC:
36292 Known.Zero.setBitsFrom(1);
36293 break;
36294 case X86ISD::MOVMSK: {
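// MOVMSK packs one sign bit per source vector element into the low bits of
// the scalar result, so everything above NumLoBits is known zero.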
36295 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36296 Known.Zero.setBitsFrom(NumLoBits);
36297 break;
36298 }
36299 case X86ISD::PEXTRB:
36300 case X86ISD::PEXTRW: {
36301 SDValue Src = Op.getOperand(0);
36302 EVT SrcVT = Src.getValueType();
36303 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36304 Op.getConstantOperandVal(1));
36305 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36306 Known = Known.anyextOrTrunc(BitWidth);
36307 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36308 break;
36309 }
36310 case X86ISD::VSRAI:
36311 case X86ISD::VSHLI:
36312 case X86ISD::VSRLI: {
36313 unsigned ShAmt = Op.getConstantOperandVal(1);
36314 if (ShAmt >= VT.getScalarSizeInBits()) {
36315 Known.setAllZero();
36316 break;
36317 }
36318
36319 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36320 if (Opc == X86ISD::VSHLI) {
36321 Known.Zero <<= ShAmt;
36322 Known.One <<= ShAmt;
36323 // Low bits are known zero.
36324 Known.Zero.setLowBits(ShAmt);
36325 } else if (Opc == X86ISD::VSRLI) {
36326 Known.Zero.lshrInPlace(ShAmt);
36327 Known.One.lshrInPlace(ShAmt);
36328 // High bits are known zero.
36329 Known.Zero.setHighBits(ShAmt);
36330 } else {
36331 Known.Zero.ashrInPlace(ShAmt);
36332 Known.One.ashrInPlace(ShAmt);
36333 }
36334 break;
36335 }
36336 case X86ISD::PACKUS: {
36337 // PACKUS is just a truncation if the upper half is zero.
36338 APInt DemandedLHS, DemandedRHS;
36339 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36340
36341 Known.One = APInt::getAllOnes(BitWidth * 2);
36342 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36343
36344 KnownBits Known2;
36345 if (!!DemandedLHS) {
36346 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36347 Known = KnownBits::commonBits(Known, Known2);
36348 }
36349 if (!!DemandedRHS) {
36350 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36351 Known = KnownBits::commonBits(Known, Known2);
36352 }
36353
36354 if (Known.countMinLeadingZeros() < BitWidth)
36355 Known.resetAll();
36356 Known = Known.trunc(BitWidth);
36357 break;
36358 }
36359 case X86ISD::VBROADCAST: {
36360 SDValue Src = Op.getOperand(0);
36361 if (!Src.getSimpleValueType().isVector()) {
36362 Known = DAG.computeKnownBits(Src, Depth + 1);
36363 return;
36364 }
36365 break;
36366 }
36367 case X86ISD::AND: {
36368 if (Op.getResNo() == 0) {
36369 KnownBits Known2;
36370 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36371 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36372 Known &= Known2;
36373 }
36374 break;
36375 }
36376 case X86ISD::ANDNP: {
36377 KnownBits Known2;
36378 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36379 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36380
36381 // ANDNP = (~X & Y);
36382 Known.One &= Known2.Zero;
36383 Known.Zero |= Known2.One;
36384 break;
36385 }
36386 case X86ISD::FOR: {
36387 KnownBits Known2;
36388 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36389 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36390
36391 Known |= Known2;
36392 break;
36393 }
36394 case X86ISD::PSADBW: {
36395 assert(VT.getScalarType() == MVT::i64 &&
36396 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
36397 "Unexpected PSADBW types");
36398
36399 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
36400 Known.Zero.setBitsFrom(16);
36401 break;
36402 }
36403 case X86ISD::PMULUDQ: {
36404 KnownBits Known2;
36405 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36406 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36407
36408 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
36409 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
36410 Known = KnownBits::mul(Known, Known2);
36411 break;
36412 }
36413 case X86ISD::CMOV: {
36414 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
36415 // If we don't know any bits, early out.
36416 if (Known.isUnknown())
36417 break;
36418 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
36419
36420 // Only known if known in both the LHS and RHS.
36421 Known = KnownBits::commonBits(Known, Known2);
36422 break;
36423 }
36424 case X86ISD::BEXTR:
36425 case X86ISD::BEXTRI: {
36426 SDValue Op0 = Op.getOperand(0);
36427 SDValue Op1 = Op.getOperand(1);
36428
36429 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36430 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
36431 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
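// The BEXTR control operand encodes the start position in bits [7:0] and the
// extracted field length in bits [15:8].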
36432
36433 // If the length is 0, the result is 0.
36434 if (Length == 0) {
36435 Known.setAllZero();
36436 break;
36437 }
36438
36439 if ((Shift + Length) <= BitWidth) {
36440 Known = DAG.computeKnownBits(Op0, Depth + 1);
36441 Known = Known.extractBits(Length, Shift);
36442 Known = Known.zextOrTrunc(BitWidth);
36443 }
36444 }
36445 break;
36446 }
36447 case X86ISD::PDEP: {
36448 KnownBits Known2;
36449 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36450 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36451 // Zeros are retained from the mask operand. But not ones.
36452 Known.One.clearAllBits();
36453 // The result will have at least as many trailing zeros as the non-mask
36454 // operand since bits can only map to the same or higher bit position.
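// For example, a source operand known to be a multiple of 4 yields a result
// that is also a multiple of 4.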
36455 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
36456 break;
36457 }
36458 case X86ISD::PEXT: {
36459 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36460 // The result has as many leading zeros as the number of zeroes in the mask.
36461 unsigned Count = Known.Zero.countPopulation();
36462 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
36463 Known.One.clearAllBits();
36464 break;
36465 }
36466 case X86ISD::VTRUNC:
36467 case X86ISD::VTRUNCS:
36468 case X86ISD::VTRUNCUS:
36469 case X86ISD::CVTSI2P:
36470 case X86ISD::CVTUI2P:
36471 case X86ISD::CVTP2SI:
36472 case X86ISD::CVTP2UI:
36473 case X86ISD::MCVTP2SI:
36474 case X86ISD::MCVTP2UI:
36475 case X86ISD::CVTTP2SI:
36476 case X86ISD::CVTTP2UI:
36477 case X86ISD::MCVTTP2SI:
36478 case X86ISD::MCVTTP2UI:
36479 case X86ISD::MCVTSI2P:
36480 case X86ISD::MCVTUI2P:
36481 case X86ISD::VFPROUND:
36482 case X86ISD::VMFPROUND:
36483 case X86ISD::CVTPS2PH:
36484 case X86ISD::MCVTPS2PH: {
36485 // Truncations/Conversions - upper elements are known zero.
36486 EVT SrcVT = Op.getOperand(0).getValueType();
36487 if (SrcVT.isVector()) {
36488 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36489 if (NumElts > NumSrcElts &&
36490 DemandedElts.countTrailingZeros() >= NumSrcElts)
36491 Known.setAllZero();
36492 }
36493 break;
36494 }
36495 case X86ISD::STRICT_CVTTP2SI:
36496 case X86ISD::STRICT_CVTTP2UI:
36497 case X86ISD::STRICT_CVTSI2P:
36498 case X86ISD::STRICT_CVTUI2P:
36499 case X86ISD::STRICT_VFPROUND:
36500 case X86ISD::STRICT_CVTPS2PH: {
36501 // Strict Conversions - upper elements are known zero.
36502 EVT SrcVT = Op.getOperand(1).getValueType();
36503 if (SrcVT.isVector()) {
36504 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36505 if (NumElts > NumSrcElts &&
36506 DemandedElts.countTrailingZeros() >= NumSrcElts)
36507 Known.setAllZero();
36508 }
36509 break;
36510 }
36511 case X86ISD::MOVQ2DQ: {
36512 // Move from MMX to XMM. Upper half of XMM should be 0.
36513 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
36514 Known.setAllZero();
36515 break;
36516 }
36517 }
36518
36519 // Handle target shuffles.
36520 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36521 if (isTargetShuffle(Opc)) {
36522 SmallVector<int, 64> Mask;
36523 SmallVector<SDValue, 2> Ops;
36524 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36525 unsigned NumOps = Ops.size();
36526 unsigned NumElts = VT.getVectorNumElements();
36527 if (Mask.size() == NumElts) {
36528 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36529 Known.Zero.setAllBits(); Known.One.setAllBits();
36530 for (unsigned i = 0; i != NumElts; ++i) {
36531 if (!DemandedElts[i])
36532 continue;
36533 int M = Mask[i];
36534 if (M == SM_SentinelUndef) {
36535 // For UNDEF elements, we don't know anything about the common state
36536 // of the shuffle result.
36537 Known.resetAll();
36538 break;
36539 }
36540 if (M == SM_SentinelZero) {
36541 Known.One.clearAllBits();
36542 continue;
36543 }
36544 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36545 "Shuffle index out of range");
36546
36547 unsigned OpIdx = (unsigned)M / NumElts;
36548 unsigned EltIdx = (unsigned)M % NumElts;
36549 if (Ops[OpIdx].getValueType() != VT) {
36550 // TODO - handle target shuffle ops with different value types.
36551 Known.resetAll();
36552 break;
36553 }
36554 DemandedOps[OpIdx].setBit(EltIdx);
36555 }
36556 // Known bits are the values that are shared by every demanded element.
36557 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
36558 if (!DemandedOps[i])
36559 continue;
36560 KnownBits Known2 =
36561 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
36562 Known = KnownBits::commonBits(Known, Known2);
36563 }
36564 }
36565 }
36566 }
36567}
36568
36569unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
36570 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
36571 unsigned Depth) const {
36572 EVT VT = Op.getValueType();
36573 unsigned VTBits = VT.getScalarSizeInBits();
36574 unsigned Opcode = Op.getOpcode();
36575 switch (Opcode) {
36576 case X86ISD::SETCC_CARRY:
36577 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
36578 return VTBits;
36579
36580 case X86ISD::VTRUNC: {
36581 SDValue Src = Op.getOperand(0);
36582 MVT SrcVT = Src.getSimpleValueType();
36583 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
36584 assert(VTBits < NumSrcBits && "Illegal truncation input type");
36585 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36586 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
36587 if (Tmp > (NumSrcBits - VTBits))
36588 return Tmp - (NumSrcBits - VTBits);
36589 return 1;
36590 }
36591
36592 case X86ISD::PACKSS: {
36593 // PACKSS is just a truncation if the sign bits extend to the packed size.
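// For example, when packing i16 elements down to i8, a source with 12 known
// sign bits yields a result with 12 - 8 = 4 known sign bits, while 8 or fewer
// known sign bits gives no guarantee beyond the minimum of 1.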
36594 APInt DemandedLHS, DemandedRHS;
36595 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
36596 DemandedRHS);
36597
36598 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
36599 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
36600 if (!!DemandedLHS)
36601 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36602 if (!!DemandedRHS)
36603 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36604 unsigned Tmp = std::min(Tmp0, Tmp1);
36605 if (Tmp > (SrcBits - VTBits))
36606 return Tmp - (SrcBits - VTBits);
36607 return 1;
36608 }
36609
36610 case X86ISD::VBROADCAST: {
36611 SDValue Src = Op.getOperand(0);
36612 if (!Src.getSimpleValueType().isVector())
36613 return DAG.ComputeNumSignBits(Src, Depth + 1);
36614 break;
36615 }
36616
36617 case X86ISD::VSHLI: {
36618 SDValue Src = Op.getOperand(0);
36619 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
36620 if (ShiftVal.uge(VTBits))
36621 return VTBits; // Shifted all bits out --> zero.
36622 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36623 if (ShiftVal.uge(Tmp))
36624 return 1; // Shifted all sign bits out --> unknown.
36625 return Tmp - ShiftVal.getZExtValue();
36626 }
36627
36628 case X86ISD::VSRAI: {
36629 SDValue Src = Op.getOperand(0);
36630 APInt ShiftVal = Op.getConstantOperandAPInt(1);
36631 if (ShiftVal.uge(VTBits - 1))
36632 return VTBits; // Sign splat.
36633 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36634 ShiftVal += Tmp;
36635 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
36636 }
36637
36638 case X86ISD::FSETCC:
36639 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
36640 if (VT == MVT::f32 || VT == MVT::f64 ||
36641 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
36642 return VTBits;
36643 break;
36644
36645 case X86ISD::PCMPGT:
36646 case X86ISD::PCMPEQ:
36647 case X86ISD::CMPP:
36648 case X86ISD::VPCOM:
36649 case X86ISD::VPCOMU:
36650 // Vector compares return zero/all-bits result values.
36651 return VTBits;
36652
36653 case X86ISD::ANDNP: {
36654 unsigned Tmp0 =
36655 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
36656 if (Tmp0 == 1) return 1; // Early out.
36657 unsigned Tmp1 =
36658 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
36659 return std::min(Tmp0, Tmp1);
36660 }
36661
36662 case X86ISD::CMOV: {
36663 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
36664 if (Tmp0 == 1) return 1; // Early out.
36665 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
36666 return std::min(Tmp0, Tmp1);
36667 }
36668 }
36669
36670 // Handle target shuffles.
36671 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36672 if (isTargetShuffle(Opcode)) {
36673 SmallVector<int, 64> Mask;
36674 SmallVector<SDValue, 2> Ops;
36675 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36676 unsigned NumOps = Ops.size();
36677 unsigned NumElts = VT.getVectorNumElements();
36678 if (Mask.size() == NumElts) {
36679 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36680 for (unsigned i = 0; i != NumElts; ++i) {
36681 if (!DemandedElts[i])
36682 continue;
36683 int M = Mask[i];
36684 if (M == SM_SentinelUndef) {
36685 // For UNDEF elements, we don't know anything about the common state
36686 // of the shuffle result.
36687 return 1;
36688 } else if (M == SM_SentinelZero) {
36689 // Zero = all sign bits.
36690 continue;
36691 }
36692 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36693 "Shuffle index out of range");
36694
36695 unsigned OpIdx = (unsigned)M / NumElts;
36696 unsigned EltIdx = (unsigned)M % NumElts;
36697 if (Ops[OpIdx].getValueType() != VT) {
36698 // TODO - handle target shuffle ops with different value types.
36699 return 1;
36700 }
36701 DemandedOps[OpIdx].setBit(EltIdx);
36702 }
36703 unsigned Tmp0 = VTBits;
36704 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
36705 if (!DemandedOps[i])
36706 continue;
36707 unsigned Tmp1 =
36708 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
36709 Tmp0 = std::min(Tmp0, Tmp1);
36710 }
36711 return Tmp0;
36712 }
36713 }
36714 }
36715
36716 // Fallback case.
36717 return 1;
36718}
36719
36720SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
36721 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
36722 return N->getOperand(0);
36723 return N;
36724}
36725
36726// Helper to look for a normal load that can be narrowed into a vzload with the
36727// specified VT and memory VT. Returns SDValue() on failure.
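// (A vzload, X86ISD::VZEXT_LOAD, loads only MemVT worth of data and
// zero-fills the remaining vector elements.)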
36728static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
36729 SelectionDAG &DAG) {
36730 // Can't if the load is volatile or atomic.
36731 if (!LN->isSimple())
36732 return SDValue();
36733
36734 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36735 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36736 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
36737 LN->getPointerInfo(), LN->getOriginalAlign(),
36738 LN->getMemOperand()->getFlags());
36739}
36740
36741// Attempt to match a combined shuffle mask against supported unary shuffle
36742// instructions.
36743// TODO: Investigate sharing more of this with shuffle lowering.
36744static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36745 bool AllowFloatDomain, bool AllowIntDomain,
36746 SDValue V1, const X86Subtarget &Subtarget,
36747 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
36748 unsigned NumMaskElts = Mask.size();
36749 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
36750
36751 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
36752 if (Mask[0] == 0 &&
36753 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
36754 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
36755 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36756 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
36757 Shuffle = X86ISD::VZEXT_MOVL;
36758 if (MaskEltSize == 16)
36759 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36760 else
36761 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36762 return true;
36763 }
36764 }
36765
36766 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
36767 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
36768 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
36769 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
36770 unsigned MaxScale = 64 / MaskEltSize;
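// For example, a v16i8 mask tries Scale = 2, 4 and 8, i.e. extension to i16,
// i32 and i64 elements respectively.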
36771 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
36772 bool MatchAny = true;
36773 bool MatchZero = true;
36774 unsigned NumDstElts = NumMaskElts / Scale;
36775 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
36776 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
36777 MatchAny = MatchZero = false;
36778 break;
36779 }
36780 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
36781 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
36782 }
36783 if (MatchAny || MatchZero) {
36784 assert(MatchZero && "Failed to match zext but matched aext?");
36785 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
36786 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
36787 MVT::getIntegerVT(MaskEltSize);
36788 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
36789
36790 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
36791 if (SrcVT.getVectorNumElements() != NumDstElts)
36792 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
36793
36794 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
36795 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
36796 return true;
36797 }
36798 }
36799 }
36800
36801 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
36802 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
36803 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
36804 isUndefOrEqual(Mask[0], 0) &&
36805 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
36806 Shuffle = X86ISD::VZEXT_MOVL;
36807 if (MaskEltSize == 16)
36808 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36809 else
36810 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36811 return true;
36812 }
36813
36814 // Check if we have SSE3, which will let us use MOVDDUP etc. These
36815 // instructions are no slower than UNPCKLPD but have the option to
36816 // fold the input operand into even an unaligned memory load.
36817 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
36818 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
36819 Shuffle = X86ISD::MOVDDUP;
36820 SrcVT = DstVT = MVT::v2f64;
36821 return true;
36822 }
36823 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36824 Shuffle = X86ISD::MOVSLDUP;
36825 SrcVT = DstVT = MVT::v4f32;
36826 return true;
36827 }
36828 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
36829 Shuffle = X86ISD::MOVSHDUP;
36830 SrcVT = DstVT = MVT::v4f32;
36831 return true;
36832 }
36833 }
36834
36835 if (MaskVT.is256BitVector() && AllowFloatDomain) {
36836 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
36837 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36838 Shuffle = X86ISD::MOVDDUP;
36839 SrcVT = DstVT = MVT::v4f64;
36840 return true;
36841 }
36842 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36843 Shuffle = X86ISD::MOVSLDUP;
36844 SrcVT = DstVT = MVT::v8f32;
36845 return true;
36846 }
36847 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
36848 Shuffle = X86ISD::MOVSHDUP;
36849 SrcVT = DstVT = MVT::v8f32;
36850 return true;
36851 }
36852 }
36853
36854 if (MaskVT.is512BitVector() && AllowFloatDomain) {
36855 assert(Subtarget.hasAVX512() &&
36856 "AVX512 required for 512-bit vector shuffles");
36857 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36858 Shuffle = X86ISD::MOVDDUP;
36859 SrcVT = DstVT = MVT::v8f64;
36860 return true;
36861 }
36862 if (isTargetShuffleEquivalent(
36863 MaskVT, Mask,
36864 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
36865 Shuffle = X86ISD::MOVSLDUP;
36866 SrcVT = DstVT = MVT::v16f32;
36867 return true;
36868 }
36869 if (isTargetShuffleEquivalent(
36870 MaskVT, Mask,
36871 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
36872 Shuffle = X86ISD::MOVSHDUP;
36873 SrcVT = DstVT = MVT::v16f32;
36874 return true;
36875 }
36876 }
36877
36878 return false;
36879}
36880
36881// Attempt to match a combined shuffle mask against supported unary immediate
36882// permute instructions.
36883// TODO: Investigate sharing more of this with shuffle lowering.
36884static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
36885 const APInt &Zeroable,
36886 bool AllowFloatDomain, bool AllowIntDomain,
36887 const X86Subtarget &Subtarget,
36888 unsigned &Shuffle, MVT &ShuffleVT,
36889 unsigned &PermuteImm) {
36890 unsigned NumMaskElts = Mask.size();
36891 unsigned InputSizeInBits = MaskVT.getSizeInBits();
36892 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
36893 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
36894 bool ContainsZeros = isAnyZero(Mask);
36895
36896 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
36897 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
36898 // Check for lane crossing permutes.
36899 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
36900 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
36901 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
36902 Shuffle = X86ISD::VPERMI;
36903 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
36904 PermuteImm = getV4X86ShuffleImm(Mask);
36905 return true;
36906 }
36907 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
36908 SmallVector<int, 4> RepeatedMask;
36909 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
36910 Shuffle = X86ISD::VPERMI;
36911 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
36912 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
36913 return true;
36914 }
36915 }
36916 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
36917 // VPERMILPD can permute with a non-repeating shuffle.
36918 Shuffle = X86ISD::VPERMILPI;
36919 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
36920 PermuteImm = 0;
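// For example, a v4f64 mask <1,0,3,2> yields PermuteImm = 0b0101: bit i is
// set when result element i takes the odd element of its 128-bit pair.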
36921 for (int i = 0, e = Mask.size(); i != e; ++i) {
36922 int M = Mask[i];
36923 if (M == SM_SentinelUndef)
36924 continue;
36925 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
36926 PermuteImm |= (M & 1) << i;
36927 }
36928 return true;
36929 }
36930 }
36931
36932 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
36933 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
36934 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
36935 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
36936 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
36937 SmallVector<int, 4> RepeatedMask;
36938 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36939 // Narrow the repeated mask to create 32-bit element permutes.
36940 SmallVector<int, 4> WordMask = RepeatedMask;
36941 if (MaskScalarSizeInBits == 64)
36942 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
36943
36944 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
36945 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
36946 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
36947 PermuteImm = getV4X86ShuffleImm(WordMask);
36948 return true;
36949 }
36950 }
36951
36952 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
36953 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
36954 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36955 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36956 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36957 SmallVector<int, 4> RepeatedMask;
36958 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36959 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
36960 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
36961
36962 // PSHUFLW: permute lower 4 elements only.
36963 if (isUndefOrInRange(LoMask, 0, 4) &&
36964 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
36965 Shuffle = X86ISD::PSHUFLW;
36966 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36967 PermuteImm = getV4X86ShuffleImm(LoMask);
36968 return true;
36969 }
36970
36971 // PSHUFHW: permute upper 4 elements only.
36972 if (isUndefOrInRange(HiMask, 4, 8) &&
36973 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
36974 // Offset the HiMask so that we can create the shuffle immediate.
36975 int OffsetHiMask[4];
36976 for (int i = 0; i != 4; ++i)
36977 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
36978
36979 Shuffle = X86ISD::PSHUFHW;
36980 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36981 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
36982 return true;
36983 }
36984 }
36985 }
36986
36987 // Attempt to match against byte/bit shifts.
36988 if (AllowIntDomain &&
36989 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36990 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36991 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36992 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
36993 Mask, 0, Zeroable, Subtarget);
36994 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
36995 32 <= ShuffleVT.getScalarSizeInBits())) {
36996 PermuteImm = (unsigned)ShiftAmt;
36997 return true;
36998 }
36999 }
37000
37001 // Attempt to match against bit rotates.
37002 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37003 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37004 Subtarget.hasAVX512())) {
37005 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37006 Subtarget, Mask);
37007 if (0 < RotateAmt) {
37008 Shuffle = X86ISD::VROTLI;
37009 PermuteImm = (unsigned)RotateAmt;
37010 return true;
37011 }
37012 }
37013
37014 return false;
37015}
37016
37017// Attempt to match a combined unary shuffle mask against supported binary
37018// shuffle instructions.
37019// TODO: Investigate sharing more of this with shuffle lowering.
37020static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37021 bool AllowFloatDomain, bool AllowIntDomain,
37022 SDValue &V1, SDValue &V2, const SDLoc &DL,
37023 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37024 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37025 bool IsUnary) {
37026 unsigned NumMaskElts = Mask.size();
37027 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37028
37029 if (MaskVT.is128BitVector()) {
37030 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
37031 V2 = V1;
37032 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37033 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37034 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37035 return true;
37036 }
37037 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
37038 V2 = V1;
37039 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37040 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37041 return true;
37042 }
37043 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
37044 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37045 std::swap(V1, V2);
37046 Shuffle = X86ISD::MOVSD;
37047 SrcVT = DstVT = MVT::v2f64;
37048 return true;
37049 }
37050 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
37051 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37052 Shuffle = X86ISD::MOVSS;
37053 SrcVT = DstVT = MVT::v4f32;
37054 return true;
37055 }
37056 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
37057 Subtarget.hasFP16()) {
37058 Shuffle = X86ISD::MOVSH;
37059 SrcVT = DstVT = MVT::v8f16;
37060 return true;
37061 }
37062 }
37063
37064 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
37065 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37066 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37067 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37068 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37069 Subtarget)) {
37070 DstVT = MaskVT;
37071 return true;
37072 }
37073 }
37074
37075 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37076 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37077 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37078 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37079 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37080 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
37081 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37082 Subtarget)) {
37083 SrcVT = DstVT = MaskVT;
37084 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37085 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37086 return true;
37087 }
37088 }
37089
37090 // Attempt to match against an OR if we're performing a blend shuffle and the
37091 // non-blended source element is zero in each case.
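// For example, blending <a,0,c,0> with <0,b,0,d> can be lowered as a plain
// bitwise OR of the two vectors (in the integer domain).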
37092 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37093 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37094 bool IsBlend = true;
37095 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37096 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37097 unsigned Scale1 = NumV1Elts / NumMaskElts;
37098 unsigned Scale2 = NumV2Elts / NumMaskElts;
37099 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37100 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37101 for (unsigned i = 0; i != NumMaskElts; ++i) {
37102 int M = Mask[i];
37103 if (M == SM_SentinelUndef)
37104 continue;
37105 if (M == SM_SentinelZero) {
37106 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37107 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37108 continue;
37109 }
37110 if (M == (int)i) {
37111 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37112 continue;
37113 }
37114 if (M == (int)(i + NumMaskElts)) {
37115 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37116 continue;
37117 }
37118 IsBlend = false;
37119 break;
37120 }
37121 if (IsBlend) {
37122 if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
37123 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
37124 Shuffle = ISD::OR;
37125 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37126 return true;
37127 }
37128 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37129 // FIXME: handle mismatched sizes?
37130 // TODO: investigate if `ISD::OR` handling in
37131 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37132 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37133 unsigned NumElts = V.getValueType().getVectorNumElements();
37134 KnownBits Known(NumElts);
37135 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37136 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37137 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37138 if (PeepholeKnown.isZero())
37139 Known.Zero.setBit(EltIdx);
37140 if (PeepholeKnown.isAllOnes())
37141 Known.One.setBit(EltIdx);
37142 }
37143 return Known;
37144 };
37145
37146 KnownBits V1Known = computeKnownBitsElementWise(V1);
37147 KnownBits V2Known = computeKnownBitsElementWise(V2);
37148
37149 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37150 int M = Mask[i];
37151 if (M == SM_SentinelUndef)
37152 continue;
37153 if (M == SM_SentinelZero) {
37154 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37155 continue;
37156 }
37157 if (M == (int)i) {
37158 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37159 continue;
37160 }
37161 if (M == (int)(i + NumMaskElts)) {
37162 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37163 continue;
37164 }
37165 llvm_unreachable("will not get here.");
37166 }
37167 if (IsBlend) {
37168 Shuffle = ISD::OR;
37169 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37170 return true;
37171 }
37172 }
37173 }
37174 }
37175
37176 return false;
37177}
37178
37179static bool matchBinaryPermuteShuffle(
37180 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37181 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37182 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37183 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37184 unsigned NumMaskElts = Mask.size();
37185 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37186
37187 // Attempt to match against VALIGND/VALIGNQ rotate.
37188 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37189 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37190 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37191 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37192 if (!isAnyZero(Mask)) {
37193 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37194 if (0 < Rotation) {
37195 Shuffle = X86ISD::VALIGN;
37196 if (EltSizeInBits == 64)
37197 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37198 else
37199 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37200 PermuteImm = Rotation;
37201 return true;
37202 }
37203 }
37204 }
37205
37206 // Attempt to match against PALIGNR byte rotate.
37207 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37208 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37209 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37210 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
37211 if (0 < ByteRotation) {
37212 Shuffle = X86ISD::PALIGNR;
37213 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
37214 PermuteImm = ByteRotation;
37215 return true;
37216 }
37217 }
37218
37219 // Attempt to combine to X86ISD::BLENDI.
37220 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
37221 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
37222 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
37223 uint64_t BlendMask = 0;
37224 bool ForceV1Zero = false, ForceV2Zero = false;
37225 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
37226 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
37227 ForceV2Zero, BlendMask)) {
37228 if (MaskVT == MVT::v16i16) {
37229 // We can only use v16i16 PBLENDW if the lanes are repeated.
37230 SmallVector<int, 8> RepeatedMask;
37231 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
37232 RepeatedMask)) {
37233 assert(RepeatedMask.size() == 8 &&
37234 "Repeated mask size doesn't match!");
37235 PermuteImm = 0;
37236 for (int i = 0; i < 8; ++i)
37237 if (RepeatedMask[i] >= 8)
37238 PermuteImm |= 1 << i;
37239 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37240 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37241 Shuffle = X86ISD::BLENDI;
37242 ShuffleVT = MaskVT;
37243 return true;
37244 }
37245 } else {
37246 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37247 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37248 PermuteImm = (unsigned)BlendMask;
37249 Shuffle = X86ISD::BLENDI;
37250 ShuffleVT = MaskVT;
37251 return true;
37252 }
37253 }
37254 }
37255
37256 // Attempt to combine to INSERTPS, but only if it has elements that need to
37257 // be set to zero.
37258 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37259 MaskVT.is128BitVector() && isAnyZero(Mask) &&
37260 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37261 Shuffle = X86ISD::INSERTPS;
37262 ShuffleVT = MVT::v4f32;
37263 return true;
37264 }
37265
37266 // Attempt to combine to SHUFPD.
37267 if (AllowFloatDomain && EltSizeInBits == 64 &&
37268 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37269 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37270 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37271 bool ForceV1Zero = false, ForceV2Zero = false;
37272 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
37273 PermuteImm, Mask, Zeroable)) {
37274 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37275 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37276 Shuffle = X86ISD::SHUFP;
37277 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
37278 return true;
37279 }
37280 }
37281
37282 // Attempt to combine to SHUFPS.
37283 if (AllowFloatDomain && EltSizeInBits == 32 &&
37284 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
37285 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37286 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37287 SmallVector<int, 4> RepeatedMask;
37288 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
37289 // Match each half of the repeated mask, to determine if it's just
37290 // referencing one of the vectors, is zeroable or entirely undef.
37291 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
37292 int M0 = RepeatedMask[Offset];
37293 int M1 = RepeatedMask[Offset + 1];
37294
37295 if (isUndefInRange(RepeatedMask, Offset, 2)) {
37296 return DAG.getUNDEF(MaskVT);
37297 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
37298 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
37299 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
37300 return getZeroVector(MaskVT, Subtarget, DAG, DL);
37301 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
37302 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37303 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37304 return V1;
37305 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
37306 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37307 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37308 return V2;
37309 }
37310
37311 return SDValue();
37312 };
37313
37314 int ShufMask[4] = {-1, -1, -1, -1};
37315 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
37316 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
37317
37318 if (Lo && Hi) {
37319 V1 = Lo;
37320 V2 = Hi;
37321 Shuffle = X86ISD::SHUFP;
37322 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
37323 PermuteImm = getV4X86ShuffleImm(ShufMask);
37324 return true;
37325 }
37326 }
37327 }
37328
37329 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
37330 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37331 MaskVT.is128BitVector() &&
37332 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37333 Shuffle = X86ISD::INSERTPS;
37334 ShuffleVT = MVT::v4f32;
37335 return true;
37336 }
37337
37338 return false;
37339}
37340
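matchBinaryPermuteShuffle encodes its result into small immediates: the BLENDI path packs one bit per repeated-mask element (set when that element comes from the second source), and the SHUFPS/SHUFPD paths pack two selection bits per element (see getV4X86ShuffleImm above). A self-contained sketch of both packings, assuming sentinel-free inputs; these helper names are illustrative, not the LLVM API:

    #include <cstdint>
    #include <vector>

    // PBLENDW/BLENDI-style immediate: bit i is set when element i is taken
    // from the second source (repeated-mask values 8..15); undef (negative)
    // entries and values 0..7 select the first source.
    uint8_t buildBlendImmediate(const std::vector<int> &RepeatedMask) {
      uint8_t Imm = 0;
      for (int i = 0; i < 8 && i < (int)RepeatedMask.size(); ++i)
        if (RepeatedMask[i] >= 8)
          Imm |= 1u << i;
      return Imm; // e.g. {0,9,2,11,4,13,6,15} -> 0xAA
    }

    // SHUFPS-style immediate: two bits per output element, each selecting one
    // of four source positions; undef entries (-1) default to position 0.
    uint8_t buildV4ShuffleImmediate(const int (&Sel)[4]) {
      uint8_t Imm = 0;
      for (int i = 0; i != 4; ++i)
        Imm |= (uint8_t)((Sel[i] < 0 ? 0 : (Sel[i] & 3)) << (2 * i));
      return Imm; // e.g. {3,2,1,0} -> 0x1B
    }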
37341static SDValue combineX86ShuffleChainWithExtract(
37342 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37343 bool HasVariableMask, bool AllowVariableCrossLaneMask,
37344 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37345 const X86Subtarget &Subtarget);
37346
37347/// Combine an arbitrary chain of shuffles into a single instruction if
37348/// possible.
37349///
37350/// This is the leaf of the recursive combine below. When we have found some
37351/// chain of single-use x86 shuffle instructions and accumulated the combined
37352/// shuffle mask represented by them, this will try to pattern match that mask
37353/// into either a single instruction if there is a special purpose instruction
37354/// for this operation, or into a PSHUFB instruction which is a fully general
37355/// instruction but should only be used to replace chains over a certain depth.
37356static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
37357 ArrayRef<int> BaseMask, int Depth,
37358 bool HasVariableMask,
37359 bool AllowVariableCrossLaneMask,
37360 bool AllowVariablePerLaneMask,
37361 SelectionDAG &DAG,
37362 const X86Subtarget &Subtarget) {
37363 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
37364 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
37365 "Unexpected number of shuffle inputs!");
37366
37367 SDLoc DL(Root);
37368 MVT RootVT = Root.getSimpleValueType();
37369 unsigned RootSizeInBits = RootVT.getSizeInBits();
37370 unsigned NumRootElts = RootVT.getVectorNumElements();
37371
37372 // Canonicalize shuffle input op to the requested type.
37373 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
37374 if (VT.getSizeInBits() > Op.getValueSizeInBits())
37375 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
37376 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
37377 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
37378 return DAG.getBitcast(VT, Op);
37379 };
37380
37381 // Find the inputs that enter the chain. Note that multiple uses are OK
37382 // here; we're not going to remove the operands we find.
37383 bool UnaryShuffle = (Inputs.size() == 1);
37384 SDValue V1 = peekThroughBitcasts(Inputs[0]);
37385 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
37386 : peekThroughBitcasts(Inputs[1]));
37387
37388 MVT VT1 = V1.getSimpleValueType();
37389 MVT VT2 = V2.getSimpleValueType();
37390 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
37391 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
37392
37393 SDValue Res;
37394
37395 unsigned NumBaseMaskElts = BaseMask.size();
37396 if (NumBaseMaskElts == 1) {
37397 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
37398 return CanonicalizeShuffleInput(RootVT, V1);
37399 }
37400
37401 bool OptForSize = DAG.shouldOptForSize();
37402 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
37403 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
37404 (RootVT.isFloatingPoint() && Depth >= 1) ||
37405 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
37406
37407 // Don't combine if we are a AVX512/EVEX target and the mask element size
37408 // is different from the root element size - this would prevent writemasks
37409 // from being reused.
37410 bool IsMaskedShuffle = false;
37411 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
37412 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
37413 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
37414 IsMaskedShuffle = true;
37415 }
37416 }
37417
37418 // If we are shuffling a broadcast (and not introducing zeros) then
37419 // we can just use the broadcast directly. This works for smaller broadcast
37420 // elements as well, as they already repeat across each mask element.
37421 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
37422 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37423 V1.getValueSizeInBits() >= RootSizeInBits) {
37424 return CanonicalizeShuffleInput(RootVT, V1);
37425 }
37426
37427 SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
37428
37429 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
37430 // etc. can be simplified.
37431 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
37432 SmallVector<int> ScaledMask, IdentityMask;
37433 unsigned NumElts = VT1.getVectorNumElements();
37434 if (Mask.size() <= NumElts &&
37435 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
37436 for (unsigned i = 0; i != NumElts; ++i)
37437 IdentityMask.push_back(i);
37438 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
37439 return CanonicalizeShuffleInput(RootVT, V1);
37440 }
37441 }
37442
37443 // Handle 128/256-bit lane shuffles of 512-bit vectors.
37444 if (RootVT.is512BitVector() &&
37445 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
37446 // If the upper subvectors are zeroable, then an extract+insert is more
37447 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
37448 // to zero the upper subvectors.
37449 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
37450 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37451 return SDValue(); // Nothing to do!
37452 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
37453 "Unexpected lane shuffle");
37454 Res = CanonicalizeShuffleInput(RootVT, V1);
37455 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
37456 bool UseZero = isAnyZero(Mask);
37457 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
37458 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
37459 }
37460
37461 // Narrow shuffle mask to v4x128.
37462 SmallVector<int, 4> ScaledMask;
37463 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
37464 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
37465
37466 // Try to lower to vshuf64x2/vshuf32x4.
37467 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
37468 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
37469 SelectionDAG &DAG) {
37470 unsigned PermMask = 0;
37471 // Ensure elements came from the same Op.
37472 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
37473 for (int i = 0; i < 4; ++i) {
37474 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
37475 if (ScaledMask[i] < 0)
37476 continue;
37477
37478 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
37479 unsigned OpIndex = i / 2;
37480 if (Ops[OpIndex].isUndef())
37481 Ops[OpIndex] = Op;
37482 else if (Ops[OpIndex] != Op)
37483 return SDValue();
37484
37485 // Convert the 128-bit shuffle mask selection values into 128-bit
37486 // selection bits defined by a vshuf64x2 instruction's immediate control
37487 // byte.
37488 PermMask |= (ScaledMask[i] % 4) << (i * 2);
37489 }
37490
37491 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
37492 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
37493 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
37494 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37495 };
37496
37497 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
37498 // doesn't work because our mask is for 128 bits and we don't have an MVT
37499 // to match that.
37500 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
37501 isUndefOrInRange(ScaledMask[1], 0, 2) &&
37502 isUndefOrInRange(ScaledMask[2], 2, 4) &&
37503 isUndefOrInRange(ScaledMask[3], 2, 4) &&
37504 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
37505 ScaledMask[0] == (ScaledMask[2] % 2)) &&
37506 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
37507 ScaledMask[1] == (ScaledMask[3] % 2));
37508
37509 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
37510 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37511 return SDValue(); // Nothing to do!
37512 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
37513 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
37514 return DAG.getBitcast(RootVT, V);
37515 }
37516 }
37517
37518 // Handle 128-bit lane shuffles of 256-bit vectors.
37519 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
37520 // If the upper half is zeroable, then an extract+insert is more optimal
37521 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
37522 // zero the upper half.
37523 if (isUndefOrZero(Mask[1])) {
37524 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37525 return SDValue(); // Nothing to do!
37526 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
37527 Res = CanonicalizeShuffleInput(RootVT, V1);
37528 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
37529 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
37530 256);
37531 }
37532
37533 // If we're inserting the low subvector, an insert-subvector 'concat'
37534 // pattern is quicker than VPERM2X128.
37535 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
37536 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
37537 !Subtarget.hasAVX2()) {
37538 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37539 return SDValue(); // Nothing to do!
37540 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
37541 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
37542 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
37543 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
37544 }
37545
37546 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
37547 return SDValue(); // Nothing to do!
37548
37549 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
37550 // we need to use the zeroing feature.
37551 // Prefer blends for sequential shuffles unless we are optimizing for size.
37552 if (UnaryShuffle &&
37553 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
37554 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
37555 unsigned PermMask = 0;
37556 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
37557 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
37558 return DAG.getNode(
37559 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
37560 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
37561 }
37562
37563 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37564 return SDValue(); // Nothing to do!
37565
37566 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
37567 if (!UnaryShuffle && !IsMaskedShuffle) {
37568 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
37569 "Unexpected shuffle sentinel value");
37570 // Prefer blends to X86ISD::VPERM2X128.
37571 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
37572 unsigned PermMask = 0;
37573 PermMask |= ((Mask[0] & 3) << 0);
37574 PermMask |= ((Mask[1] & 3) << 4);
37575 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
37576 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
37577 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
37578 CanonicalizeShuffleInput(RootVT, LHS),
37579 CanonicalizeShuffleInput(RootVT, RHS),
37580 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37581 }
37582 }
37583 }
37584
37585 // For masks that have been widened to 128-bit elements or more,
37586 // narrow back down to 64-bit elements.
37587 if (BaseMaskEltSizeInBits > 64) {
37588 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
37589 int MaskScale = BaseMaskEltSizeInBits / 64;
37590 SmallVector<int, 64> ScaledMask;
37591 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37592 Mask = std::move(ScaledMask);
37593 }
37594
37595 // For masked shuffles, we're trying to match the root width for better
37596 // writemask folding, attempt to scale the mask.
37597 // TODO - variable shuffles might need this to be widened again.
37598 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
37599 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
37600 int MaskScale = NumRootElts / Mask.size();
37601 SmallVector<int, 64> ScaledMask;
37602 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37603 Mask = std::move(ScaledMask);
37604 }
37605
37606 unsigned NumMaskElts = Mask.size();
37607 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
37608
37609 // Determine the effective mask value type.
37610 FloatDomain &= (32 <= MaskEltSizeInBits);
37611 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
37612 : MVT::getIntegerVT(MaskEltSizeInBits);
37613 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
37614
37615 // Only allow legal mask types.
37616 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37617 return SDValue();
37618
37619 // Attempt to match the mask against known shuffle patterns.
37620 MVT ShuffleSrcVT, ShuffleVT;
37621 unsigned Shuffle, PermuteImm;
37622
37623 // Which shuffle domains are permitted?
37624 // Permit domain crossing at higher combine depths.
37625 // TODO: Should we indicate which domain is preferred if both are allowed?
37626 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
37627 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
37628 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
37629
37630 // Determine zeroable mask elements.
37631 APInt KnownUndef, KnownZero;
37632 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
37633 APInt Zeroable = KnownUndef | KnownZero;
37634
37635 if (UnaryShuffle) {
37636 // Attempt to match against broadcast-from-vector.
37637 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
37638 if ((Subtarget.hasAVX2() ||
37639 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
37640 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
37641 if (isUndefOrEqual(Mask, 0)) {
37642 if (V1.getValueType() == MaskVT &&
37643 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37644 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
37645 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37646 return SDValue(); // Nothing to do!
37647 Res = V1.getOperand(0);
37648 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37649 return DAG.getBitcast(RootVT, Res);
37650 }
37651 if (Subtarget.hasAVX2()) {
37652 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37653 return SDValue(); // Nothing to do!
37654 Res = CanonicalizeShuffleInput(MaskVT, V1);
37655 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37656 return DAG.getBitcast(RootVT, Res);
37657 }
37658 }
37659 }
37660
37661 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
37662 Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
37663 (!IsMaskedShuffle ||
37664 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37665 if (Depth == 0 && Root.getOpcode() == Shuffle)
37666 return SDValue(); // Nothing to do!
37667 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37668 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
37669 return DAG.getBitcast(RootVT, Res);
37670 }
37671
37672 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37673 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
37674 PermuteImm) &&
37675 (!IsMaskedShuffle ||
37676 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37677 if (Depth == 0 && Root.getOpcode() == Shuffle)
37678 return SDValue(); // Nothing to do!
37679 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
37680 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
37681 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37682 return DAG.getBitcast(RootVT, Res);
37683 }
37684 }
37685
37686 // Attempt to combine to INSERTPS, but only if the inserted element has come
37687 // from a scalar.
37688 // TODO: Handle other insertions here as well?
37689 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
37690 Subtarget.hasSSE41() &&
37691 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
37692 if (MaskEltSizeInBits == 32) {
37693 SDValue SrcV1 = V1, SrcV2 = V2;
37694 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
37695 DAG) &&
37696 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37697 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37698 return SDValue(); // Nothing to do!
37699 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37700 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
37701 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
37702 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37703 return DAG.getBitcast(RootVT, Res);
37704 }
37705 }
37706 if (MaskEltSizeInBits == 64 &&
37707 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
37708 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37709 V2.getScalarValueSizeInBits() <= 32) {
37710 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37711 return SDValue(); // Nothing to do!
37712 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
37713 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37714 CanonicalizeShuffleInput(MVT::v4f32, V1),
37715 CanonicalizeShuffleInput(MVT::v4f32, V2),
37716 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37717 return DAG.getBitcast(RootVT, Res);
37718 }
37719 }
37720
37721 SDValue NewV1 = V1; // Save operands in case early exit happens.
37722 SDValue NewV2 = V2;
37723 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
37724 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
37725 ShuffleVT, UnaryShuffle) &&
37726 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37727 if (Depth == 0 && Root.getOpcode() == Shuffle)
37728 return SDValue(); // Nothing to do!
37729 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
37730 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
37731 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
37732 return DAG.getBitcast(RootVT, Res);
37733 }
37734
37735 NewV1 = V1; // Save operands in case early exit happens.
37736 NewV2 = V2;
37737 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37738 AllowIntDomain, NewV1, NewV2, DL, DAG,
37739 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
37740 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37741 if (Depth == 0 && Root.getOpcode() == Shuffle)
37742 return SDValue(); // Nothing to do!
37743 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
37744 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
37745 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
37746 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37747 return DAG.getBitcast(RootVT, Res);
37748 }
37749
37750 // Typically from here on, we need an integer version of MaskVT.
37751 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
37752 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
37753
37754 // Annoyingly, SSE4A instructions don't map into the above match helpers.
37755 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
37756 uint64_t BitLen, BitIdx;
37757 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
37758 Zeroable)) {
37759 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
37760 return SDValue(); // Nothing to do!
37761 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37762 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
37763 DAG.getTargetConstant(BitLen, DL, MVT::i8),
37764 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37765 return DAG.getBitcast(RootVT, Res);
37766 }
37767
37768 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
37769 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
37770 return SDValue(); // Nothing to do!
37771 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37772 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
37773 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
37774 DAG.getTargetConstant(BitLen, DL, MVT::i8),
37775 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37776 return DAG.getBitcast(RootVT, Res);
37777 }
37778 }
37779
37780 // Match shuffle against TRUNCATE patterns.
37781 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
37782 // Match against a VTRUNC instruction, accounting for src/dst sizes.
37783 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
37784 Subtarget)) {
37785 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
37786 ShuffleSrcVT.getVectorNumElements();
37787 unsigned Opc =
37788 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
37789 if (Depth == 0 && Root.getOpcode() == Opc)
37790 return SDValue(); // Nothing to do!
37791 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37792 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
37793 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
37794 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
37795 return DAG.getBitcast(RootVT, Res);
37796 }
37797
37798 // Do we need a more general binary truncation pattern?
37799 if (RootSizeInBits < 512 &&
37800 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
37801 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
37802 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
37803 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
37804 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
37805 return SDValue(); // Nothing to do!
37806 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37807 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
37808 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37809 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
37810 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37811 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
37812 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
37813 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
37814 return DAG.getBitcast(RootVT, Res);
37815 }
37816 }
37817
37818 // Don't try to re-form single instruction chains under any circumstances now
37819 // that we've done encoding canonicalization for them.
37820 if (Depth < 1)
37821 return SDValue();
37822
37823 // Depth threshold above which we can efficiently use variable mask shuffles.
37824 int VariableCrossLaneShuffleDepth =
37825 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
37826 int VariablePerLaneShuffleDepth =
37827 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
37828 AllowVariableCrossLaneMask &=
37829 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
37830 AllowVariablePerLaneMask &=
37831 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
37832 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
37833 // higher depth before combining them.
37834 bool AllowBWIVPERMV3 =
37835 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
37836
37837 bool MaskContainsZeros = isAnyZero(Mask);
37838
37839 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
37840 // If we have a single input lane-crossing shuffle then lower to VPERMV.
37841 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
37842 if (Subtarget.hasAVX2() &&
37843 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
37844 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
37845 Res = CanonicalizeShuffleInput(MaskVT, V1);
37846 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
37847 return DAG.getBitcast(RootVT, Res);
37848 }
37849 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
37850 if ((Subtarget.hasAVX512() &&
37851 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37852 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37853 (Subtarget.hasBWI() &&
37854 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37855 (Subtarget.hasVBMI() &&
37856 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
37857 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37858 V2 = DAG.getUNDEF(MaskVT);
37859 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37860 return DAG.getBitcast(RootVT, Res);
37861 }
37862 }
37863
37864 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
37865 // vector as the second source (non-VLX will pad to 512-bit shuffles).
37866 if (UnaryShuffle && AllowVariableCrossLaneMask &&
37867 ((Subtarget.hasAVX512() &&
37868 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37869 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37870 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
37871 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37872 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37873 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37874 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37875 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37876 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
37877 for (unsigned i = 0; i != NumMaskElts; ++i)
37878 if (Mask[i] == SM_SentinelZero)
37879 Mask[i] = NumMaskElts + i;
37880 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37881 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
37882 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37883 return DAG.getBitcast(RootVT, Res);
37884 }
37885
37886 // If that failed and either input is extracted then try to combine as a
37887 // shuffle with the larger type.
37888 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
37889 Inputs, Root, BaseMask, Depth, HasVariableMask,
37890 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
37891 Subtarget))
37892 return WideShuffle;
37893
37894 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
37895 // (non-VLX will pad to 512-bit shuffles).
37896 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
37897 ((Subtarget.hasAVX512() &&
37898 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37899 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37900 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
37901 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
37902 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37903 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37904 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37905 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37906 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37907 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37908 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37909 return DAG.getBitcast(RootVT, Res);
37910 }
37911 return SDValue();
37912 }
37913
37914 // See if we can combine a single input shuffle with zeros to a bit-mask,
37915 // which is much simpler than any shuffle.
37916 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
37917 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
37918 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
37919 APInt Zero = APInt::getZero(MaskEltSizeInBits);
37920 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
37921 APInt UndefElts(NumMaskElts, 0);
37922 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
37923 for (unsigned i = 0; i != NumMaskElts; ++i) {
37924 int M = Mask[i];
37925 if (M == SM_SentinelUndef) {
37926 UndefElts.setBit(i);
37927 continue;
37928 }
37929 if (M == SM_SentinelZero)
37930 continue;
37931 EltBits[i] = AllOnes;
37932 }
37933 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
37934 Res = CanonicalizeShuffleInput(MaskVT, V1);
37935 unsigned AndOpcode =
37936 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
37937 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
37938 return DAG.getBitcast(RootVT, Res);
37939 }
37940
37941 // If we have a single input shuffle with different shuffle patterns in the
37942 // 128-bit lanes, use the variable mask to VPERMILPS.
37943 // TODO Combine other mask types at higher depths.
37944 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37945 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
37946 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
37947 SmallVector<SDValue, 16> VPermIdx;
37948 for (int M : Mask) {
37949 SDValue Idx =
37950 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
37951 VPermIdx.push_back(Idx);
37952 }
37953 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
37954 Res = CanonicalizeShuffleInput(MaskVT, V1);
37955 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
37956 return DAG.getBitcast(RootVT, Res);
37957 }
37958
37959 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
37960 // to VPERMIL2PD/VPERMIL2PS.
37961 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
37962 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
37963 MaskVT == MVT::v8f32)) {
37964 // VPERMIL2 Operation.
37965 // Bits[3] - Match Bit.
37966 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
37967 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
37968 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
37969 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
37970 SmallVector<int, 8> VPerm2Idx;
37971 unsigned M2ZImm = 0;
37972 for (int M : Mask) {
37973 if (M == SM_SentinelUndef) {
37974 VPerm2Idx.push_back(-1);
37975 continue;
37976 }
37977 if (M == SM_SentinelZero) {
37978 M2ZImm = 2;
37979 VPerm2Idx.push_back(8);
37980 continue;
37981 }
37982 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
37983 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
37984 VPerm2Idx.push_back(Index);
37985 }
37986 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37987 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37988 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
37989 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
37990 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
37991 return DAG.getBitcast(RootVT, Res);
37992 }
37993
37994 // If we have 3 or more shuffle instructions or a chain involving a variable
37995 // mask, we can replace them with a single PSHUFB instruction profitably.
37996 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
37997 // instructions, but in practice PSHUFB tends to be *very* fast so we're
37998 // more aggressive.
37999 if (UnaryShuffle && AllowVariablePerLaneMask &&
38000 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38001 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38002 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38003 SmallVector<SDValue, 16> PSHUFBMask;
38004 int NumBytes = RootVT.getSizeInBits() / 8;
38005 int Ratio = NumBytes / NumMaskElts;
38006 for (int i = 0; i < NumBytes; ++i) {
38007 int M = Mask[i / Ratio];
38008 if (M == SM_SentinelUndef) {
38009 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38010 continue;
38011 }
38012 if (M == SM_SentinelZero) {
38013 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38014 continue;
38015 }
38016 M = Ratio * M + i % Ratio;
38017 assert((M / 16) == (i / 16) && "Lane crossing detected");
38018 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38019 }
38020 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38021 Res = CanonicalizeShuffleInput(ByteVT, V1);
38022 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38023 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38024 return DAG.getBitcast(RootVT, Res);
38025 }
38026
38027 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38028 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38029 // slower than PSHUFB on targets that support both.
38030 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38031 Subtarget.hasXOP()) {
38032 // VPPERM Mask Operation
38033 // Bits[4:0] - Byte Index (0 - 31)
38034 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
38035 SmallVector<SDValue, 16> VPPERMMask;
38036 int NumBytes = 16;
38037 int Ratio = NumBytes / NumMaskElts;
38038 for (int i = 0; i < NumBytes; ++i) {
38039 int M = Mask[i / Ratio];
38040 if (M == SM_SentinelUndef) {
38041 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38042 continue;
38043 }
38044 if (M == SM_SentinelZero) {
38045 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38046 continue;
38047 }
38048 M = Ratio * M + i % Ratio;
38049 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38050 }
38051 MVT ByteVT = MVT::v16i8;
38052 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38053 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38054 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38055 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38056 return DAG.getBitcast(RootVT, Res);
38057 }
38058
38059 // If that failed and either input is extracted then try to combine as a
38060 // shuffle with the larger type.
38061 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38062 Inputs, Root, BaseMask, Depth, HasVariableMask,
38063 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38064 return WideShuffle;
38065
38066 // If we have a dual input shuffle then lower to VPERMV3,
38067 // (non-VLX will pad to 512-bit shuffles)
38068 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38069 ((Subtarget.hasAVX512() &&
38070 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38071 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38072 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38073 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38074 MaskVT == MVT::v16i32)) ||
38075 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38076 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38077 MaskVT == MVT::v32i16)) ||
38078 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38079 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38080 MaskVT == MVT::v64i8)))) {
38081 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38082 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38083 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38084 return DAG.getBitcast(RootVT, Res);
38085 }
38086
38087 // Failed to find any combines.
38088 return SDValue();
38089}
38090
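Both the PSHUFB and VPPERM combines in combineX86ShuffleChain scale the element-level mask down to bytes and use 0x80 as the "zero this byte" control value. A standalone model of that byte-mask expansion, using plain integers instead of SDValues; unlike the listing, this sketch also zeroes undef lanes rather than leaving them undef, and the sentinel constants are assumptions that follow the listing's SM_Sentinel values:

    #include <cstdint>
    #include <vector>

    constexpr int SentinelUndef = -1;
    constexpr int SentinelZero = -2;

    // Expand an element-level shuffle mask into a per-byte PSHUFB control
    // mask. NumBytes is the vector width in bytes; 0x80 in a control byte
    // forces the corresponding destination byte to zero.
    std::vector<uint8_t> buildPSHUFBMask(const std::vector<int> &Mask,
                                         int NumBytes) {
      int Ratio = NumBytes / (int)Mask.size(); // Bytes per mask element.
      std::vector<uint8_t> ByteMask(NumBytes);
      for (int i = 0; i < NumBytes; ++i) {
        int M = Mask[i / Ratio];
        if (M == SentinelZero || M == SentinelUndef) {
          // The real combine keeps undef lanes undef; zeroing them keeps the
          // sketch simple.
          ByteMask[i] = 0x80;
          continue;
        }
        // Index of the source byte within the source vector.
        ByteMask[i] = (uint8_t)(Ratio * M + i % Ratio);
      }
      return ByteMask;
    }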
38091// Combine an arbitrary chain of shuffles + extract_subvectors into a single
38092// instruction if possible.
38093//
38094// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38095// type size to attempt to combine:
38096// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38097// -->
38098// extract_subvector(shuffle(x,y,m2),0)
38099static SDValue combineX86ShuffleChainWithExtract(
38100 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38101 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38102 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38103 const X86Subtarget &Subtarget) {
38104 unsigned NumMaskElts = BaseMask.size();
38105 unsigned NumInputs = Inputs.size();
38106 if (NumInputs == 0)
38107 return SDValue();
38108
38109 EVT RootVT = Root.getValueType();
38110 unsigned RootSizeInBits = RootVT.getSizeInBits();
38111 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38112
38113 // Bail if we have any smaller inputs.
38114 if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
38115 return Input.getValueSizeInBits() < RootSizeInBits;
38116 }))
38117 return SDValue();
38118
38119 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38120 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
38121
38122 // Peek through subvectors.
38123 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
38124 unsigned WideSizeInBits = RootSizeInBits;
38125 for (unsigned i = 0; i != NumInputs; ++i) {
38126 SDValue &Src = WideInputs[i];
38127 unsigned &Offset = Offsets[i];
38128 Src = peekThroughBitcasts(Src);
38129 EVT BaseVT = Src.getValueType();
38130 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
38131 Offset += Src.getConstantOperandVal(1);
38132 Src = Src.getOperand(0);
38133 }
38134 WideSizeInBits = std::max(WideSizeInBits,
38135 (unsigned)Src.getValueSizeInBits());
38136 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
38137 "Unexpected subvector extraction");
38138 Offset /= BaseVT.getVectorNumElements();
38139 Offset *= NumMaskElts;
38140 }
38141
38142 // Bail if we're always extracting from the lowest subvectors;
38143 // combineX86ShuffleChain should match this for the current width.
38144 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
38145 return SDValue();
38146
38147 unsigned Scale = WideSizeInBits / RootSizeInBits;
38148 assert((WideSizeInBits % RootSizeInBits) == 0 &&
38149 "Unexpected subvector extraction");
38150
38151 // If the src vector types aren't the same, see if we can extend
38152 // them to match each other.
38153 // TODO: Support different scalar types?
38154 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
38155 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
38156 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
38157 Op.getValueType().getScalarType() != WideSVT;
38158 }))
38159 return SDValue();
38160
38161 // Create new mask for larger type.
38162 for (unsigned i = 1; i != NumInputs; ++i)
38163 Offsets[i] += i * Scale * NumMaskElts;
38164
38165 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
38166 for (int &M : WideMask) {
38167 if (M < 0)
38168 continue;
38169 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
38170 }
38171 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38172
38173 // Remove unused/repeated shuffle source ops.
38174 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38175 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38176
38177 if (WideInputs.size() > 2)
38178 return SDValue();
38179
38180 // Increase depth for every upper subvector we've peeked through.
38181 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
38182
38183 // Attempt to combine wider chain.
38184 // TODO: Can we use a better Root?
38185 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38186 WideInputs.back().getValueSizeInBits()
38187 ? WideInputs.front()
38188 : WideInputs.back();
38189 if (SDValue WideShuffle =
38190 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
38191 HasVariableMask, AllowVariableCrossLaneMask,
38192 AllowVariablePerLaneMask, DAG, Subtarget)) {
38193 WideShuffle =
38194 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
38195 return DAG.getBitcast(RootVT, WideShuffle);
38196 }
38197 return SDValue();
38198}
38199
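combineX86ShuffleChainWithExtract rewrites shuffle(extract_subvector(x,c1), extract_subvector(y,c2), m1) as extract_subvector(shuffle(x,y,m2), 0) by biasing each mask element with its input's subvector offset and padding the mask out to the wide width. A minimal index-only sketch of that remapping; Offsets[j] is assumed to already hold input j's starting position, in mask-element units, within the concatenated wide vector (as the listing computes before building WideMask):

    #include <vector>

    constexpr int SentinelUndef = -1;

    // Widen a NumMaskElts-element shuffle mask so it indexes into the wider,
    // concatenated input space of Scale * NumMaskElts elements.
    std::vector<int> widenShuffleMask(const std::vector<int> &BaseMask,
                                      const std::vector<int> &Offsets,
                                      int NumMaskElts, int Scale) {
      std::vector<int> WideMask(BaseMask);
      for (int &M : WideMask) {
        if (M < 0)
          continue; // Keep undef/zero sentinels unchanged.
        M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
      }
      // Pad with undef so the mask covers the full widened vector.
      WideMask.insert(WideMask.end(), (Scale - 1) * NumMaskElts, SentinelUndef);
      return WideMask;
    }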
38200// Canonicalize the combined shuffle mask chain with horizontal ops.
38201// NOTE: This may update the Ops and Mask.
38202static SDValue canonicalizeShuffleMaskWithHorizOp(
38203 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
38204 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
38205 const X86Subtarget &Subtarget) {
38206 if (Mask.empty() || Ops.empty())
38207 return SDValue();
38208
38209 SmallVector<SDValue> BC;
38210 for (SDValue Op : Ops)
38211 BC.push_back(peekThroughBitcasts(Op));
38212
38213 // All ops must be the same horizop + type.
38214 SDValue BC0 = BC[0];
38215 EVT VT0 = BC0.getValueType();
38216 unsigned Opcode0 = BC0.getOpcode();
38217 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
38218 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
38219 }))
38220 return SDValue();
38221
38222 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
38223 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
38224 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
38225 if (!isHoriz && !isPack)
38226 return SDValue();
38227
38228 // Do all ops have a single use?
38229 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
38230 return Op.hasOneUse() &&
38231 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
38232 });
38233
38234 int NumElts = VT0.getVectorNumElements();
38235 int NumLanes = VT0.getSizeInBits() / 128;
38236 int NumEltsPerLane = NumElts / NumLanes;
38237 int NumHalfEltsPerLane = NumEltsPerLane / 2;
38238 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
38239 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38240
38241 if (NumEltsPerLane >= 4 &&
38242 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
38243 SmallVector<int> LaneMask, ScaledMask;
38244 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
38245 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
38246 // See if we can remove the shuffle by resorting the HOP chain so that
38247 // the HOP args are pre-shuffled.
38248 // TODO: Generalize to any sized/depth chain.
38249 // TODO: Add support for PACKSS/PACKUS.
38250 if (isHoriz) {
38251 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
38252 auto GetHOpSrc = [&](int M) {
38253 if (M == SM_SentinelUndef)
38254 return DAG.getUNDEF(VT0);
38255 if (M == SM_SentinelZero)
38256 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
38257 SDValue Src0 = BC[M / 4];
38258 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
38259 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
38260 return Src1.getOperand(M % 2);
38261 return SDValue();
38262 };
38263 SDValue M0 = GetHOpSrc(ScaledMask[0]);
38264 SDValue M1 = GetHOpSrc(ScaledMask[1]);
38265 SDValue M2 = GetHOpSrc(ScaledMask[2]);
38266 SDValue M3 = GetHOpSrc(ScaledMask[3]);
38267 if (M0 && M1 && M2 && M3) {
38268 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
38269 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
38270 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38271 }
38272 }
38273 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
38274 if (Ops.size() >= 2) {
38275 SDValue LHS, RHS;
38276 auto GetHOpSrc = [&](int M, int &OutM) {
38277 // TODO: Support SM_SentinelZero
38278 if (M < 0)
38279 return M == SM_SentinelUndef;
38280 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
38281 if (!LHS || LHS == Src) {
38282 LHS = Src;
38283 OutM = (M % 2);
38284 return true;
38285 }
38286 if (!RHS || RHS == Src) {
38287 RHS = Src;
38288 OutM = (M % 2) + 2;
38289 return true;
38290 }
38291 return false;
38292 };
38293 int PostMask[4] = {-1, -1, -1, -1};
38294 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
38295 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
38296 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
38297 GetHOpSrc(ScaledMask[3], PostMask[3])) {
38298 LHS = DAG.getBitcast(SrcVT, LHS);
38299 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
38300 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38301 // Use SHUFPS for the permute so this will work on SSE3 targets,
38302 // shuffle combining and domain handling will simplify this later on.
38303 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
38304 Res = DAG.getBitcast(ShuffleVT, Res);
38305 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
38306 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
38307 }
38308 }
38309 }
38310 }
38311
38312 if (2 < Ops.size())
38313 return SDValue();
38314
38315 SDValue BC1 = BC[BC.size() - 1];
38316 if (Mask.size() == VT0.getVectorNumElements()) {
38317 // Canonicalize binary shuffles of horizontal ops that use the
38318 // same sources to a unary shuffle.
38319 // TODO: Try to perform this fold even if the shuffle remains.
38320 if (Ops.size() == 2) {
38321 auto ContainsOps = [](SDValue HOp, SDValue Op) {
38322 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
38323 };
38324 // Commute if all BC0's ops are contained in BC1.
38325 if (ContainsOps(BC1, BC0.getOperand(0)) &&
38326 ContainsOps(BC1, BC0.getOperand(1))) {
38327 ShuffleVectorSDNode::commuteMask(Mask);
38328 std::swap(Ops[0], Ops[1]);
38329 std::swap(BC0, BC1);
38330 }
38331
38332 // If BC1 can be represented by BC0, then convert to unary shuffle.
38333 if (ContainsOps(BC0, BC1.getOperand(0)) &&
38334 ContainsOps(BC0, BC1.getOperand(1))) {
38335 for (int &M : Mask) {
38336 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
38337 continue;
38338 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
38339 M -= NumElts + (SubLane * NumHalfEltsPerLane);
38340 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
38341 M += NumHalfEltsPerLane;
38342 }
38343 }
38344 }
38345
38346 // Canonicalize unary horizontal ops to only refer to lower halves.
38347 for (int i = 0; i != NumElts; ++i) {
38348 int &M = Mask[i];
38349 if (isUndefOrZero(M))
38350 continue;
38351 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
38352 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38353 M -= NumHalfEltsPerLane;
38354 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
38355 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38356 M -= NumHalfEltsPerLane;
38357 }
38358 }
38359
38360 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
38361 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
38362 // represents the LHS/RHS inputs for the lower/upper halves.
38363 SmallVector<int, 16> TargetMask128, WideMask128;
38364 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
38365 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
38366 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
38367 bool SingleOp = (Ops.size() == 1);
38368 if (isPack || OneUseOps ||
38369 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
38370 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
38371 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
38372 Lo = Lo.getOperand(WideMask128[0] & 1);
38373 Hi = Hi.getOperand(WideMask128[1] & 1);
38374 if (SingleOp) {
38375 SDValue Undef = DAG.getUNDEF(SrcVT);
38376 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
38377 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
38378 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
38379 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
38380 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
38381 }
38382 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
38383 }
38384 }
38385
38386 return SDValue();
38387}
38388
38389// Attempt to constant fold all of the constant source ops.
38390// Returns the constant-folded node if the entire shuffle folds to a constant.
38391// TODO: Extend this to merge multiple constant Ops and update the mask.
38392static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
38393 ArrayRef<int> Mask, SDValue Root,
38394 bool HasVariableMask,
38395 SelectionDAG &DAG,
38396 const X86Subtarget &Subtarget) {
38397 MVT VT = Root.getSimpleValueType();
38398
38399 unsigned SizeInBits = VT.getSizeInBits();
38400 unsigned NumMaskElts = Mask.size();
38401 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
38402 unsigned NumOps = Ops.size();
38403
38404 // Extract constant bits from each source op.
38405 bool OneUseConstantOp = false;
38406 SmallVector<APInt, 16> UndefEltsOps(NumOps);
38407 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
38408 for (unsigned i = 0; i != NumOps; ++i) {
38409 SDValue SrcOp = Ops[i];
38410 OneUseConstantOp |= SrcOp.hasOneUse();
38411 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
38412 RawBitsOps[i]))
38413 return SDValue();
38414 }
38415
38416 // If we're optimizing for size, only fold if at least one of the constants is
38417 // only used once or the combined shuffle has included a variable mask
38418 // shuffle; this is to avoid constant pool bloat.
38419 bool IsOptimizingSize = DAG.shouldOptForSize();
38420 if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
38421 return SDValue();
38422
38423 // Shuffle the constant bits according to the mask.
38424 SDLoc DL(Root);
38425 APInt UndefElts(NumMaskElts, 0);
38426 APInt ZeroElts(NumMaskElts, 0);
38427 APInt ConstantElts(NumMaskElts, 0);
38428 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
38429 APInt::getZero(MaskSizeInBits));
38430 for (unsigned i = 0; i != NumMaskElts; ++i) {
38431 int M = Mask[i];
38432 if (M == SM_SentinelUndef) {
38433 UndefElts.setBit(i);
38434 continue;
38435 } else if (M == SM_SentinelZero) {
38436 ZeroElts.setBit(i);
38437 continue;
38438 }
38439 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
38440
38441 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
38442 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
38443
38444 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
38445 if (SrcUndefElts[SrcMaskIdx]) {
38446 UndefElts.setBit(i);
38447 continue;
38448 }
38449
38450 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
38451 APInt &Bits = SrcEltBits[SrcMaskIdx];
38452 if (!Bits) {
38453 ZeroElts.setBit(i);
38454 continue;
38455 }
38456
38457 ConstantElts.setBit(i);
38458 ConstantBitData[i] = Bits;
38459 }
38460 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
38461
38462 // Attempt to create a zero vector.
38463 if ((UndefElts | ZeroElts).isAllOnes())
38464 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
38465
38466 // Create the constant data.
38467 MVT MaskSVT;
38468 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
38469 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
38470 else
38471 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
38472
38473 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
38474 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
38475 return SDValue();
38476
38477 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
38478 return DAG.getBitcast(VT, CstOp);
38479}
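// Illustrative sketch (simplified standalone model, not part of this file):
// the constant-folding step above walks the mask and, for each non-sentinel
// lane, copies the referenced element's bits from the matching source op. A
// minimal scalar model of that indexing, using uint64_t element values and the
// same negative-sentinel convention (-1 undef, -2 zero):
#include <cstdint>
#include <vector>

static std::vector<uint64_t>
foldShuffleOfConstants(const std::vector<std::vector<uint64_t>> &Ops,
                       const std::vector<int> &Mask) {
  const size_t NumElts = Mask.size();
  std::vector<uint64_t> Folded(NumElts, 0);
  for (size_t i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef (-1) and zero (-2) lanes both fold to 0 here
    size_t OpIdx = size_t(M) / NumElts;  // which constant source op
    size_t EltIdx = size_t(M) % NumElts; // which element within that op
    Folded[i] = Ops[OpIdx][EltIdx];
  }
  return Folded;
}
// e.g. foldShuffleOfConstants({{1,2,3,4},{5,6,7,8}}, {0,4,-1,7}) == {1,5,0,8}.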
38480
38481namespace llvm {
38482 namespace X86 {
38483 enum {
38484 MaxShuffleCombineDepth = 8
38485 };
38486 }
38487} // namespace llvm
38488
38489/// Fully generic combining of x86 shuffle instructions.
38490///
38491/// This should be the last combine run over the x86 shuffle instructions. Once
38492/// they have been fully optimized, this will recursively consider all chains
38493/// of single-use shuffle instructions, build a generic model of the cumulative
38494/// shuffle operation, and check for simpler instructions which implement this
38495/// operation. We use this primarily for two purposes:
38496///
38497/// 1) Collapse generic shuffles to specialized single instructions when
38498/// equivalent. In most cases, this is just an encoding size win, but
38499/// sometimes we will collapse multiple generic shuffles into a single
38500/// special-purpose shuffle.
38501/// 2) Look for sequences of shuffle instructions with 3 or more total
38502/// instructions, and replace them with the slightly more expensive SSSE3
38503/// PSHUFB instruction if available. We do this as the last combining step
38504/// to ensure we avoid using PSHUFB if we can implement the shuffle with
38505/// a suitable short sequence of other instructions. The PSHUFB will either
38506/// use a register or have to read from memory and so is slightly (but only
38507/// slightly) more expensive than the other shuffle instructions.
38508///
38509/// Because this is inherently a quadratic operation (for each shuffle in
38510/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
38511/// This should never be an issue in practice as the shuffle lowering doesn't
38512/// produce sequences of more than 8 instructions.
38513///
38514/// FIXME: We will currently miss some cases where the redundant shuffling
38515/// would simplify under the threshold for PSHUFB formation because of
38516/// combine-ordering. To fix this, we should do the redundant instruction
38517/// combining in this recursive walk.
38518static SDValue combineX86ShufflesRecursively(
38519 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
38520 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
38521 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
38522 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38523 const X86Subtarget &Subtarget) {
38524   assert(RootMask.size() > 0 &&
38525          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
38526          "Illegal shuffle root mask");
38527 MVT RootVT = Root.getSimpleValueType();
38528 assert(RootVT.isVector() && "Shuffles operate on vector types!");
38529 unsigned RootSizeInBits = RootVT.getSizeInBits();
38530
38531 // Bound the depth of our recursive combine because this is ultimately
38532 // quadratic in nature.
38533 if (Depth >= MaxDepth)
38534 return SDValue();
38535
38536 // Directly rip through bitcasts to find the underlying operand.
38537 SDValue Op = SrcOps[SrcOpIndex];
38538 Op = peekThroughOneUseBitcasts(Op);
38539
38540 EVT VT = Op.getValueType();
38541 if (!VT.isVector() || !VT.isSimple())
38542 return SDValue(); // Bail if we hit a non-simple non-vector.
38543
38544 // FIXME: Just bail on f16 for now.
38545 if (VT.getVectorElementType() == MVT::f16)
38546 return SDValue();
38547
38548   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
38549          "Can only combine shuffles upto size of the root op.");
38550
38551 // Extract target shuffle mask and resolve sentinels and inputs.
38552 // TODO - determine Op's demanded elts from RootMask.
38553 SmallVector<int, 64> OpMask;
38554 SmallVector<SDValue, 2> OpInputs;
38555 APInt OpUndef, OpZero;
38556 APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
38557 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
38558 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
38559 OpZero, DAG, Depth, false)) {
38560 // Shuffle inputs must not be larger than the shuffle result.
38561 // TODO: Relax this for single input faux shuffles (e.g. trunc).
38562 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
38563 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
38564 }))
38565 return SDValue();
38566 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38567 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
38568 !isNullConstant(Op.getOperand(1))) {
38569 SDValue SrcVec = Op.getOperand(0);
38570 int ExtractIdx = Op.getConstantOperandVal(1);
38571 unsigned NumElts = VT.getVectorNumElements();
38572 OpInputs.assign({SrcVec});
38573 OpMask.assign(NumElts, SM_SentinelUndef);
38574 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
38575 OpZero = OpUndef = APInt::getNullValue(NumElts);
38576 } else {
38577 return SDValue();
38578 }
38579
38580 // If the shuffle result was smaller than the root, we need to adjust the
38581 // mask indices and pad the mask with undefs.
38582 if (RootSizeInBits > VT.getSizeInBits()) {
38583 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
38584 unsigned OpMaskSize = OpMask.size();
38585 if (OpInputs.size() > 1) {
38586 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
38587 for (int &M : OpMask) {
38588 if (M < 0)
38589 continue;
38590 int EltIdx = M % OpMaskSize;
38591 int OpIdx = M / OpMaskSize;
38592 M = (PaddedMaskSize * OpIdx) + EltIdx;
38593 }
38594 }
38595 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
38596 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
38597 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
38598 }
38599
38600 SmallVector<int, 64> Mask;
38601 SmallVector<SDValue, 16> Ops;
38602
38603 // We don't need to merge masks if the root is empty.
38604 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
38605 if (EmptyRoot) {
38606 // Only resolve zeros if it will remove an input, otherwise we might end
38607 // up in an infinite loop.
38608 bool ResolveKnownZeros = true;
38609 if (!OpZero.isZero()) {
38610 APInt UsedInputs = APInt::getZero(OpInputs.size());
38611 for (int i = 0, e = OpMask.size(); i != e; ++i) {
38612 int M = OpMask[i];
38613 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
38614 continue;
38615 UsedInputs.setBit(M / OpMask.size());
38616 if (UsedInputs.isAllOnes()) {
38617 ResolveKnownZeros = false;
38618 break;
38619 }
38620 }
38621 }
38622 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
38623 ResolveKnownZeros);
38624
38625 Mask = OpMask;
38626 Ops.append(OpInputs.begin(), OpInputs.end());
38627 } else {
38628 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
38629
38630 // Add the inputs to the Ops list, avoiding duplicates.
38631 Ops.append(SrcOps.begin(), SrcOps.end());
38632
38633 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
38634 // Attempt to find an existing match.
38635 SDValue InputBC = peekThroughBitcasts(Input);
38636 for (int i = 0, e = Ops.size(); i < e; ++i)
38637 if (InputBC == peekThroughBitcasts(Ops[i]))
38638 return i;
38639 // Match failed - should we replace an existing Op?
38640 if (InsertionPoint >= 0) {
38641 Ops[InsertionPoint] = Input;
38642 return InsertionPoint;
38643 }
38644 // Add to the end of the Ops list.
38645 Ops.push_back(Input);
38646 return Ops.size() - 1;
38647 };
38648
38649 SmallVector<int, 2> OpInputIdx;
38650 for (SDValue OpInput : OpInputs)
38651 OpInputIdx.push_back(
38652 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
38653
38654     assert(((RootMask.size() > OpMask.size() &&
38655              RootMask.size() % OpMask.size() == 0) ||
38656             (OpMask.size() > RootMask.size() &&
38657              OpMask.size() % RootMask.size() == 0) ||
38658             OpMask.size() == RootMask.size()) &&
38659            "The smaller number of elements must divide the larger.");
38660
38661 // This function can be performance-critical, so we rely on the power-of-2
38662 // knowledge that we have about the mask sizes to replace div/rem ops with
38663 // bit-masks and shifts.
38664     assert(isPowerOf2_32(RootMask.size()) &&
38665            "Non-power-of-2 shuffle mask sizes");
38666     assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
38667 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
38668 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
38669
38670 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
38671 unsigned RootRatio =
38672 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
38673 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
38674     assert((RootRatio == 1 || OpRatio == 1) &&
38675            "Must not have a ratio for both incoming and op masks!");
38676
38677     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
38678     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
38679     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
38680 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
38681 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
38682
38683 Mask.resize(MaskWidth, SM_SentinelUndef);
38684
38685 // Merge this shuffle operation's mask into our accumulated mask. Note that
38686 // this shuffle's mask will be the first applied to the input, followed by
38687 // the root mask to get us all the way to the root value arrangement. The
38688 // reason for this order is that we are recursing up the operation chain.
38689 for (unsigned i = 0; i < MaskWidth; ++i) {
38690 unsigned RootIdx = i >> RootRatioLog2;
38691 if (RootMask[RootIdx] < 0) {
38692 // This is a zero or undef lane, we're done.
38693 Mask[i] = RootMask[RootIdx];
38694 continue;
38695 }
38696
38697 unsigned RootMaskedIdx =
38698 RootRatio == 1
38699 ? RootMask[RootIdx]
38700 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
38701
38702 // Just insert the scaled root mask value if it references an input other
38703 // than the SrcOp we're currently inserting.
38704 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
38705 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
38706 Mask[i] = RootMaskedIdx;
38707 continue;
38708 }
38709
38710 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
38711 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
38712 if (OpMask[OpIdx] < 0) {
38713 // The incoming lanes are zero or undef; it doesn't matter which ones we
38714 // are using.
38715 Mask[i] = OpMask[OpIdx];
38716 continue;
38717 }
38718
38719 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
38720 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
38721 : (OpMask[OpIdx] << OpRatioLog2) +
38722 (RootMaskedIdx & (OpRatio - 1));
38723
38724 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
38725 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
38726     assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
38727 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
38728
38729 Mask[i] = OpMaskedIdx;
38730 }
38731 }
38732
38733 // Remove unused/repeated shuffle source ops.
38734 resolveTargetShuffleInputsAndMask(Ops, Mask);
38735
38736 // Handle the all undef/zero/ones cases early.
38737 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
38738 return DAG.getUNDEF(RootVT);
38739 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
38740 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
38741 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
38742 none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
38743 return getOnesVector(RootVT, DAG, SDLoc(Root));
38744
38745 assert(!Ops.empty() && "Shuffle with no inputs detected");
38746 HasVariableMask |= IsOpVariableMask;
38747
38748 // Update the list of shuffle nodes that have been combined so far.
38749 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
38750 SrcNodes.end());
38751 CombinedNodes.push_back(Op.getNode());
38752
38753 // See if we can recurse into each shuffle source op (if it's a target
38754 // shuffle). The source op should only be generally combined if it either has
38755 // a single use (i.e. the current Op) or all its users have already been combined;
38756 // if not, we can still combine but should prevent generation of variable
38757 // shuffles to avoid constant pool bloat.
38758 // Don't recurse if we already have more source ops than we can combine in
38759 // the remaining recursion depth.
38760 if (Ops.size() < (MaxDepth - Depth)) {
38761 for (int i = 0, e = Ops.size(); i < e; ++i) {
38762 // For empty roots, we need to resolve zeroable elements before combining
38763 // them with other shuffles.
38764 SmallVector<int, 64> ResolvedMask = Mask;
38765 if (EmptyRoot)
38766 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
38767 bool AllowCrossLaneVar = false;
38768 bool AllowPerLaneVar = false;
38769 if (Ops[i].getNode()->hasOneUse() ||
38770 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
38771 AllowCrossLaneVar = AllowVariableCrossLaneMask;
38772 AllowPerLaneVar = AllowVariablePerLaneMask;
38773 }
38774 if (SDValue Res = combineX86ShufflesRecursively(
38775 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
38776 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
38777 Subtarget))
38778 return Res;
38779 }
38780 }
38781
38782 // Attempt to constant fold all of the constant source ops.
38783 if (SDValue Cst = combineX86ShufflesConstants(
38784 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
38785 return Cst;
38786
38787 // If constant fold failed and we only have constants - then we have
38788 // multiple uses by a single non-variable shuffle - just bail.
38789 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
38790 APInt UndefElts;
38791 SmallVector<APInt> RawBits;
38792 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38793 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
38794 RawBits);
38795 })) {
38796 return SDValue();
38797 }
38798
38799 // Canonicalize the combined shuffle mask chain with horizontal ops.
38800 // NOTE: This will update the Ops and Mask.
38801 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
38802 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
38803 return DAG.getBitcast(RootVT, HOp);
38804
38805 // Try to refine our inputs given our knowledge of target shuffle mask.
38806 for (auto I : enumerate(Ops)) {
38807 int OpIdx = I.index();
38808 SDValue &Op = I.value();
38809
38810 // What range of shuffle mask element values results in picking from Op?
38811 int Lo = OpIdx * Mask.size();
38812 int Hi = Lo + Mask.size();
38813
38814 // Which elements of Op do we demand, given the mask's granularity?
38815 APInt OpDemandedElts(Mask.size(), 0);
38816 for (int MaskElt : Mask) {
38817 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
38818 int OpEltIdx = MaskElt - Lo;
38819 OpDemandedElts.setBit(OpEltIdx);
38820 }
38821 }
38822
38823 // Is the shuffle result smaller than the root?
38824 if (Op.getValueSizeInBits() < RootSizeInBits) {
38825 // We padded the mask with undefs. But we now need to undo that.
38826 unsigned NumExpectedVectorElts = Mask.size();
38827 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
38828 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
38829       assert(!OpDemandedElts.extractBits(
38830                  NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
38831              "Demanding the virtual undef widening padding?");
38832 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
38833 }
38834
38835 // The Op itself may be of different VT, so we need to scale the mask.
38836 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
38837 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
38838
38839 // Can this operand be simplified any further, given its demanded elements?
38840 if (SDValue NewOp =
38841 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
38842 Op, OpScaledDemandedElts, DAG))
38843 Op = NewOp;
38844 }
38845 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
38846
38847 // Widen any subvector shuffle inputs we've collected.
38848 // TODO: Remove this to avoid generating temporary nodes, we should only
38849 // widen once combineX86ShuffleChain has found a match.
38850 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
38851 return Op.getValueSizeInBits() < RootSizeInBits;
38852 })) {
38853 for (SDValue &Op : Ops)
38854 if (Op.getValueSizeInBits() < RootSizeInBits)
38855 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
38856 RootSizeInBits);
38857 // Reresolve - we might have repeated subvector sources.
38858 resolveTargetShuffleInputsAndMask(Ops, Mask);
38859 }
38860
38861 // We can only combine unary and binary shuffle mask cases.
38862 if (Ops.size() <= 2) {
38863 // Minor canonicalization of the accumulated shuffle mask to make it easier
38864 // to match below. All this does is detect masks with sequential pairs of
38865 // elements, and shrink them to the half-width mask. It does this in a loop
38866 // so it will reduce the size of the mask to the minimal width mask which
38867 // performs an equivalent shuffle.
38868 while (Mask.size() > 1) {
38869 SmallVector<int, 64> WidenedMask;
38870 if (!canWidenShuffleElements(Mask, WidenedMask))
38871 break;
38872 Mask = std::move(WidenedMask);
38873 }
38874
38875 // Canonicalization of binary shuffle masks to improve pattern matching by
38876 // commuting the inputs.
38877 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
38878 ShuffleVectorSDNode::commuteMask(Mask);
38879 std::swap(Ops[0], Ops[1]);
38880 }
38881
38882 // Finally, try to combine into a single shuffle instruction.
38883 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
38884 AllowVariableCrossLaneMask,
38885 AllowVariablePerLaneMask, DAG, Subtarget);
38886 }
38887
38888 // If that failed and any input is extracted then try to combine as a
38889 // shuffle with the larger type.
38890 return combineX86ShuffleChainWithExtract(
38891 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
38892 AllowVariablePerLaneMask, DAG, Subtarget);
38893}
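// Illustrative sketch (simplified standalone model, not part of this file):
// the heart of the recursion above is merging the accumulated root mask with
// the mask of the op being peeled, after scaling both to a common width. For
// the single-input case with power-of-two mask sizes, the index arithmetic
// reduces to:
#include <algorithm>
#include <vector>

static std::vector<int> mergeShuffleMasks(const std::vector<int> &RootMask,
                                          const std::vector<int> &OpMask) {
  const size_t Width = std::max(RootMask.size(), OpMask.size());
  const size_t RootRatio = Width / RootMask.size(); // result lanes per root elt
  const size_t OpRatio = Width / OpMask.size();     // result lanes per op elt
  std::vector<int> Merged(Width, -1);
  for (size_t i = 0; i != Width; ++i) {
    int R = RootMask[i / RootRatio];
    if (R < 0) { Merged[i] = R; continue; } // undef/zero sentinel wins
    size_t Scaled = size_t(R) * RootRatio + (i % RootRatio);
    int O = OpMask[Scaled / OpRatio];
    if (O < 0) { Merged[i] = O; continue; } // op lane is undef/zero
    Merged[i] = int(size_t(O) * OpRatio + (Scaled % OpRatio));
  }
  return Merged;
}
// e.g. a 2-element root mask {1,0} over a 4-element op mask {2,3,0,1} merges
// to {0,1,2,3}: applying the op's shuffle and then the root shuffle is the
// same as one shuffle by the merged mask.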
38894
38895/// Helper entry wrapper to combineX86ShufflesRecursively.
38896static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
38897 const X86Subtarget &Subtarget) {
38898 return combineX86ShufflesRecursively(
38899 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
38900 /*HasVarMask*/ false,
38901 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
38902 Subtarget);
38903}
38904
38905/// Get the PSHUF-style mask from PSHUF node.
38906///
38907/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
38908/// PSHUF-style masks that can be reused with such instructions.
38909static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
38910 MVT VT = N.getSimpleValueType();
38911 SmallVector<int, 4> Mask;
38912 SmallVector<SDValue, 2> Ops;
38913 bool HaveMask =
38914 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
38915 (void)HaveMask;
38916 assert(HaveMask);
38917
38918 // If we have more than 128-bits, only the low 128-bits of shuffle mask
38919 // matter. Check that the upper masks are repeats and remove them.
38920 if (VT.getSizeInBits() > 128) {
38921 int LaneElts = 128 / VT.getScalarSizeInBits();
38922#ifndef NDEBUG
38923 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
38924 for (int j = 0; j < LaneElts; ++j)
38925         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
38926                "Mask doesn't repeat in high 128-bit lanes!");
38927#endif
38928 Mask.resize(LaneElts);
38929 }
38930
38931 switch (N.getOpcode()) {
38932 case X86ISD::PSHUFD:
38933 return Mask;
38934 case X86ISD::PSHUFLW:
38935 Mask.resize(4);
38936 return Mask;
38937 case X86ISD::PSHUFHW:
38938 Mask.erase(Mask.begin(), Mask.begin() + 4);
38939 for (int &M : Mask)
38940 M -= 4;
38941 return Mask;
38942 default:
38943     llvm_unreachable("No valid shuffle instruction found!");
38944 }
38945}
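// Illustrative sketch (not part of this file): PSHUFD/PSHUFLW/PSHUFHW style
// masks like the ones returned above are ultimately re-encoded into an 8-bit
// immediate with two bits per output lane (see the calls to
// getV4X86ShuffleImm8ForMask elsewhere in this listing). A minimal encoder,
// treating undef lanes (-1) as 0:
#include <array>
#include <cstdint>

static uint8_t encodeV4ShuffleImm(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // undef lane: any selector works
    Imm |= uint8_t((M & 0x3) << (2 * i));
  }
  return Imm;
}
// e.g. the identity mask {0,1,2,3} encodes to 0xE4 and the reversal {3,2,1,0}
// encodes to 0x1B.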
38946
38947/// Search for a combinable shuffle across a chain ending in pshufd.
38948///
38949/// We walk up the chain and look for a combinable shuffle, skipping over
38950/// shuffles that we could hoist this shuffle's transformation past without
38951/// altering anything.
38952static SDValue
38953combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
38954 SelectionDAG &DAG) {
38955   assert(N.getOpcode() == X86ISD::PSHUFD &&
38956          "Called with something other than an x86 128-bit half shuffle!");
38957 SDLoc DL(N);
38958
38959 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
38960 // of the shuffles in the chain so that we can form a fresh chain to replace
38961 // this one.
38962 SmallVector<SDValue, 8> Chain;
38963 SDValue V = N.getOperand(0);
38964 for (; V.hasOneUse(); V = V.getOperand(0)) {
38965 switch (V.getOpcode()) {
38966 default:
38967 return SDValue(); // Nothing combined!
38968
38969 case ISD::BITCAST:
38970 // Skip bitcasts as we always know the type for the target specific
38971 // instructions.
38972 continue;
38973
38974 case X86ISD::PSHUFD:
38975 // Found another dword shuffle.
38976 break;
38977
38978 case X86ISD::PSHUFLW:
38979 // Check that the low words (being shuffled) are the identity in the
38980 // dword shuffle, and the high words are self-contained.
38981 if (Mask[0] != 0 || Mask[1] != 1 ||
38982 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
38983 return SDValue();
38984
38985 Chain.push_back(V);
38986 continue;
38987
38988 case X86ISD::PSHUFHW:
38989 // Check that the high words (being shuffled) are the identity in the
38990 // dword shuffle, and the low words are self-contained.
38991 if (Mask[2] != 2 || Mask[3] != 3 ||
38992 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
38993 return SDValue();
38994
38995 Chain.push_back(V);
38996 continue;
38997
38998 case X86ISD::UNPCKL:
38999 case X86ISD::UNPCKH:
39000 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39001 // shuffle into a preceding word shuffle.
39002 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39003 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39004 return SDValue();
39005
39006 // Search for a half-shuffle which we can combine with.
39007 unsigned CombineOp =
39008 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39009 if (V.getOperand(0) != V.getOperand(1) ||
39010 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39011 return SDValue();
39012 Chain.push_back(V);
39013 V = V.getOperand(0);
39014 do {
39015 switch (V.getOpcode()) {
39016 default:
39017 return SDValue(); // Nothing to combine.
39018
39019 case X86ISD::PSHUFLW:
39020 case X86ISD::PSHUFHW:
39021 if (V.getOpcode() == CombineOp)
39022 break;
39023
39024 Chain.push_back(V);
39025
39026         LLVM_FALLTHROUGH;
39027 case ISD::BITCAST:
39028 V = V.getOperand(0);
39029 continue;
39030 }
39031 break;
39032 } while (V.hasOneUse());
39033 break;
39034 }
39035 // Break out of the loop if we break out of the switch.
39036 break;
39037 }
39038
39039 if (!V.hasOneUse())
39040 // We fell out of the loop without finding a viable combining instruction.
39041 return SDValue();
39042
39043 // Merge this node's mask and our incoming mask.
39044 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39045 for (int &M : Mask)
39046 M = VMask[M];
39047 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39048 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39049
39050 // Rebuild the chain around this new shuffle.
39051 while (!Chain.empty()) {
39052 SDValue W = Chain.pop_back_val();
39053
39054 if (V.getValueType() != W.getOperand(0).getValueType())
39055 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39056
39057 switch (W.getOpcode()) {
39058 default:
39059       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39060
39061 case X86ISD::UNPCKL:
39062 case X86ISD::UNPCKH:
39063 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39064 break;
39065
39066 case X86ISD::PSHUFD:
39067 case X86ISD::PSHUFLW:
39068 case X86ISD::PSHUFHW:
39069 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39070 break;
39071 }
39072 }
39073 if (V.getValueType() != N.getValueType())
39074 V = DAG.getBitcast(N.getValueType(), V);
39075
39076 // Return the new chain to replace N.
39077 return V;
39078}
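// Illustrative sketch (not part of this file): the "M = VMask[M]" update above
// is plain shuffle-mask composition: shuffling by VMask first and by Mask
// second is the same as a single shuffle by the composed mask.
#include <array>

static std::array<int, 4> composeMasks(const std::array<int, 4> &Outer,
                                       const std::array<int, 4> &Inner) {
  std::array<int, 4> Composed{};
  for (unsigned i = 0; i != 4; ++i)
    Composed[i] = Inner[Outer[i]]; // lane i reads Inner's pick for Outer[i]
  return Composed;
}
// e.g. composeMasks({2,3,0,1}, {1,0,3,2}) == {3,2,1,0}.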
39079
39080// Attempt to commute shufps LHS loads:
39081// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
39082static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39083 SelectionDAG &DAG) {
39084 // TODO: Add vXf64 support.
39085 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39086 return SDValue();
39087
39088 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39089 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39090 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39091 return SDValue();
39092 SDValue N0 = V.getOperand(0);
39093 SDValue N1 = V.getOperand(1);
39094 unsigned Imm = V.getConstantOperandVal(2);
39095 const X86Subtarget &Subtarget =
39096 static_cast<const X86Subtarget &>(DAG.getSubtarget());
39097 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39098 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39099 return SDValue();
39100 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39101 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39102 DAG.getTargetConstant(Imm, DL, MVT::i8));
39103 };
39104
39105 switch (N.getOpcode()) {
39106 case X86ISD::VPERMILPI:
39107 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39108 unsigned Imm = N.getConstantOperandVal(1);
39109 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39110 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39111 }
39112 break;
39113 case X86ISD::SHUFP: {
39114 SDValue N0 = N.getOperand(0);
39115 SDValue N1 = N.getOperand(1);
39116 unsigned Imm = N.getConstantOperandVal(2);
39117 if (N0 == N1) {
39118 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
39119 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
39120 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39121 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
39122 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
39123 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
39124 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
39125 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
39126 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
39127 }
39128 break;
39129 }
39130 }
39131
39132 return SDValue();
39133}
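// Illustrative sketch (scalar model, not part of this file): SHUFPS builds its
// low two result lanes from the first operand and its high two from the
// second, each selected by a 2-bit field of the immediate. Commuting the
// operands therefore only needs the two nibbles of the immediate swapped, plus
// the half-swap fix-up applied to the enclosing shuffle's immediate above
// (the Imm ^ 0xAA / ^ 0x0A / ^ 0xA0 adjustments).
#include <array>
#include <cstdint>

using V4f = std::array<float, 4>;

static V4f shufps(const V4f &A, const V4f &B, uint8_t Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

static uint8_t swapNibbles(uint8_t Imm) {
  return uint8_t(((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4));
}
// shufps(B, A, swapNibbles(Imm)) yields shufps(A, B, Imm) with its two 64-bit
// halves exchanged, which the caller compensates for in the outer immediate.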
39134
39135// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
39136static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
39137 const SDLoc &DL) {
39138 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39139 EVT ShuffleVT = N.getValueType();
39140
39141 auto IsMergeableWithShuffle = [](SDValue Op) {
39142 // AllZeros/AllOnes constants are freely shuffled and will peek through
39143 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
39144 // merge with target shuffles if it has one use so shuffle combining is
39145 // likely to kick in.
39146 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
39147 ISD::isBuildVectorAllZeros(Op.getNode()) ||
39148 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
39149 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
39150 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
39151 };
39152 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
39153 // Ensure we only shuffle whole vector src elements, unless it's a logical
39154 // binop where we can more aggressively move shuffles from dst to src.
39155 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
39156 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
39157 };
39158
39159 unsigned Opc = N.getOpcode();
39160 switch (Opc) {
39161 // Unary and Unary+Permute Shuffles.
39162 case X86ISD::PSHUFB: {
39163 // Don't merge PSHUFB if it contains zero'd elements.
39164 SmallVector<int> Mask;
39165 SmallVector<SDValue> Ops;
39166 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
39167 Mask))
39168 break;
39169     LLVM_FALLTHROUGH;
39170 }
39171 case X86ISD::VBROADCAST:
39172 case X86ISD::MOVDDUP:
39173 case X86ISD::PSHUFD:
39174 case X86ISD::PSHUFHW:
39175 case X86ISD::PSHUFLW:
39176 case X86ISD::VPERMI:
39177 case X86ISD::VPERMILPI: {
39178 if (N.getOperand(0).getValueType() == ShuffleVT &&
39179 N->isOnlyUserOf(N.getOperand(0).getNode())) {
39180 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39181 unsigned SrcOpcode = N0.getOpcode();
39182 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
39183 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39184 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39185 if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
39186 SDValue LHS, RHS;
39187 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39188 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39189 if (N.getNumOperands() == 2) {
39190 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
39191 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
39192 } else {
39193 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
39194 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
39195 }
39196 EVT OpVT = N0.getValueType();
39197 return DAG.getBitcast(ShuffleVT,
39198 DAG.getNode(SrcOpcode, DL, OpVT,
39199 DAG.getBitcast(OpVT, LHS),
39200 DAG.getBitcast(OpVT, RHS)));
39201 }
39202 }
39203 }
39204 break;
39205 }
39206 // Binary and Binary+Permute Shuffles.
39207 case X86ISD::INSERTPS: {
39208 // Don't merge INSERTPS if it contains zero'd elements.
39209 unsigned InsertPSMask = N.getConstantOperandVal(2);
39210 unsigned ZeroMask = InsertPSMask & 0xF;
39211 if (ZeroMask != 0)
39212 break;
39213     LLVM_FALLTHROUGH;
39214 }
39215 case X86ISD::MOVSD:
39216 case X86ISD::MOVSS:
39217 case X86ISD::BLENDI:
39218 case X86ISD::SHUFP:
39219 case X86ISD::UNPCKH:
39220 case X86ISD::UNPCKL: {
39221 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
39222 N->isOnlyUserOf(N.getOperand(1).getNode())) {
39223 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39224 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
39225 unsigned SrcOpcode = N0.getOpcode();
39226 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
39227 IsSafeToMoveShuffle(N0, SrcOpcode) &&
39228 IsSafeToMoveShuffle(N1, SrcOpcode)) {
39229 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39230 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
39231 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39232 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
39233 // Ensure the total number of shuffles doesn't increase by folding this
39234 // shuffle through to the source ops.
39235 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
39236 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
39237 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
39238 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
39239 SDValue LHS, RHS;
39240 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39241 Op10 = DAG.getBitcast(ShuffleVT, Op10);
39242 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39243 Op11 = DAG.getBitcast(ShuffleVT, Op11);
39244 if (N.getNumOperands() == 3) {
39245 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
39246 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
39247 } else {
39248 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
39249 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
39250 }
39251 EVT OpVT = N0.getValueType();
39252 return DAG.getBitcast(ShuffleVT,
39253 DAG.getNode(SrcOpcode, DL, OpVT,
39254 DAG.getBitcast(OpVT, LHS),
39255 DAG.getBitcast(OpVT, RHS)));
39256 }
39257 }
39258 }
39259 break;
39260 }
39261 }
39262 return SDValue();
39263}
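// Illustrative sketch (scalar model, not part of this file): the fold above
// relies on shuffles commuting with lane-wise binary ops: shuffling each
// operand first and then applying the op lane by lane gives the same vector as
// applying the op and then shuffling its result.
#include <array>
#include <cstddef>

template <typename T, std::size_t N, typename BinOp>
std::array<T, N> laneWiseBinOp(const std::array<T, N> &X,
                               const std::array<T, N> &Y, BinOp Op) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = Op(X[i], Y[i]);
  return R;
}

template <typename T, std::size_t N>
std::array<T, N> applyShuffle(const std::array<T, N> &V,
                              const std::array<int, N> &Mask) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = V[Mask[i]];
  return R;
}
// For any mask M and lane-wise Op:
//   applyShuffle(laneWiseBinOp(X, Y, Op), M)
//     == laneWiseBinOp(applyShuffle(X, M), applyShuffle(Y, M), Op).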
39264
39265/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
39266static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
39267 SelectionDAG &DAG,
39268 const SDLoc &DL) {
39269   assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
39270
39271 MVT VT = V.getSimpleValueType();
39272 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
39273 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
39274 unsigned SrcOpc0 = Src0.getOpcode();
39275 unsigned SrcOpc1 = Src1.getOpcode();
39276 EVT SrcVT0 = Src0.getValueType();
39277 EVT SrcVT1 = Src1.getValueType();
39278
39279 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
39280 return SDValue();
39281
39282 switch (SrcOpc0) {
39283 case X86ISD::MOVDDUP: {
39284 SDValue LHS = Src0.getOperand(0);
39285 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39286 SDValue Res =
39287 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
39288 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
39289 return DAG.getBitcast(VT, Res);
39290 }
39291 case X86ISD::VPERMILPI:
39292 // TODO: Handle v4f64 permutes with different low/high lane masks.
39293 if (SrcVT0 == MVT::v4f64) {
39294 uint64_t Mask = Src0.getConstantOperandVal(1);
39295 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
39296 break;
39297 }
39298     LLVM_FALLTHROUGH;
39299 case X86ISD::VSHLI:
39300 case X86ISD::VSRLI:
39301 case X86ISD::VSRAI:
39302 case X86ISD::PSHUFD:
39303 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
39304 SDValue LHS = Src0.getOperand(0);
39305 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39306 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
39307 V.getOperand(2));
39308 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
39309 return DAG.getBitcast(VT, Res);
39310 }
39311 break;
39312 }
39313
39314 return SDValue();
39315}
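// Illustrative sketch (scalar model, not part of this file): the fold above
// pulls per-lane repeated ops through a VPERM2X128, which is valid because the
// lane shuffle only rearranges (or zeroes) whole 128-bit halves. Each nibble
// of the immediate selects one of the four source halves, with bit 3 zeroing
// that half instead; halves are modelled here as pairs of 64-bit values.
#include <array>
#include <cstdint>

using Half = std::array<uint64_t, 2>; // one 128-bit lane
using V256 = std::array<Half, 2>;     // two lanes

static V256 vperm2x128(const V256 &A, const V256 &B, uint8_t Imm) {
  const Half Lanes[4] = {A[0], A[1], B[0], B[1]};
  auto pick = [&](unsigned Nibble) -> Half {
    if (Nibble & 0x8)
      return Half{0, 0};        // zeroing bit set for this half
    return Lanes[Nibble & 0x3]; // select one of the four source halves
  };
  return {pick(Imm & 0xF), pick((Imm >> 4) & 0xF)};
}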
39316
39317/// Try to combine x86 target specific shuffles.
39318static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
39319 TargetLowering::DAGCombinerInfo &DCI,
39320 const X86Subtarget &Subtarget) {
39321 SDLoc DL(N);
39322 MVT VT = N.getSimpleValueType();
39323 SmallVector<int, 4> Mask;
39324 unsigned Opcode = N.getOpcode();
39325
39326 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
39327 return R;
39328
39329 // Handle specific target shuffles.
39330 switch (Opcode) {
39331 case X86ISD::MOVDDUP: {
39332 SDValue Src = N.getOperand(0);
39333 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
39334 if (VT == MVT::v2f64 && Src.hasOneUse() &&
39335 ISD::isNormalLoad(Src.getNode())) {
39336 LoadSDNode *LN = cast<LoadSDNode>(Src);
39337 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
39338 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
39339 DCI.CombineTo(N.getNode(), Movddup);
39340 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39341 DCI.recursivelyDeleteUnusedNodes(LN);
39342 return N; // Return N so it doesn't get rechecked!
39343 }
39344 }
39345
39346 return SDValue();
39347 }
39348 case X86ISD::VBROADCAST: {
39349 SDValue Src = N.getOperand(0);
39350 SDValue BC = peekThroughBitcasts(Src);
39351 EVT SrcVT = Src.getValueType();
39352 EVT BCVT = BC.getValueType();
39353
39354 // If broadcasting from another shuffle, attempt to simplify it.
39355 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
39356 if (isTargetShuffle(BC.getOpcode()) &&
39357 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
39358 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
39359 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
39360 SM_SentinelUndef);
39361 for (unsigned i = 0; i != Scale; ++i)
39362 DemandedMask[i] = i;
39363 if (SDValue Res = combineX86ShufflesRecursively(
39364 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
39365 X86::MaxShuffleCombineDepth,
39366 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
39367 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
39368 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39369 DAG.getBitcast(SrcVT, Res));
39370 }
39371
39372 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
39373 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
39374 if (Src.getOpcode() == ISD::BITCAST &&
39375 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
39376 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
39377 FixedVectorType::isValidElementType(
39378 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
39379 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
39380 VT.getVectorNumElements());
39381 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39382 }
39383
39384 // Reduce broadcast source vector to lowest 128-bits.
39385 if (SrcVT.getSizeInBits() > 128)
39386 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39387 extract128BitVector(Src, 0, DAG, DL));
39388
39389 // broadcast(scalar_to_vector(x)) -> broadcast(x).
39390 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
39391 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39392
39393 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
39394 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39395 isNullConstant(Src.getOperand(1)) &&
39396 DAG.getTargetLoweringInfo().isTypeLegal(
39397 Src.getOperand(0).getValueType()))
39398 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39399
39400 // Share broadcast with the longest vector and extract low subvector (free).
39401 // Ensure the same SDValue from the SDNode use is being used.
39402 for (SDNode *User : Src->uses())
39403 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
39404 Src == User->getOperand(0) &&
39405 User->getValueSizeInBits(0).getFixedSize() >
39406 VT.getFixedSizeInBits()) {
39407 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
39408 VT.getSizeInBits());
39409 }
39410
39411 // vbroadcast(scalarload X) -> vbroadcast_load X
39412 // For float loads, extract other uses of the scalar from the broadcast.
39413 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
39414 ISD::isNormalLoad(Src.getNode())) {
39415 LoadSDNode *LN = cast<LoadSDNode>(Src);
39416 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39417 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39418 SDValue BcastLd =
39419 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39420 LN->getMemoryVT(), LN->getMemOperand());
39421 // If the load value is used only by N, replace it via CombineTo N.
39422 bool NoReplaceExtract = Src.hasOneUse();
39423 DCI.CombineTo(N.getNode(), BcastLd);
39424 if (NoReplaceExtract) {
39425 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39426 DCI.recursivelyDeleteUnusedNodes(LN);
39427 } else {
39428 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
39429 DAG.getIntPtrConstant(0, DL));
39430 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
39431 }
39432 return N; // Return N so it doesn't get rechecked!
39433 }
39434
39435 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
39436 // i16. So shrink it ourselves if we can make a broadcast_load.
39437 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
39438 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
39439       assert(Subtarget.hasAVX2() && "Expected AVX2");
39440 SDValue TruncIn = Src.getOperand(0);
39441
39442 // If this is a truncate of a non-extending load we can just narrow it to
39443 // use a broadcast_load.
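// e.g. (vbroadcast (i16 trunc (i32 load p))) only needs the low 16 bits of
// the load, so it can become (vbroadcast_load<i16> p) directly
// (illustrative example; relies on x86 being little-endian).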
39444 if (ISD::isNormalLoad(TruncIn.getNode())) {
39445 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
39446 // Unless it's volatile or atomic.
39447 if (LN->isSimple()) {
39448 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39449 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39450 SDValue BcastLd = DAG.getMemIntrinsicNode(
39451 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39452 LN->getPointerInfo(), LN->getOriginalAlign(),
39453 LN->getMemOperand()->getFlags());
39454 DCI.CombineTo(N.getNode(), BcastLd);
39455 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39456 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39457 return N; // Return N so it doesn't get rechecked!
39458 }
39459 }
39460
39461 // If this is a truncate of an i16 extload, we can directly replace it.
39462 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
39463 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
39464 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
39465 if (LN->getMemoryVT().getSizeInBits() == 16) {
39466 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39467 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39468 SDValue BcastLd =
39469 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39470 LN->getMemoryVT(), LN->getMemOperand());
39471 DCI.CombineTo(N.getNode(), BcastLd);
39472 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39473 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39474 return N; // Return N so it doesn't get rechecked!
39475 }
39476 }
39477
39478 // If this is a truncate of a load that has been shifted right, we can
39479 // offset the pointer and use a narrower load.
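// e.g. (vbroadcast (i16 trunc (srl (i32 load p), 16))) can load the i16 at
// p+2 instead: ShiftAmt/8 gives the byte offset of the truncated field
// (illustrative example, little-endian layout assumed).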
39480 if (TruncIn.getOpcode() == ISD::SRL &&
39481 TruncIn.getOperand(0).hasOneUse() &&
39482 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
39483 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
39484 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
39485 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
39486 // Make sure the shift amount and the load size are divisible by 16.
39487 // Don't do this if the load is volatile or atomic.
39488 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
39489 LN->isSimple()) {
39490 unsigned Offset = ShiftAmt / 8;
39491 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39492 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
39493 TypeSize::Fixed(Offset), DL);
39494 SDValue Ops[] = { LN->getChain(), Ptr };
39495 SDValue BcastLd = DAG.getMemIntrinsicNode(
39496 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39497 LN->getPointerInfo().getWithOffset(Offset),
39498 LN->getOriginalAlign(),
39499 LN->getMemOperand()->getFlags());
39500 DCI.CombineTo(N.getNode(), BcastLd);
39501 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39502 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39503 return N; // Return N so it doesn't get rechecked!
39504 }
39505 }
39506 }
39507
39508 // vbroadcast(vzload X) -> vbroadcast_load X
39509 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
39510 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
39511 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
39512 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39513 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39514 SDValue BcastLd =
39515 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39516 LN->getMemoryVT(), LN->getMemOperand());
39517 DCI.CombineTo(N.getNode(), BcastLd);
39518 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39519 DCI.recursivelyDeleteUnusedNodes(LN);
39520 return N; // Return N so it doesn't get rechecked!
39521 }
39522 }
39523
39524 // vbroadcast(vector load X) -> vbroadcast_load
39525 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
39526 SrcVT == MVT::v4i32) &&
39527 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
39528 LoadSDNode *LN = cast<LoadSDNode>(Src);
39529 // Unless the load is volatile or atomic.
39530 if (LN->isSimple()) {
39531 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39532 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39533 SDValue BcastLd = DAG.getMemIntrinsicNode(
39534 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
39535 LN->getPointerInfo(), LN->getOriginalAlign(),
39536 LN->getMemOperand()->getFlags());
39537 DCI.CombineTo(N.getNode(), BcastLd);
39538 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39539 DCI.recursivelyDeleteUnusedNodes(LN);
39540 return N; // Return N so it doesn't get rechecked!
39541 }
39542 }
39543
39544 return SDValue();
39545 }
39546 case X86ISD::VZEXT_MOVL: {
39547 SDValue N0 = N.getOperand(0);
39548
39549 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
39550 // the load is volatile.
39551 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
39552 auto *LN = cast<LoadSDNode>(N0);
39553 if (SDValue VZLoad =
39554 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
39555 DCI.CombineTo(N.getNode(), VZLoad);
39556 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39557 DCI.recursivelyDeleteUnusedNodes(LN);
39558 return N;
39559 }
39560 }
39561
39562 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
39563 // and can just use a VZEXT_LOAD.
39564 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
39565 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
39566 auto *LN = cast<MemSDNode>(N0);
39567 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
39568 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39569 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39570 SDValue VZLoad =
39571 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
39572 LN->getMemoryVT(), LN->getMemOperand());
39573 DCI.CombineTo(N.getNode(), VZLoad);
39574 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39575 DCI.recursivelyDeleteUnusedNodes(LN);
39576 return N;
39577 }
39578 }
39579
39580 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
39581 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
39582 // if the upper bits of the i64 are zero.
39583 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39584 N0.getOperand(0).hasOneUse() &&
39585 N0.getOperand(0).getValueType() == MVT::i64) {
39586 SDValue In = N0.getOperand(0);
39587 APInt Mask = APInt::getHighBitsSet(64, 32);
39588 if (DAG.MaskedValueIsZero(In, Mask)) {
39589 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
39590 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
39591 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
39592 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
39593 return DAG.getBitcast(VT, Movl);
39594 }
39595 }
39596
39597 // Load a scalar integer constant directly to XMM instead of transferring an
39598 // immediate value from GPR.
39599 // vzext_movl (scalar_to_vector C) --> load [C,0...]
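// e.g. (v4i32 vzext_movl (scalar_to_vector (i32 42))) becomes a constant
// pool load of <i32 42, i32 0, i32 0, i32 0> (illustrative example).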
39600 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39601 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
39602 // Create a vector constant - scalar constant followed by zeros.
39603 EVT ScalarVT = N0.getOperand(0).getValueType();
39604 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
39605 unsigned NumElts = VT.getVectorNumElements();
39606 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
39607 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
39608 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
39609
39610 // Load the vector constant from constant pool.
39611 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
39612 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
39613 MachinePointerInfo MPI =
39614 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
39615 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
39616 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
39617 MachineMemOperand::MOLoad);
39618 }
39619 }
39620
39621 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
39622 // insert into a zero vector. This helps get VZEXT_MOVL closer to
39623 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
39624 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
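// e.g. (v8i32 vzext_movl (insert_subvector undef, (v4i32 X), 0)) becomes
// (insert_subvector (v8i32 zero), (v4i32 vzext_movl X), 0), so the movl now
// operates on the narrow 128-bit value (illustrative example).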
39625 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
39626 SDValue V = peekThroughOneUseBitcasts(N0);
39627
39628 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
39629 isNullConstant(V.getOperand(2))) {
39630 SDValue In = V.getOperand(1);
39631 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
39632 In.getValueSizeInBits() /
39633 VT.getScalarSizeInBits());
39634 In = DAG.getBitcast(SubVT, In);
39635 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
39636 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
39637 getZeroVector(VT, Subtarget, DAG, DL), Movl,
39638 V.getOperand(2));
39639 }
39640 }
39641
39642 return SDValue();
39643 }
39644 case X86ISD::BLENDI: {
39645 SDValue N0 = N.getOperand(0);
39646 SDValue N1 = N.getOperand(1);
39647
39648 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
39649 // TODO: Handle MVT::v16i16 repeated blend mask.
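// e.g. a v4f64 blend with imm 0b0101 whose operands are bitcasts from v8f32
// can instead blend at v8f32 with each mask bit repeated Scale = 64/32 = 2
// times, giving imm 0b00110011 (illustrative example).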
39650 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
39651 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
39652 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
39653 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
39654 SrcVT.getScalarSizeInBits() >= 32) {
39655 unsigned BlendMask = N.getConstantOperandVal(2);
39656 unsigned Size = VT.getVectorNumElements();
39657 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
39658 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
39659 return DAG.getBitcast(
39660 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
39661 N1.getOperand(0),
39662 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
39663 }
39664 }
39665 return SDValue();
39666 }
39667 case X86ISD::SHUFP: {
39668 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
39669 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
39670 // TODO: Support types other than v4f32.
39671 if (VT == MVT::v4f32) {
39672 bool Updated = false;
39673 SmallVector<int> Mask;
39674 SmallVector<SDValue> Ops;
39675 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
39676 Ops.size() == 2) {
39677 for (int i = 0; i != 2; ++i) {
39678 SmallVector<SDValue> SubOps;
39679 SmallVector<int> SubMask, SubScaledMask;
39680 SDValue Sub = peekThroughBitcasts(Ops[i]);
39681 // TODO: Scaling might be easier if we specify the demanded elts.
39682 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
39683 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
39684 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
39685 int Ofs = i * 2;
39686 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
39687 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
39688 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
39689 Updated = true;
39690 }
39691 }
39692 }
39693 if (Updated) {
39694 for (int &M : Mask)
39695 M %= 4;
39696 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39697 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
39698 }
39699 }
39700 return SDValue();
39701 }
39702 case X86ISD::VPERMI: {
39703 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
39704 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
39705 SDValue N0 = N.getOperand(0);
39706 SDValue N1 = N.getOperand(1);
39707 unsigned EltSizeInBits = VT.getScalarSizeInBits();
39708 if (N0.getOpcode() == ISD::BITCAST &&
39709 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
39710 SDValue Src = N0.getOperand(0);
39711 EVT SrcVT = Src.getValueType();
39712 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
39713 return DAG.getBitcast(VT, Res);
39714 }
39715 return SDValue();
39716 }
39717 case X86ISD::VPERM2X128: {
39718 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
39719 SDValue LHS = N->getOperand(0);
39720 SDValue RHS = N->getOperand(1);
39721 if (LHS.getOpcode() == ISD::BITCAST &&
39722 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
39723 EVT SrcVT = LHS.getOperand(0).getValueType();
39724 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
39725 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
39726 DAG.getBitcast(SrcVT, LHS),
39727 DAG.getBitcast(SrcVT, RHS),
39728 N->getOperand(2)));
39729 }
39730 }
39731
39732 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
39733 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
39734 return Res;
39735
39736 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
39737 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
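// The immediate selects each 128-bit half of the result from the four
// candidate halves {LHS.lo, LHS.hi, RHS.lo, RHS.hi} numbered 0-3 (bits 3 and
// 7 request a zeroed half, which the FindSubVector128 lambda below rejects
// as Idx > 3). e.g. vperm2x128(concat(X,Y), concat(Z,W), 0x31) --> concat(Y,W),
// since 0x1 picks LHS.hi and 0x3 picks RHS.hi (illustrative example).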
39738 auto FindSubVector128 = [&](unsigned Idx) {
39739 if (Idx > 3)
39740 return SDValue();
39741 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
39742 SmallVector<SDValue> SubOps;
39743 if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
39744 return SubOps[Idx & 1];
39745 unsigned NumElts = Src.getValueType().getVectorNumElements();
39746 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
39747 Src.getOperand(1).getValueSizeInBits() == 128 &&
39748 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
39749 return Src.getOperand(1);
39750 }
39751 return SDValue();
39752 };
39753 unsigned Imm = N.getConstantOperandVal(2);
39754 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
39755 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
39756 MVT SubVT = VT.getHalfNumVectorElementsVT();
39757 SubLo = DAG.getBitcast(SubVT, SubLo);
39758 SubHi = DAG.getBitcast(SubVT, SubHi);
39759 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
39760 }
39761 }
39762 return SDValue();
39763 }
39764 case X86ISD::PSHUFD:
39765 case X86ISD::PSHUFLW:
39766 case X86ISD::PSHUFHW:
39767 Mask = getPSHUFShuffleMask(N);
39768 assert(Mask.size() == 4);
39769 break;
39770 case X86ISD::MOVSD:
39771 case X86ISD::MOVSH:
39772 case X86ISD::MOVSS: {
39773 SDValue N0 = N.getOperand(0);
39774 SDValue N1 = N.getOperand(1);
39775
39776 // Canonicalize scalar FPOps:
39777 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
39778 // If commutable, allow OP(N1[0], N0[0]).
39779 unsigned Opcode1 = N1.getOpcode();
39780 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
39781 Opcode1 == ISD::FDIV) {
39782 SDValue N10 = N1.getOperand(0);
39783 SDValue N11 = N1.getOperand(1);
39784 if (N10 == N0 ||
39785 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
39786 if (N10 != N0)
39787 std::swap(N10, N11);
39788 MVT SVT = VT.getVectorElementType();
39789 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
39790 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
39791 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
39792 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
39793 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
39794 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
39795 }
39796 }
39797
39798 return SDValue();
39799 }
39800 case X86ISD::INSERTPS: {
39801 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
39802 SDValue Op0 = N.getOperand(0);
39803 SDValue Op1 = N.getOperand(1);
39804 unsigned InsertPSMask = N.getConstantOperandVal(2);
39805 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
39806 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
39807 unsigned ZeroMask = InsertPSMask & 0xF;
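// The imm8 layout is: bits[7:6] = source element index (CountS),
// bits[5:4] = destination element index (CountD), bits[3:0] = zero mask.
// e.g. imm 0x9D takes Op1[2], inserts it into lane 1 of Op0, and zeroes
// lanes 0, 2 and 3 (illustrative decoding, matching the fields above).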
39808
39809 // If we zero out all elements from Op0 then we don't need to reference it.
39810 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
39811 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
39812 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39813
39814 // If we zero out the element from Op1 then we don't need to reference it.
39815 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
39816 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39817 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39818
39819 // Attempt to merge insertps Op1 with an inner target shuffle node.
39820 SmallVector<int, 8> TargetMask1;
39821 SmallVector<SDValue, 2> Ops1;
39822 APInt KnownUndef1, KnownZero1;
39823 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
39824 KnownZero1)) {
39825 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
39826 // Zero/UNDEF insertion - zero out element and remove dependency.
39827 InsertPSMask |= (1u << DstIdx);
39828 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39829 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39830 }
39831 // Update insertps mask srcidx and reference the source input directly.
39832 int M = TargetMask1[SrcIdx];
39833 assert(0 <= M && M < 8 && "Shuffle index out of range");
39834 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
39835 Op1 = Ops1[M < 4 ? 0 : 1];
39836 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39837 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39838 }
39839
39840 // Attempt to merge insertps Op0 with an inner target shuffle node.
39841 SmallVector<int, 8> TargetMask0;
39842 SmallVector<SDValue, 2> Ops0;
39843 APInt KnownUndef0, KnownZero0;
39844 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
39845 KnownZero0)) {
39846 bool Updated = false;
39847 bool UseInput00 = false;
39848 bool UseInput01 = false;
39849 for (int i = 0; i != 4; ++i) {
39850 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
39851 // No change if element is already zero or the inserted element.
39852 continue;
39853 } else if (KnownUndef0[i] || KnownZero0[i]) {
39854 // If the target mask is undef/zero then we must zero the element.
39855 InsertPSMask |= (1u << i);
39856 Updated = true;
39857 continue;
39858 }
39859
39860 // The input vector element must be inline.
39861 int M = TargetMask0[i];
39862 if (M != i && M != (i + 4))
39863 return SDValue();
39864
39865 // Determine which inputs of the target shuffle we're using.
39866 UseInput00 |= (0 <= M && M < 4);
39867 UseInput01 |= (4 <= M);
39868 }
39869
39870 // If we're not using both inputs of the target shuffle then use the
39871 // referenced input directly.
39872 if (UseInput00 && !UseInput01) {
39873 Updated = true;
39874 Op0 = Ops0[0];
39875 } else if (!UseInput00 && UseInput01) {
39876 Updated = true;
39877 Op0 = Ops0[1];
39878 }
39879
39880 if (Updated)
39881 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39882 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39883 }
39884
39885 // If we're inserting an element from a vbroadcast load, fold the
39886 // load into the X86insertps instruction. We need to convert the scalar
39887 // load to a vector and clear the source lane of the INSERTPS control.
39888 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
39889 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
39890 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
39891 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
39892 MemIntr->getBasePtr(),
39893 MemIntr->getMemOperand());
39894 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
39895 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
39896 Load),
39897 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
39898 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
39899 return Insert;
39900 }
39901 }
39902
39903 return SDValue();
39904 }
39905 default:
39906 return SDValue();
39907 }
39908
39909 // Nuke no-op shuffles that show up after combining.
39910 if (isNoopShuffleMask(Mask))
39911 return N.getOperand(0);
39912
39913 // Look for simplifications involving one or two shuffle instructions.
39914 SDValue V = N.getOperand(0);
39915 switch (N.getOpcode()) {
39916 default:
39917 break;
39918 case X86ISD::PSHUFLW:
39919 case X86ISD::PSHUFHW:
39920 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
39921
39922 // See if this reduces to a PSHUFD which is no more expensive and can
39923 // combine with more operations. Note that it has to at least flip the
39924 // dwords as otherwise it would have been removed as a no-op.
39925 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
39926 int DMask[] = {0, 1, 2, 3};
39927 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
39928 DMask[DOffset + 0] = DOffset + 1;
39929 DMask[DOffset + 1] = DOffset + 0;
39930 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
39931 V = DAG.getBitcast(DVT, V);
39932 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
39933 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
39934 return DAG.getBitcast(VT, V);
39935 }
39936
39937 // Look for shuffle patterns which can be implemented as a single unpack.
39938 // FIXME: This doesn't handle the location of the PSHUFD generically, and
39939 // only works when we have a PSHUFD followed by two half-shuffles.
39940 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
39941 (V.getOpcode() == X86ISD::PSHUFLW ||
39942 V.getOpcode() == X86ISD::PSHUFHW) &&
39943 V.getOpcode() != N.getOpcode() &&
39944 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
39945 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
39946 if (D.getOpcode() == X86ISD::PSHUFD) {
39947 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39948 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
39949 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39950 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39951 int WordMask[8];
39952 for (int i = 0; i < 4; ++i) {
39953 WordMask[i + NOffset] = Mask[i] + NOffset;
39954 WordMask[i + VOffset] = VMask[i] + VOffset;
39955 }
39956 // Map the word mask through the DWord mask.
39957 int MappedMask[8];
39958 for (int i = 0; i < 8; ++i)
39959 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
39960 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
39961 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
39962 // We can replace all three shuffles with an unpack.
39963 V = DAG.getBitcast(VT, D.getOperand(0));
39964 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
39965 : X86ISD::UNPCKH,
39966 DL, VT, V, V);
39967 }
39968 }
39969 }
39970
39971 break;
39972
39973 case X86ISD::PSHUFD:
39974 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
39975 return NewN;
39976
39977 break;
39978 }
39979
39980 return SDValue();
39981}
39982
39983 /// Checks if the shuffle mask takes successive elements alternately from
39984 /// two vectors.
39985/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
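/// A mask such as <0, 1, 2, 7> is rejected because lanes of the same parity
/// must all come from the same source (here the odd lanes mix both inputs).
/// \p Op0Even reports whether operand 0 supplies the even lanes, e.g. it is
/// true for <0, 5, 2, 7> and false for <4, 1, 6, 3> (illustrative examples).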
39986static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
39987
39988 int ParitySrc[2] = {-1, -1};
39989 unsigned Size = Mask.size();
39990 for (unsigned i = 0; i != Size; ++i) {
39991 int M = Mask[i];
39992 if (M < 0)
39993 continue;
39994
39995 // Make sure we are using the matching element from the input.
39996 if ((M % Size) != i)
39997 return false;
39998
39999 // Make sure we use the same input for all elements of the same parity.
40000 int Src = M / Size;
40001 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
40002 return false;
40003 ParitySrc[i % 2] = Src;
40004 }
40005
40006 // Make sure each input is used.
40007 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
40008 return false;
40009
40010 Op0Even = ParitySrc[0] == 0;
40011 return true;
40012}
40013
40014 /// Returns true iff the shuffle node \p N can be replaced with an
40015 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
40016 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
40017 ///
40018 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
40019 /// nodes so it is easier to generically match. We also insert dummy vector shuffle
40020 /// nodes for the operands which explicitly discard the lanes which are unused
40021 /// by this operation, so that the fact that they're unused can flow through
40022 /// the rest of the combiner.
40023static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
40024 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
40025 bool &IsSubAdd) {
40026
40027 EVT VT = N->getValueType(0);
40028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40029 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
40030 !VT.getSimpleVT().isFloatingPoint())
40031 return false;
40032
40033 // We only handle target-independent shuffles.
40034 // FIXME: It would be easy and harmless to use the target shuffle mask
40035 // extraction tool to support more.
40036 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40037 return false;
40038
40039 SDValue V1 = N->getOperand(0);
40040 SDValue V2 = N->getOperand(1);
40041
40042 // Make sure we have an FADD and an FSUB.
40043 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
40044 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
40045 V1.getOpcode() == V2.getOpcode())
40046 return false;
40047
40048 // If there are other uses of these operations we can't fold them.
40049 if (!V1->hasOneUse() || !V2->hasOneUse())
40050 return false;
40051
40052 // Ensure that both operations have the same operands. Note that we can
40053 // commute the FADD operands.
40054 SDValue LHS, RHS;
40055 if (V1.getOpcode() == ISD::FSUB) {
40056 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
40057 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
40058 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
40059 return false;
40060 } else {
40061 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
40062 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
40063 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
40064 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
40065 return false;
40066 }
40067
40068 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40069 bool Op0Even;
40070 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40071 return false;
40072
40073 // It's a subadd if the vector in the even parity is an FADD.
40074 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
40075 : V2->getOpcode() == ISD::FADD;
40076
40077 Opnd0 = LHS;
40078 Opnd1 = RHS;
40079 return true;
40080}
40081
40082/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
40083static SDValue combineShuffleToFMAddSub(SDNode *N,
40084 const X86Subtarget &Subtarget,
40085 SelectionDAG &DAG) {
40086 // We only handle target-independent shuffles.
40087 // FIXME: It would be easy and harmless to use the target shuffle mask
40088 // extraction tool to support more.
40089 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40090 return SDValue();
40091
40092 MVT VT = N->getSimpleValueType(0);
40093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40094 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
40095 return SDValue();
40096
40097 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
40098 SDValue Op0 = N->getOperand(0);
40099 SDValue Op1 = N->getOperand(1);
40100 SDValue FMAdd = Op0, FMSub = Op1;
40101 if (FMSub.getOpcode() != X86ISD::FMSUB)
40102 std::swap(FMAdd, FMSub);
40103
40104 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
40105 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
40106 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
40107 FMAdd.getOperand(2) != FMSub.getOperand(2))
40108 return SDValue();
40109
40110 // Check for correct shuffle mask.
40111 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40112 bool Op0Even;
40113 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40114 return SDValue();
40115
40116 // FMAddSub takes the zeroth operand from the FMSub node.
40117 SDLoc DL(N);
40118 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
40119 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40120 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
40121 FMAdd.getOperand(2));
40122}
40123
40124/// Try to combine a shuffle into a target-specific add-sub or
40125/// mul-add-sub node.
40126static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
40127 const X86Subtarget &Subtarget,
40128 SelectionDAG &DAG) {
40129 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
40130 return V;
40131
40132 SDValue Opnd0, Opnd1;
40133 bool IsSubAdd;
40134 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
40135 return SDValue();
40136
40137 MVT VT = N->getSimpleValueType(0);
40138 SDLoc DL(N);
40139
40140 // Try to generate X86ISD::FMADDSUB node here.
40141 SDValue Opnd2;
40142 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
40143 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40144 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
40145 }
40146
40147 if (IsSubAdd)
40148 return SDValue();
40149
40150 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
40151 // the ADDSUB idiom has been successfully recognized. There are no known
40152 // X86 targets with 512-bit ADDSUB instructions!
40153 if (VT.is512BitVector())
40154 return SDValue();
40155
40156 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
40157 // the ADDSUB idiom has been successfully recognized. There are no known
40158 // X86 targets with FP16 ADDSUB instructions!
40159 if (VT.getVectorElementType() == MVT::f16)
40160 return SDValue();
40161
40162 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
40163}
40164
40165// We are looking for a shuffle where both sources are concatenated with undef
40166// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
40167// if we can express this as a single-source shuffle, that's preferable.
40168static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
40169 const X86Subtarget &Subtarget) {
40170 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
40171 return SDValue();
40172
40173 EVT VT = N->getValueType(0);
40174
40175 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
40176 if (!VT.is128BitVector() && !VT.is256BitVector())
40177 return SDValue();
40178
40179 if (VT.getVectorElementType() != MVT::i32 &&
40180 VT.getVectorElementType() != MVT::i64 &&
40181 VT.getVectorElementType() != MVT::f32 &&
40182 VT.getVectorElementType() != MVT::f64)
40183 return SDValue();
40184
40185 SDValue N0 = N->getOperand(0);
40186 SDValue N1 = N->getOperand(1);
40187
40188 // Check that both sources are concats with undef.
40189 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
40190 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
40191 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
40192 !N1.getOperand(1).isUndef())
40193 return SDValue();
40194
40195 // Construct the new shuffle mask. Elements from the first source retain their
40196 // index, but elements from the second source no longer need to skip an undef.
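// e.g. for v8i32, a mask element of 9 (the second element of
// concat(t2, undef)) becomes 9 - 8/2 = 5, which is the second element of t2
// inside the new concat(t1, t2) source (illustrative example).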
40197 SmallVector<int, 8> Mask;
40198 int NumElts = VT.getVectorNumElements();
40199
40200 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
40201 for (int Elt : SVOp->getMask())
40202 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
40203
40204 SDLoc DL(N);
40205 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
40206 N1.getOperand(0));
40207 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
40208}
40209
40210/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
40211/// low half of each source vector and does not set any high half elements in
40212/// the destination vector, narrow the shuffle to half its original size.
40213static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
40214 if (!Shuf->getValueType(0).isSimple())
40215 return SDValue();
40216 MVT VT = Shuf->getSimpleValueType(0);
40217 if (!VT.is256BitVector() && !VT.is512BitVector())
40218 return SDValue();
40219
40220 // See if we can ignore all of the high elements of the shuffle.
40221 ArrayRef<int> Mask = Shuf->getMask();
40222 if (!isUndefUpperHalf(Mask))
40223 return SDValue();
40224
40225 // Check if the shuffle mask accesses only the low half of each input vector
40226 // (half-index output is 0 or 2).
40227 int HalfIdx1, HalfIdx2;
40228 SmallVector<int, 8> HalfMask(Mask.size() / 2);
40229 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
40230 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
40231 return SDValue();
40232
40233 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
40234 // The trick is knowing that all of the insert/extract are actually free
40235 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
40236 // of narrow inputs into a narrow output, and that is always cheaper than
40237 // the wide shuffle that we started with.
40238 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
40239 Shuf->getOperand(1), HalfMask, HalfIdx1,
40240 HalfIdx2, false, DAG, /*UseConcat*/true);
40241}
40242
40243static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
40244 TargetLowering::DAGCombinerInfo &DCI,
40245 const X86Subtarget &Subtarget) {
40246 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
40247 if (SDValue V = narrowShuffle(Shuf, DAG))
40248 return V;
40249
40250 // If we have legalized the vector types, look for blends of FADD and FSUB
40251 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
40252 SDLoc dl(N);
40253 EVT VT = N->getValueType(0);
40254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40255 if (TLI.isTypeLegal(VT))
40256 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
40257 return AddSub;
40258
40259 // Attempt to combine into a vector load/broadcast.
40260 if (SDValue LD = combineToConsecutiveLoads(
40261 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
40262 return LD;
40263
40264 // For AVX2, we sometimes want to combine
40265 // (vector_shuffle <mask> (concat_vectors t1, undef)
40266 // (concat_vectors t2, undef))
40267 // Into:
40268 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
40269 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
40270 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
40271 return ShufConcat;
40272
40273 if (isTargetShuffle(N->getOpcode())) {
40274 SDValue Op(N, 0);
40275 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
40276 return Shuffle;
40277
40278 // Try recursively combining arbitrary sequences of x86 shuffle
40279 // instructions into higher-order shuffles. We do this after combining
40280 // specific PSHUF instruction sequences into their minimal form so that we
40281 // can evaluate how many specialized shuffle instructions are involved in
40282 // a particular chain.
40283 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40284 return Res;
40285
40286 // Simplify source operands based on shuffle mask.
40287 // TODO - merge this into combineX86ShufflesRecursively.
40288 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
40289 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
40290 return SDValue(N, 0);
40291
40292 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40293 // Perform this after other shuffle combines to allow inner shuffles to be
40294 // combined away first.
40295 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N)))
40296 return BinOp;
40297 }
40298
40299 return SDValue();
40300}
40301
40302// Simplify variable target shuffle masks based on the demanded elements.
40303// TODO: Handle DemandedBits in mask indices as well?
40304bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
40305 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
40306 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
40307 // If we're demanding all elements, don't bother trying to simplify the mask.
40308 unsigned NumElts = DemandedElts.getBitWidth();
40309 if (DemandedElts.isAllOnes())
40310 return false;
40311
40312 SDValue Mask = Op.getOperand(MaskIndex);
40313 if (!Mask.hasOneUse())
40314 return false;
40315
40316 // Attempt to generically simplify the variable shuffle mask.
40317 APInt MaskUndef, MaskZero;
40318 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
40319 Depth + 1))
40320 return true;
40321
40322 // Attempt to extract+simplify a (constant pool load) shuffle mask.
40323 // TODO: Support other types from getTargetShuffleMaskIndices?
40324 SDValue BC = peekThroughOneUseBitcasts(Mask);
40325 EVT BCVT = BC.getValueType();
40326 auto *Load = dyn_cast<LoadSDNode>(BC);
40327 if (!Load)
40328 return false;
40329
40330 const Constant *C = getTargetConstantFromNode(Load);
40331 if (!C)
40332 return false;
40333
40334 Type *CTy = C->getType();
40335 if (!CTy->isVectorTy() ||
40336 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
40337 return false;
40338
40339 // Handle scaling for i64 elements on 32-bit targets.
40340 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
40341 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
40342 return false;
40343 unsigned Scale = NumCstElts / NumElts;
40344
40345 // Simplify mask if we have an undemanded element that is not undef.
40346 bool Simplified = false;
40347 SmallVector<Constant *, 32> ConstVecOps;
40348 for (unsigned i = 0; i != NumCstElts; ++i) {
40349 Constant *Elt = C->getAggregateElement(i);
40350 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
40351 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
40352 Simplified = true;
40353 continue;
40354 }
40355 ConstVecOps.push_back(Elt);
40356 }
40357 if (!Simplified)
40358 return false;
40359
40360 // Generate new constant pool entry + legalize immediately for the load.
40361 SDLoc DL(Op);
40362 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
40363 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
40364 SDValue NewMask = TLO.DAG.getLoad(
40365 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
40366 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
40367 Load->getAlign());
40368 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
40369}
40370
40371bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
40372 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
40373 TargetLoweringOpt &TLO, unsigned Depth) const {
40374 int NumElts = DemandedElts.getBitWidth();
40375 unsigned Opc = Op.getOpcode();
40376 EVT VT = Op.getValueType();
40377
40378 // Handle special case opcodes.
40379 switch (Opc) {
40380 case X86ISD::PMULDQ:
40381 case X86ISD::PMULUDQ: {
40382 APInt LHSUndef, LHSZero;
40383 APInt RHSUndef, RHSZero;
40384 SDValue LHS = Op.getOperand(0);
40385 SDValue RHS = Op.getOperand(1);
40386 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40387 Depth + 1))
40388 return true;
40389 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40390 Depth + 1))
40391 return true;
40392 // Multiply by zero.
40393 KnownZero = LHSZero | RHSZero;
40394 break;
40395 }
40396 case X86ISD::VPMADDWD: {
40397 APInt LHSUndef, LHSZero;
40398 APInt RHSUndef, RHSZero;
40399 SDValue LHS = Op.getOperand(0);
40400 SDValue RHS = Op.getOperand(1);
40401 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
40402
40403 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
40404 Depth + 1))
40405 return true;
40406 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
40407 Depth + 1))
40408 return true;
40409
40410 // TODO: Multiply by zero.
40411
40412 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
40413 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
40414 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
40415 Depth + 1))
40416 return true;
40417 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
40418 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
40419 Depth + 1))
40420 return true;
40421 break;
40422 }
40423 case X86ISD::PSADBW: {
40424 SDValue LHS = Op.getOperand(0);
40425 SDValue RHS = Op.getOperand(1);
40426 assert(VT.getScalarType() == MVT::i64 &&
40427 LHS.getValueType() == RHS.getValueType() &&
40428 LHS.getValueType().getScalarType() == MVT::i8 &&
40429 "Unexpected PSADBW types");
40430
40431 // Aggressively peek through ops to get at the demanded elts.
40432 if (!DemandedElts.isAllOnes()) {
40433 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
40434 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
40435 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
40436 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40437 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
40438 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40439 if (NewLHS || NewRHS) {
40440 NewLHS = NewLHS ? NewLHS : LHS;
40441 NewRHS = NewRHS ? NewRHS : RHS;
40442 return TLO.CombineTo(
40443 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40444 }
40445 }
40446 break;
40447 }
40448 case X86ISD::VSHL:
40449 case X86ISD::VSRL:
40450 case X86ISD::VSRA: {
40451 // We only need the bottom 64-bits of the (128-bit) shift amount.
40452 SDValue Amt = Op.getOperand(1);
40453 MVT AmtVT = Amt.getSimpleValueType();
40454 assert(AmtVT.is128BitVector() && "Unexpected value type");
40455
40456 // If we reuse the shift amount just for SSE shift amounts then we know that
40457 // only the bottom 64-bits are ever used.
40458 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
40459 unsigned UseOpc = Use->getOpcode();
40460 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
40461 UseOpc == X86ISD::VSRA) &&
40462 Use->getOperand(0) != Amt;
40463 });
40464
40465 APInt AmtUndef, AmtZero;
40466 unsigned NumAmtElts = AmtVT.getVectorNumElements();
40467 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
40468 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
40469 Depth + 1, AssumeSingleUse))
40470 return true;
40471 LLVM_FALLTHROUGH;
40472 }
40473 case X86ISD::VSHLI:
40474 case X86ISD::VSRLI:
40475 case X86ISD::VSRAI: {
40476 SDValue Src = Op.getOperand(0);
40477 APInt SrcUndef;
40478 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
40479 Depth + 1))
40480 return true;
40481
40482 // Aggressively peek through ops to get at the demanded elts.
40483 if (!DemandedElts.isAllOnes())
40484 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40485 Src, DemandedElts, TLO.DAG, Depth + 1))
40486 return TLO.CombineTo(
40487 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
40488 break;
40489 }
40490 case X86ISD::VPSHA:
40491 case X86ISD::VPSHL:
40492 case X86ISD::VSHLV:
40493 case X86ISD::VSRLV:
40494 case X86ISD::VSRAV: {
40495 APInt LHSUndef, LHSZero;
40496 APInt RHSUndef, RHSZero;
40497 SDValue LHS = Op.getOperand(0);
40498 SDValue RHS = Op.getOperand(1);
40499 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40500 Depth + 1))
40501 return true;
40502 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40503 Depth + 1))
40504 return true;
40505 KnownZero = LHSZero;
40506 break;
40507 }
40508 case X86ISD::KSHIFTL: {
40509 SDValue Src = Op.getOperand(0);
40510 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40511 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40512 unsigned ShiftAmt = Amt->getZExtValue();
40513
40514 if (ShiftAmt == 0)
40515 return TLO.CombineTo(Op, Src);
40516
40517 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
40518 // single shift. We can do this if the bottom bits (which are shifted
40519 // out) are never demanded.
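// e.g. kshiftl(kshiftr(X, 6), 2) with the low 2 lanes not demanded folds to
// kshiftr(X, 4): Diff = 2 - 6 = -4, so the opcode flips to KSHIFTR with
// amount 4 (illustrative example).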
40520 if (Src.getOpcode() == X86ISD::KSHIFTR) {
40521 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
40522 unsigned C1 = Src.getConstantOperandVal(1);
40523 unsigned NewOpc = X86ISD::KSHIFTL;
40524 int Diff = ShiftAmt - C1;
40525 if (Diff < 0) {
40526 Diff = -Diff;
40527 NewOpc = X86ISD::KSHIFTR;
40528 }
40529
40530 SDLoc dl(Op);
40531 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40532 return TLO.CombineTo(
40533 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40534 }
40535 }
40536
40537 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
40538 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40539 Depth + 1))
40540 return true;
40541
40542 KnownUndef <<= ShiftAmt;
40543 KnownZero <<= ShiftAmt;
40544 KnownZero.setLowBits(ShiftAmt);
40545 break;
40546 }
40547 case X86ISD::KSHIFTR: {
40548 SDValue Src = Op.getOperand(0);
40549 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40550 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40551 unsigned ShiftAmt = Amt->getZExtValue();
40552
40553 if (ShiftAmt == 0)
40554 return TLO.CombineTo(Op, Src);
40555
40556 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
40557 // single shift. We can do this if the top bits (which are shifted
40558 // out) are never demanded.
40559 if (Src.getOpcode() == X86ISD::KSHIFTL) {
40560 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
40561 unsigned C1 = Src.getConstantOperandVal(1);
40562 unsigned NewOpc = X86ISD::KSHIFTR;
40563 int Diff = ShiftAmt - C1;
40564 if (Diff < 0) {
40565 Diff = -Diff;
40566 NewOpc = X86ISD::KSHIFTL;
40567 }
40568
40569 SDLoc dl(Op);
40570 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40571 return TLO.CombineTo(
40572 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40573 }
40574 }
40575
40576 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
40577 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40578 Depth + 1))
40579 return true;
40580
40581 KnownUndef.lshrInPlace(ShiftAmt);
40582 KnownZero.lshrInPlace(ShiftAmt);
40583 KnownZero.setHighBits(ShiftAmt);
40584 break;
40585 }
40586 case X86ISD::ANDNP: {
40587 // ANDNP = (~LHS & RHS);
40588 SDValue LHS = Op.getOperand(0);
40589 SDValue RHS = Op.getOperand(1);
40590
40591 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
40592 APInt UndefElts;
40593 SmallVector<APInt> EltBits;
40594 int NumElts = VT.getVectorNumElements();
40595 int EltSizeInBits = VT.getScalarSizeInBits();
40596 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
40597 APInt OpElts = DemandedElts;
40598 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40599 EltBits)) {
40600 OpBits.clearAllBits();
40601 OpElts.clearAllBits();
40602 for (int I = 0; I != NumElts; ++I)
40603 if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) ||
40604 (!Invert && !EltBits[I].isZero()))) {
40605 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
40606 OpElts.setBit(I);
40607 }
40608 }
40609 return std::make_pair(OpBits, OpElts);
40610 };
40611 std::pair<APInt, APInt> DemandLHS = GetDemandedMasks(RHS);
40612 std::pair<APInt, APInt> DemandRHS = GetDemandedMasks(LHS, true);
40613
40614 APInt LHSUndef, LHSZero;
40615 APInt RHSUndef, RHSZero;
40616 if (SimplifyDemandedVectorElts(LHS, DemandLHS.second, LHSUndef, LHSZero,
40617 TLO, Depth + 1))
40618 return true;
40619 if (SimplifyDemandedVectorElts(RHS, DemandRHS.second, RHSUndef, RHSZero,
40620 TLO, Depth + 1))
40621 return true;
40622
40623 if (!DemandedElts.isAllOnes()) {
40624 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
40625 LHS, DemandLHS.first, DemandLHS.second, TLO.DAG, Depth + 1);
40626 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
40627 RHS, DemandRHS.first, DemandRHS.second, TLO.DAG, Depth + 1);
40628 if (NewLHS || NewRHS) {
40629 NewLHS = NewLHS ? NewLHS : LHS;
40630 NewRHS = NewRHS ? NewRHS : RHS;
40631 return TLO.CombineTo(
40632 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40633 }
40634 }
40635 break;
40636 }
40637 case X86ISD::CVTSI2P:
40638 case X86ISD::CVTUI2P: {
40639 SDValue Src = Op.getOperand(0);
40640 MVT SrcVT = Src.getSimpleValueType();
40641 APInt SrcUndef, SrcZero;
40642 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40643 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40644 Depth + 1))
40645 return true;
40646 break;
40647 }
40648 case X86ISD::PACKSS:
40649 case X86ISD::PACKUS: {
40650 SDValue N0 = Op.getOperand(0);
40651 SDValue N1 = Op.getOperand(1);
40652
40653 APInt DemandedLHS, DemandedRHS;
40654 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40655
40656 APInt LHSUndef, LHSZero;
40657 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40658 Depth + 1))
40659 return true;
40660 APInt RHSUndef, RHSZero;
40661 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40662 Depth + 1))
40663 return true;
40664
40665 // TODO - pass on known zero/undef.
40666
40667 // Aggressively peek through ops to get at the demanded elts.
40668 // TODO - we should do this for all target/faux shuffles ops.
40669 if (!DemandedElts.isAllOnes()) {
40670 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40671 TLO.DAG, Depth + 1);
40672 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40673 TLO.DAG, Depth + 1);
40674 if (NewN0 || NewN1) {
40675 NewN0 = NewN0 ? NewN0 : N0;
40676 NewN1 = NewN1 ? NewN1 : N1;
40677 return TLO.CombineTo(Op,
40678 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40679 }
40680 }
40681 break;
40682 }
40683 case X86ISD::HADD:
40684 case X86ISD::HSUB:
40685 case X86ISD::FHADD:
40686 case X86ISD::FHSUB: {
40687 SDValue N0 = Op.getOperand(0);
40688 SDValue N1 = Op.getOperand(1);
40689
40690 APInt DemandedLHS, DemandedRHS;
40691 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40692
40693 APInt LHSUndef, LHSZero;
40694 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40695 Depth + 1))
40696 return true;
40697 APInt RHSUndef, RHSZero;
40698 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40699 Depth + 1))
40700 return true;
40701
40702 // TODO - pass on known zero/undef.
40703
40704 // Aggressively peek through ops to get at the demanded elts.
40705 // TODO: Handle repeated operands.
40706 if (N0 != N1 && !DemandedElts.isAllOnes()) {
40707 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40708 TLO.DAG, Depth + 1);
40709 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40710 TLO.DAG, Depth + 1);
40711 if (NewN0 || NewN1) {
40712 NewN0 = NewN0 ? NewN0 : N0;
40713 NewN1 = NewN1 ? NewN1 : N1;
40714 return TLO.CombineTo(Op,
40715 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40716 }
40717 }
40718 break;
40719 }
40720 case X86ISD::VTRUNC:
40721 case X86ISD::VTRUNCS:
40722 case X86ISD::VTRUNCUS: {
40723 SDValue Src = Op.getOperand(0);
40724 MVT SrcVT = Src.getSimpleValueType();
40725 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40726 APInt SrcUndef, SrcZero;
40727 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
40728 Depth + 1))
40729 return true;
40730 KnownZero = SrcZero.zextOrTrunc(NumElts);
40731 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
40732 break;
40733 }
40734 case X86ISD::BLENDV: {
40735 APInt SelUndef, SelZero;
40736 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
40737 SelZero, TLO, Depth + 1))
40738 return true;
40739
40740 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
40741 APInt LHSUndef, LHSZero;
40742 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
40743 LHSZero, TLO, Depth + 1))
40744 return true;
40745
40746 APInt RHSUndef, RHSZero;
40747 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
40748 RHSZero, TLO, Depth + 1))
40749 return true;
40750
40751 KnownZero = LHSZero & RHSZero;
40752 KnownUndef = LHSUndef & RHSUndef;
40753 break;
40754 }
40755 case X86ISD::VZEXT_MOVL: {
40756 // If upper demanded elements are already zero then we have nothing to do.
40757 SDValue Src = Op.getOperand(0);
40758 APInt DemandedUpperElts = DemandedElts;
40759 DemandedUpperElts.clearLowBits(1);
40760 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
40761 return TLO.CombineTo(Op, Src);
40762 break;
40763 }
40764 case X86ISD::VBROADCAST: {
40765 SDValue Src = Op.getOperand(0);
40766 MVT SrcVT = Src.getSimpleValueType();
40767 if (!SrcVT.isVector())
40768 break;
40769 // Don't bother broadcasting if we just need the 0'th element.
40770 if (DemandedElts == 1) {
40771 if (Src.getValueType() != VT)
40772 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
40773 SDLoc(Op));
40774 return TLO.CombineTo(Op, Src);
40775 }
40776 APInt SrcUndef, SrcZero;
40777 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
40778 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40779 Depth + 1))
40780 return true;
40781 // Aggressively peek through src to get at the demanded elt.
40782 // TODO - we should do this for all target/faux shuffle ops.
40783 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40784 Src, SrcElts, TLO.DAG, Depth + 1))
40785 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
40786 break;
40787 }
40788 case X86ISD::VPERMV:
40789 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
40790 Depth))
40791 return true;
40792 break;
40793 case X86ISD::PSHUFB:
40794 case X86ISD::VPERMV3:
40795 case X86ISD::VPERMILPV:
40796 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
40797 Depth))
40798 return true;
40799 break;
40800 case X86ISD::VPPERM:
40801 case X86ISD::VPERMIL2:
40802 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
40803 Depth))
40804 return true;
40805 break;
40806 }
40807
40808 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
40809 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
40810 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
40811 if ((VT.is256BitVector() || VT.is512BitVector()) &&
40812 DemandedElts.lshr(NumElts / 2) == 0) {
40813 unsigned SizeInBits = VT.getSizeInBits();
40814 unsigned ExtSizeInBits = SizeInBits / 2;
40815
40816 // See if 512-bit ops only use the bottom 128-bits.
40817 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
40818 ExtSizeInBits = SizeInBits / 4;
40819
40820 switch (Opc) {
40821 // Scalar broadcast.
40822 case X86ISD::VBROADCAST: {
40823 SDLoc DL(Op);
40824 SDValue Src = Op.getOperand(0);
40825 if (Src.getValueSizeInBits() > ExtSizeInBits)
40826 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
40827 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40828 ExtSizeInBits / VT.getScalarSizeInBits());
40829 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
40830 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40831 TLO.DAG, DL, ExtSizeInBits));
40832 }
40833 case X86ISD::VBROADCAST_LOAD: {
40834 SDLoc DL(Op);
40835 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40836 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40837 ExtSizeInBits / VT.getScalarSizeInBits());
40838 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
40839 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
40840 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
40841 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
40842 MemIntr->getMemOperand());
40843 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40844 Bcst.getValue(1));
40845 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40846 TLO.DAG, DL, ExtSizeInBits));
40847 }
40848 // Subvector broadcast.
40849 case X86ISD::SUBV_BROADCAST_LOAD: {
40850 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40851 EVT MemVT = MemIntr->getMemoryVT();
40852 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
40853 SDLoc DL(Op);
40854 SDValue Ld =
40855 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
40856 MemIntr->getBasePtr(), MemIntr->getMemOperand());
40857 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40858 Ld.getValue(1));
40859 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
40860 TLO.DAG, DL, ExtSizeInBits));
40861 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
40862 SDLoc DL(Op);
40863 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40864 ExtSizeInBits / VT.getScalarSizeInBits());
40865 if (SDValue BcstLd =
40866 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
40867 return TLO.CombineTo(Op,
40868 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
40869 TLO.DAG, DL, ExtSizeInBits));
40870 }
40871 break;
40872 }
40873 // Byte shifts by immediate.
40874 case X86ISD::VSHLDQ:
40875 case X86ISD::VSRLDQ:
40876 // Shift by uniform.
40877 case X86ISD::VSHL:
40878 case X86ISD::VSRL:
40879 case X86ISD::VSRA:
40880 // Shift by immediate.
40881 case X86ISD::VSHLI:
40882 case X86ISD::VSRLI:
40883 case X86ISD::VSRAI: {
40884 SDLoc DL(Op);
40885 SDValue Ext0 =
40886 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
40887 SDValue ExtOp =
40888 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
40889 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40890 SDValue Insert =
40891 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40892 return TLO.CombineTo(Op, Insert);
40893 }
40894 case X86ISD::VPERMI: {
40895 // Simplify PERMPD/PERMQ to extract_subvector.
40896 // TODO: This should be done in shuffle combining.
40897 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
40898 SmallVector<int, 4> Mask;
40899 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
40900 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
40901 SDLoc DL(Op);
40902 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
40903 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40904 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
40905 return TLO.CombineTo(Op, Insert);
40906 }
40907 }
40908 break;
40909 }
40910 case X86ISD::VPERM2X128: {
40911 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
40912 SDLoc DL(Op);
40913 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
40914 if (LoMask & 0x8)
40915 return TLO.CombineTo(
40916 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
40917 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
40918 unsigned SrcIdx = (LoMask & 0x2) >> 1;
40919 SDValue ExtOp =
40920 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
40921 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40922 SDValue Insert =
40923 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40924 return TLO.CombineTo(Op, Insert);
40925 }
40926 // Zero upper elements.
40927 case X86ISD::VZEXT_MOVL:
40928 // Target unary shuffles by immediate:
40929 case X86ISD::PSHUFD:
40930 case X86ISD::PSHUFLW:
40931 case X86ISD::PSHUFHW:
40932 case X86ISD::VPERMILPI:
40933 // (Non-Lane Crossing) Target Shuffles.
40934 case X86ISD::VPERMILPV:
40935 case X86ISD::VPERMIL2:
40936 case X86ISD::PSHUFB:
40937 case X86ISD::UNPCKL:
40938 case X86ISD::UNPCKH:
40939 case X86ISD::BLENDI:
40940 // Integer ops.
40941 case X86ISD::PACKSS:
40942 case X86ISD::PACKUS:
40943 // Horizontal Ops.
40944 case X86ISD::HADD:
40945 case X86ISD::HSUB:
40946 case X86ISD::FHADD:
40947 case X86ISD::FHSUB: {
40948 SDLoc DL(Op);
40949 SmallVector<SDValue, 4> Ops;
40950 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
40951 SDValue SrcOp = Op.getOperand(i);
40952 EVT SrcVT = SrcOp.getValueType();
40953 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
40954        "Unsupported vector size");
40955 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
40956 ExtSizeInBits)
40957 : SrcOp);
40958 }
40959 MVT ExtVT = VT.getSimpleVT();
40960 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
40961 ExtSizeInBits / ExtVT.getScalarSizeInBits());
40962 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
40963 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40964 SDValue Insert =
40965 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40966 return TLO.CombineTo(Op, Insert);
40967 }
40968 }
40969 }
40970
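The narrowing block above only fires when nothing above the low half (or low quarter, for 512-bit ops) is demanded. As a minimal standalone C++ sketch of that test, treating the demanded-elements set as a plain bitmask (onlyLowHalfDemanded is a made-up name, not an LLVM API):

#include <cassert>
#include <cstdint>

// Treat the demanded-elements set as a bitmask: shifting out the low half
// must leave nothing behind for the narrowing to be legal.
static bool onlyLowHalfDemanded(uint32_t DemandedElts, unsigned NumElts) {
  return (DemandedElts >> (NumElts / 2)) == 0;
}

int main() {
  // 8 x i32 in a ymm register: demanding elements 0 and 3 allows narrowing
  // the op to the low xmm half; demanding element 5 does not.
  assert(onlyLowHalfDemanded(0b00001001, 8));
  assert(!onlyLowHalfDemanded(0b00100000, 8));
  return 0;
}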
40971 // For broadcasts, unless we *only* demand the 0'th element,
40972 // stop attempts at simplification here; we aren't going to improve things,
40973 // and this is better than any potential shuffle.
40974 if (isTargetShuffleSplat(Op) && !DemandedElts.isOne())
40975 return false;
40976
40977 // Get target/faux shuffle mask.
40978 APInt OpUndef, OpZero;
40979 SmallVector<int, 64> OpMask;
40980 SmallVector<SDValue, 2> OpInputs;
40981 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
40982 OpZero, TLO.DAG, Depth, false))
40983 return false;
40984
40985 // Shuffle inputs must be the same size as the result.
40986 if (OpMask.size() != (unsigned)NumElts ||
40987 llvm::any_of(OpInputs, [VT](SDValue V) {
40988 return VT.getSizeInBits() != V.getValueSizeInBits() ||
40989 !V.getValueType().isVector();
40990 }))
40991 return false;
40992
40993 KnownZero = OpZero;
40994 KnownUndef = OpUndef;
40995
40996 // Check if shuffle mask can be simplified to undef/zero/identity.
40997 int NumSrcs = OpInputs.size();
40998 for (int i = 0; i != NumElts; ++i)
40999 if (!DemandedElts[i])
41000 OpMask[i] = SM_SentinelUndef;
41001
41002 if (isUndefInRange(OpMask, 0, NumElts)) {
41003 KnownUndef.setAllBits();
41004 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
41005 }
41006 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
41007 KnownZero.setAllBits();
41008 return TLO.CombineTo(
41009 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41010 }
41011 for (int Src = 0; Src != NumSrcs; ++Src)
41012 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
41013 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
41014
41015 // Attempt to simplify inputs.
41016 for (int Src = 0; Src != NumSrcs; ++Src) {
41017 // TODO: Support inputs of different types.
41018 if (OpInputs[Src].getValueType() != VT)
41019 continue;
41020
41021 int Lo = Src * NumElts;
41022 APInt SrcElts = APInt::getZero(NumElts);
41023 for (int i = 0; i != NumElts; ++i)
41024 if (DemandedElts[i]) {
41025 int M = OpMask[i] - Lo;
41026 if (0 <= M && M < NumElts)
41027 SrcElts.setBit(M);
41028 }
41029
41030 // TODO - Propagate input undef/zero elts.
41031 APInt SrcUndef, SrcZero;
41032 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
41033 TLO, Depth + 1))
41034 return true;
41035 }
41036
41037 // If we don't demand all elements, then attempt to combine to a simpler
41038 // shuffle.
41039 // We need to convert the depth to something combineX86ShufflesRecursively
41040 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
41041 // to match. This prevents combineX86ShuffleChain from returning a
41042 // combined shuffle that's the same as the original root, causing an
41043 // infinite loop.
41044 if (!DemandedElts.isAllOnes()) {
41045 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
41046
41047 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
41048 for (int i = 0; i != NumElts; ++i)
41049 if (DemandedElts[i])
41050 DemandedMask[i] = i;
41051
41052 SDValue NewShuffle = combineX86ShufflesRecursively(
41053 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
41054 /*HasVarMask*/ false,
41055 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
41056 Subtarget);
41057 if (NewShuffle)
41058 return TLO.CombineTo(Op, NewShuffle);
41059 }
41060
41061 return false;
41062}
41063
41064bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
41065 SDValue Op, const APInt &OriginalDemandedBits,
41066 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
41067 unsigned Depth) const {
41068 EVT VT = Op.getValueType();
41069 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
41070 unsigned Opc = Op.getOpcode();
41071 switch(Opc) {
41072 case X86ISD::VTRUNC: {
41073 KnownBits KnownOp;
41074 SDValue Src = Op.getOperand(0);
41075 MVT SrcVT = Src.getSimpleValueType();
41076
41077 // Simplify the input, using demanded bit information.
41078 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
41079 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
41080 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
41081 return true;
41082 break;
41083 }
41084 case X86ISD::PMULDQ:
41085 case X86ISD::PMULUDQ: {
41086 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
41087 KnownBits KnownOp;
41088 SDValue LHS = Op.getOperand(0);
41089 SDValue RHS = Op.getOperand(1);
41090 // FIXME: Can we bound this better?
41091 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
41092 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
41093 TLO, Depth + 1))
41094 return true;
41095 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
41096 TLO, Depth + 1))
41097 return true;
41098
41099 // Aggressively peek through ops to get at the demanded low bits.
41100 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
41101 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41102 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
41103 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41104 if (DemandedLHS || DemandedRHS) {
41105 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
41106 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
41107 return TLO.CombineTo(
41108 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
41109 }
41110 break;
41111 }
41112 case X86ISD::VSHLI: {
41113 SDValue Op0 = Op.getOperand(0);
41114
41115 unsigned ShAmt = Op.getConstantOperandVal(1);
41116 if (ShAmt >= BitWidth)
41117 break;
41118
41119 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
41120
41121 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41122 // single shift. We can do this if the bottom bits (which are shifted
41123 // out) are never demanded.
41124 if (Op0.getOpcode() == X86ISD::VSRLI &&
41125 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
41126 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
41127 if (Shift2Amt < BitWidth) {
41128 int Diff = ShAmt - Shift2Amt;
41129 if (Diff == 0)
41130 return TLO.CombineTo(Op, Op0.getOperand(0));
41131
41132 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
41133 SDValue NewShift = TLO.DAG.getNode(
41134 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
41135 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
41136 return TLO.CombineTo(Op, NewShift);
41137 }
41138 }
41139
41140 // If we are only demanding sign bits then we can use the shift source directly.
41141 unsigned NumSignBits =
41142 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
41143 unsigned UpperDemandedBits =
41144 BitWidth - OriginalDemandedBits.countTrailingZeros();
41145 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41146 return TLO.CombineTo(Op, Op0);
41147
41148 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41149 TLO, Depth + 1))
41150 return true;
41151
41152 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41153 Known.Zero <<= ShAmt;
41154 Known.One <<= ShAmt;
41155
41156 // Low bits known zero.
41157 Known.Zero.setLowBits(ShAmt);
41158 return false;
41159 }
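For the VSHLI-of-VSRLI fold above, the scalar rule it relies on is: when the low ShAmt bits are never demanded, ((X >>u C1) << C2) can be replaced by a single shift by |C2 - C1|. A self-contained C++ check of that rule on one 16-bit lane, as an illustrative sketch only (foldShiftPair is a made-up name):

#include <cassert>
#include <cstdint>

// When the low C2 bits of the result are not demanded, a logical right shift
// by C1 followed by a left shift by C2 collapses to one shift by |C2 - C1|.
static uint16_t foldShiftPair(uint16_t X, unsigned C1, unsigned C2) {
  int Diff = (int)C2 - (int)C1;
  if (Diff == 0)
    return X;                                    // the shifts cancel
  return Diff < 0 ? (uint16_t)(X >> -Diff) : (uint16_t)(X << Diff);
}

int main() {
  uint16_t X = 0xABCD;
  unsigned C1 = 3, C2 = 5;
  uint16_t DemandedBits = 0xFFE0;                // low C2 bits not demanded
  uint16_t TwoShifts = (uint16_t)((uint16_t)(X >> C1) << C2);
  assert((TwoShifts & DemandedBits) ==
         (foldShiftPair(X, C1, C2) & DemandedBits));
  return 0;
}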
41160 case X86ISD::VSRLI: {
41161 unsigned ShAmt = Op.getConstantOperandVal(1);
41162 if (ShAmt >= BitWidth)
41163 break;
41164
41165 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41166
41167 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
41168 OriginalDemandedElts, Known, TLO, Depth + 1))
41169 return true;
41170
41171 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41172 Known.Zero.lshrInPlace(ShAmt);
41173 Known.One.lshrInPlace(ShAmt);
41174
41175 // High bits known zero.
41176 Known.Zero.setHighBits(ShAmt);
41177 return false;
41178 }
41179 case X86ISD::VSRAI: {
41180 SDValue Op0 = Op.getOperand(0);
41181 SDValue Op1 = Op.getOperand(1);
41182
41183 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
41184 if (ShAmt >= BitWidth)
41185 break;
41186
41187 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41188
41189 // If we just want the sign bit then we don't need to shift it.
41190 if (OriginalDemandedBits.isSignMask())
41191 return TLO.CombineTo(Op, Op0);
41192
41193 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
41194 if (Op0.getOpcode() == X86ISD::VSHLI &&
41195 Op.getOperand(1) == Op0.getOperand(1)) {
41196 SDValue Op00 = Op0.getOperand(0);
41197 unsigned NumSignBits =
41198 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
41199 if (ShAmt < NumSignBits)
41200 return TLO.CombineTo(Op, Op00);
41201 }
41202
41203 // If any of the demanded bits are produced by the sign extension, we also
41204 // demand the input sign bit.
41205 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
41206 DemandedMask.setSignBit();
41207
41208 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41209 TLO, Depth + 1))
41210 return true;
41211
41212 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41213 Known.Zero.lshrInPlace(ShAmt);
41214 Known.One.lshrInPlace(ShAmt);
41215
41216 // If the input sign bit is known to be zero, or if none of the top bits
41217 // are demanded, turn this into an unsigned shift right.
41218 if (Known.Zero[BitWidth - ShAmt - 1] ||
41219 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
41220 return TLO.CombineTo(
41221 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
41222
41223 // High bits are known one.
41224 if (Known.One[BitWidth - ShAmt - 1])
41225 Known.One.setHighBits(ShAmt);
41226 return false;
41227 }
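The (VSRAI (VSHLI X, C1), C1) --> X fold above depends on X having more than C1 sign bits, so the left shift cannot push a different value into the sign position. A standalone C++ sanity check of that property on a single 16-bit lane (numSignBits16 is a hypothetical helper, not the DAG's ComputeNumSignBits):

#include <cassert>
#include <cstdint>

// Count how many of the top bits of a 16-bit value are copies of its sign bit
// (the sign bit itself counts, so the result is at least 1).
static unsigned numSignBits16(int16_t V) {
  int Sign = (V >> 15) & 1;
  unsigned N = 1;
  for (int Bit = 14; Bit >= 0 && ((V >> Bit) & 1) == Sign; --Bit)
    ++N;
  return N;
}

int main() {
  int16_t X = 5;                       // plenty of sign-bit copies up top
  unsigned C1 = 4;
  assert(numSignBits16(X) > C1);
  int16_t RoundTrip = (int16_t)((int16_t)(X << C1) >> C1);
  assert(RoundTrip == X);              // so (sra (shl X, C1), C1) == X
  return 0;
}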
41228 case X86ISD::BLENDV: {
41229 SDValue Sel = Op.getOperand(0);
41230 SDValue LHS = Op.getOperand(1);
41231 SDValue RHS = Op.getOperand(2);
41232
41233 APInt SignMask = APInt::getSignMask(BitWidth);
41234 SDValue NewSel = SimplifyMultipleUseDemandedBits(
41235 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41236 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
41237 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41238 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
41239 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41240
41241 if (NewSel || NewLHS || NewRHS) {
41242 NewSel = NewSel ? NewSel : Sel;
41243 NewLHS = NewLHS ? NewLHS : LHS;
41244 NewRHS = NewRHS ? NewRHS : RHS;
41245 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
41246 NewSel, NewLHS, NewRHS));
41247 }
41248 break;
41249 }
41250 case X86ISD::PEXTRB:
41251 case X86ISD::PEXTRW: {
41252 SDValue Vec = Op.getOperand(0);
41253 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
41254 MVT VecVT = Vec.getSimpleValueType();
41255 unsigned NumVecElts = VecVT.getVectorNumElements();
41256
41257 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
41258 unsigned Idx = CIdx->getZExtValue();
41259 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
41260
41261 // If we demand no bits from the vector then we must have demanded
41262 // bits from the implicit zext - simplify to zero.
41263 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
41264 if (DemandedVecBits == 0)
41265 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41266
41267 APInt KnownUndef, KnownZero;
41268 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
41269 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
41270 KnownZero, TLO, Depth + 1))
41271 return true;
41272
41273 KnownBits KnownVec;
41274 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
41275 KnownVec, TLO, Depth + 1))
41276 return true;
41277
41278 if (SDValue V = SimplifyMultipleUseDemandedBits(
41279 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
41280 return TLO.CombineTo(
41281 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
41282
41283 Known = KnownVec.zext(BitWidth);
41284 return false;
41285 }
41286 break;
41287 }
41288 case X86ISD::PINSRB:
41289 case X86ISD::PINSRW: {
41290 SDValue Vec = Op.getOperand(0);
41291 SDValue Scl = Op.getOperand(1);
41292 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41293 MVT VecVT = Vec.getSimpleValueType();
41294
41295 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
41296 unsigned Idx = CIdx->getZExtValue();
41297 if (!OriginalDemandedElts[Idx])
41298 return TLO.CombineTo(Op, Vec);
41299
41300 KnownBits KnownVec;
41301 APInt DemandedVecElts(OriginalDemandedElts);
41302 DemandedVecElts.clearBit(Idx);
41303 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
41304 KnownVec, TLO, Depth + 1))
41305 return true;
41306
41307 KnownBits KnownScl;
41308 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
41309 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
41310 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
41311 return true;
41312
41313 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
41314 Known = KnownBits::commonBits(KnownVec, KnownScl);
41315 return false;
41316 }
41317 break;
41318 }
41319 case X86ISD::PACKSS:
41320 // PACKSS saturates to MIN/MAX integer values. So if we just want the
41321 // sign bit then we can just ask for the source operands' sign bit.
41322 // TODO - add known bits handling.
41323 if (OriginalDemandedBits.isSignMask()) {
41324 APInt DemandedLHS, DemandedRHS;
41325 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
41326
41327 KnownBits KnownLHS, KnownRHS;
41328 APInt SignMask = APInt::getSignMask(BitWidth * 2);
41329 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
41330 KnownLHS, TLO, Depth + 1))
41331 return true;
41332 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
41333 KnownRHS, TLO, Depth + 1))
41334 return true;
41335
41336 // Attempt to avoid multi-use ops if we don't need anything from them.
41337 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
41338 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
41339 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
41340 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
41341 if (DemandedOp0 || DemandedOp1) {
41342 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
41343 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
41344 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
41345 }
41346 }
41347 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
41348 break;
41349 case X86ISD::VBROADCAST: {
41350 SDValue Src = Op.getOperand(0);
41351 MVT SrcVT = Src.getSimpleValueType();
41352 APInt DemandedElts = APInt::getOneBitSet(
41353 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
41354 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
41355 TLO, Depth + 1))
41356 return true;
41357 // If we don't need the upper bits, attempt to narrow the broadcast source.
41358 // Don't attempt this on AVX512 as it might affect broadcast folding.
41359 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
41360 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
41361 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
41362 Src->hasOneUse()) {
41363 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
41364 SDValue NewSrc =
41365 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
41366 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
41367 SDValue NewBcst =
41368 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
41369 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
41370 }
41371 break;
41372 }
41373 case X86ISD::PCMPGT:
41374 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41375 // iff we only need the sign bit then we can use R directly.
41376 if (OriginalDemandedBits.isSignMask() &&
41377 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41378 return TLO.CombineTo(Op, Op.getOperand(1));
41379 break;
41380 case X86ISD::MOVMSK: {
41381 SDValue Src = Op.getOperand(0);
41382 MVT SrcVT = Src.getSimpleValueType();
41383 unsigned SrcBits = SrcVT.getScalarSizeInBits();
41384 unsigned NumElts = SrcVT.getVectorNumElements();
41385
41386 // If we don't need the sign bits at all just return zero.
41387 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
41388 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41389
41390 // See if we only demand bits from the lower 128-bit vector.
41391 if (SrcVT.is256BitVector() &&
41392 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
41393 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
41394 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41395 }
41396
41397 // Only demand the vector elements of the sign bits we need.
41398 APInt KnownUndef, KnownZero;
41399 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
41400 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
41401 TLO, Depth + 1))
41402 return true;
41403
41404 Known.Zero = KnownZero.zextOrSelf(BitWidth);
41405 Known.Zero.setHighBits(BitWidth - NumElts);
41406
41407 // MOVMSK only uses the MSB from each vector element.
41408 KnownBits KnownSrc;
41409 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
41410 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
41411 Depth + 1))
41412 return true;
41413
41414 if (KnownSrc.One[SrcBits - 1])
41415 Known.One.setLowBits(NumElts);
41416 else if (KnownSrc.Zero[SrcBits - 1])
41417 Known.Zero.setLowBits(NumElts);
41418
41419 // Attempt to avoid a multi-use op if we don't need anything from it.
41420 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
41421 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
41422 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41423 return false;
41424 }
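MOVMSK packs the sign bit of each source element into the low bits of a scalar, which is why the case above only demands the per-element MSBs. A rough scalar model for the 4 x 32-bit flavour (movmsk4x32 is an illustrative name, not an LLVM helper):

#include <cassert>
#include <cstdint>

// Result bit i is the sign bit of lane i, so only the MSB of each element is
// ever demanded from the source vector.
static uint32_t movmsk4x32(const uint32_t Lanes[4]) {
  uint32_t Mask = 0;
  for (unsigned I = 0; I != 4; ++I)
    Mask |= (Lanes[I] >> 31) << I;
  return Mask;
}

int main() {
  const uint32_t Lanes[4] = {0x80000000u, 0x7FFFFFFFu, 0xFFFFFFFFu, 1u};
  assert(movmsk4x32(Lanes) == 0b0101);
  return 0;
}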
41425 case X86ISD::BEXTR:
41426 case X86ISD::BEXTRI: {
41427 SDValue Op0 = Op.getOperand(0);
41428 SDValue Op1 = Op.getOperand(1);
41429
41430 // Only bottom 16-bits of the control bits are required.
41431 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
41432 // NOTE: SimplifyDemandedBits won't do this for constants.
41433 uint64_t Val1 = Cst1->getZExtValue();
41434 uint64_t MaskedVal1 = Val1 & 0xFFFF;
41435 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
41436 SDLoc DL(Op);
41437 return TLO.CombineTo(
41438 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
41439 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
41440 }
41441
41442 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
41443 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
41444
41445 // If the length is 0, the result is 0.
41446 if (Length == 0) {
41447 Known.setAllZero();
41448 return false;
41449 }
41450
41451 if ((Shift + Length) <= BitWidth) {
41452 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
41453 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
41454 return true;
41455
41456 Known = Known.extractBits(Length, Shift);
41457 Known = Known.zextOrTrunc(BitWidth);
41458 return false;
41459 }
41460 } else {
41461 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
41462 KnownBits Known1;
41463 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
41464 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
41465 return true;
41466
41467 // If the length is 0, replace with 0.
41468 KnownBits LengthBits = Known1.extractBits(8, 8);
41469 if (LengthBits.isZero())
41470 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41471 }
41472
41473 break;
41474 }
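As context for the BEXTR handling above: the control operand encodes the start bit in bits [7:0] and the field length in bits [15:8], so only its low 16 bits matter. A simplified standalone C++ model of that semantic (bextr32 is a made-up name, and real-instruction edge cases are only approximated):

#include <cassert>
#include <cstdint>

// Bits [7:0] of the control are the start bit, bits [15:8] the field length;
// anything above bit 15 is ignored, matching the masking done above.
static uint32_t bextr32(uint32_t Src, uint32_t Control) {
  unsigned Shift = Control & 0xFF;
  unsigned Length = (Control >> 8) & 0xFF;
  if (Length == 0 || Shift >= 32)
    return 0;
  uint64_t FieldMask = (Length >= 32) ? ~0ULL : ((1ULL << Length) - 1);
  return (uint32_t)((Src >> Shift) & FieldMask);
}

int main() {
  unsigned Control = 4u | (8u << 8);             // start = 4, length = 8
  assert(bextr32(0xDEADBEEFu, Control) == 0xEE);
  // Only the low 16 control bits matter.
  assert(bextr32(0xDEADBEEFu, Control | 0xFFFF0000u) == 0xEE);
  return 0;
}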
41475 case X86ISD::PDEP: {
41476 SDValue Op0 = Op.getOperand(0);
41477 SDValue Op1 = Op.getOperand(1);
41478
41479 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
41480 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
41481
41482 // If the demanded bits have leading zeroes, we don't demand those from the
41483 // mask.
41484 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
41485 return true;
41486
41487 // The number of possible 1s in the mask determines the number of LSBs of
41488 // operand 0 used. Undemanded bits from the mask don't matter so filter
41489 // them before counting.
41490 KnownBits Known2;
41491 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
41492 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
41493 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
41494 return true;
41495
41496 // Zeroes are retained from the mask, but not ones.
41497 Known.One.clearAllBits();
41498 // The result will have at least as many trailing zeros as the non-mask
41499 // operand since bits can only map to the same or higher bit position.
41500 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
41501 return false;
41502 }
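PDEP scatters the low bits of its first operand into the set-bit positions of the mask, which is what bounds the demanded LSBs above by the mask's popcount. A small software model as a sketch (pdep64 is an illustrative name):

#include <cassert>
#include <cstdint>

// Deposit the low bits of Src into the set-bit positions of Mask, from the
// least significant mask bit upwards.
static uint64_t pdep64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0;
  while (Mask != 0) {
    uint64_t LowestMaskBit = Mask & (0 - Mask);  // isolate lowest set bit
    if (Src & 1)
      Result |= LowestMaskBit;
    Src >>= 1;
    Mask &= Mask - 1;                            // clear that mask bit
  }
  return Result;
}

int main() {
  uint64_t Mask = 0b10110000;                    // three set bits
  assert(pdep64(0b101, Mask) == 0b10010000);
  assert(pdep64(0b101 | 0xFF00, Mask) == 0b10010000);  // higher Src bits unused
  return 0;
}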
41503 }
41504
41505 return TargetLowering::SimplifyDemandedBitsForTargetNode(
41506 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
41507}
41508
41509SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41510 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
41511 SelectionDAG &DAG, unsigned Depth) const {
41512 int NumElts = DemandedElts.getBitWidth();
41513 unsigned Opc = Op.getOpcode();
41514 EVT VT = Op.getValueType();
41515
41516 switch (Opc) {
41517 case X86ISD::PINSRB:
41518 case X86ISD::PINSRW: {
41519 // If we don't demand the inserted element, return the base vector.
41520 SDValue Vec = Op.getOperand(0);
41521 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41522 MVT VecVT = Vec.getSimpleValueType();
41523 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
41524 !DemandedElts[CIdx->getZExtValue()])
41525 return Vec;
41526 break;
41527 }
41528 case X86ISD::VSHLI: {
41529 // If we are only demanding sign bits then we can use the shift source
41530 // directly.
41531 SDValue Op0 = Op.getOperand(0);
41532 unsigned ShAmt = Op.getConstantOperandVal(1);
41533 unsigned BitWidth = DemandedBits.getBitWidth();
41534 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
41535 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
41536 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41537 return Op0;
41538 break;
41539 }
41540 case X86ISD::VSRAI:
41541 // iff we only need the sign bit then we can use the source directly.
41542 // TODO: generalize where we only demand extended signbits.
41543 if (DemandedBits.isSignMask())
41544 return Op.getOperand(0);
41545 break;
41546 case X86ISD::PCMPGT:
41547 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41548 // iff we only need the sign bit then we can use R directly.
41549 if (DemandedBits.isSignMask() &&
41550 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41551 return Op.getOperand(1);
41552 break;
41553 }
41554
41555 APInt ShuffleUndef, ShuffleZero;
41556 SmallVector<int, 16> ShuffleMask;
41557 SmallVector<SDValue, 2> ShuffleOps;
41558 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
41559 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
41560 // If all the demanded elts are from one operand and are inline,
41561 // then we can use the operand directly.
41562 int NumOps = ShuffleOps.size();
41563 if (ShuffleMask.size() == (unsigned)NumElts &&
41564 llvm::all_of(ShuffleOps, [VT](SDValue V) {
41565 return VT.getSizeInBits() == V.getValueSizeInBits();
41566 })) {
41567
41568 if (DemandedElts.isSubsetOf(ShuffleUndef))
41569 return DAG.getUNDEF(VT);
41570 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
41571 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
41572
41573 // Bitmask that indicates which ops have only been accessed 'inline'.
41574 APInt IdentityOp = APInt::getAllOnes(NumOps);
41575 for (int i = 0; i != NumElts; ++i) {
41576 int M = ShuffleMask[i];
41577 if (!DemandedElts[i] || ShuffleUndef[i])
41578 continue;
41579 int OpIdx = M / NumElts;
41580 int EltIdx = M % NumElts;
41581 if (M < 0 || EltIdx != i) {
41582 IdentityOp.clearAllBits();
41583 break;
41584 }
41585 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
41586 if (IdentityOp == 0)
41587 break;
41588 }
41589 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
41590        "Multiple identity shuffles detected");
41591
41592 if (IdentityOp != 0)
41593 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
41594 }
41595 }
41596
41597 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41598 Op, DemandedBits, DemandedElts, DAG, Depth);
41599}
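The identity-operand scan above can be pictured with a simplified standalone model that ignores the undef/zero bookkeeping: every demanded lane must read its own element index from one and the same operand. findIdentityOp below is a made-up helper used only for illustration:

#include <cassert>
#include <cstdint>
#include <vector>

// Every demanded lane I must read element I of one and the same operand,
// i.e. a mask value of OpIdx * NumElts + I. Returns that operand index,
// or -1 if there is no single identity operand.
static int findIdentityOp(const std::vector<int> &Mask, uint32_t DemandedElts,
                          int NumOps) {
  int NumElts = (int)Mask.size();
  int Identity = -1;
  for (int I = 0; I != NumElts; ++I) {
    if (!(DemandedElts & (1u << I)))
      continue;                              // undemanded lanes don't matter
    int M = Mask[I];
    if (M < 0 || M % NumElts != I)
      return -1;                             // not an inline element access
    int OpIdx = M / NumElts;
    if (OpIdx >= NumOps || (Identity != -1 && Identity != OpIdx))
      return -1;                             // mixes operands
    Identity = OpIdx;
  }
  return Identity;
}

int main() {
  // Two 4-element operands; demanded lanes 0 and 2 read inline from operand 1
  // (mask values 4 and 6), so operand 1 can be returned directly.
  assert(findIdentityOp({4, 1, 6, 3}, 0b0101, 2) == 1);
  // Additionally demanding lane 1, which reads operand 0, defeats the fold.
  assert(findIdentityOp({4, 1, 6, 3}, 0b0111, 2) == -1);
  return 0;
}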
41600
41601bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
41602 const APInt &DemandedElts,
41603 APInt &UndefElts,
41604 unsigned Depth) const {
41605 unsigned NumElts = DemandedElts.getBitWidth();
41606 unsigned Opc = Op.getOpcode();
41607
41608 switch (Opc) {
41609 case X86ISD::VBROADCAST:
41610 case X86ISD::VBROADCAST_LOAD:
41611 UndefElts = APInt::getNullValue(NumElts);
41612 return true;
41613 }
41614
41615 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
41616 Depth);
41617}
41618
41619// Helper to peek through bitops/trunc/setcc to determine size of source vector.
41620// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
41621static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
41622 bool AllowTruncate) {
41623 switch (Src.getOpcode()) {
41624 case ISD::TRUNCATE:
41625 if (!AllowTruncate)
41626 return false;
41627 LLVM_FALLTHROUGH;
41628 case ISD::SETCC:
41629 return Src.getOperand(0).getValueSizeInBits() == Size;
41630 case ISD::AND:
41631 case ISD::XOR:
41632 case ISD::OR:
41633 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
41634 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
41635 }
41636 return false;
41637}
41638
41639// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
41640static unsigned getAltBitOpcode(unsigned Opcode) {
41641 switch(Opcode) {
41642 case ISD::AND: return X86ISD::FAND;
41643 case ISD::OR: return X86ISD::FOR;
41644 case ISD::XOR: return X86ISD::FXOR;
41645 case X86ISD::ANDNP: return X86ISD::FANDN;
41646 }
41647 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41647)
;
41648}
41649
41650// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
41651static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
41652 const SDLoc &DL) {
41653 EVT SrcVT = Src.getValueType();
41654 if (SrcVT != MVT::v4i1)
41655 return SDValue();
41656
41657 switch (Src.getOpcode()) {
41658 case ISD::SETCC:
41659 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
41660 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
41661 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
41662 SDValue Op0 = Src.getOperand(0);
41663 if (ISD::isNormalLoad(Op0.getNode()))
41664 return DAG.getBitcast(MVT::v4f32, Op0);
41665 if (Op0.getOpcode() == ISD::BITCAST &&
41666 Op0.getOperand(0).getValueType() == MVT::v4f32)
41667 return Op0.getOperand(0);
41668 }
41669 break;
41670 case ISD::AND:
41671 case ISD::XOR:
41672 case ISD::OR: {
41673 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
41674 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
41675 if (Op0 && Op1)
41676 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
41677 Op1);
41678 break;
41679 }
41680 }
41681 return SDValue();
41682}
41683
41684// Helper to push sign extension of vXi1 SETCC result through bitops.
41685static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
41686 SDValue Src, const SDLoc &DL) {
41687 switch (Src.getOpcode()) {
41688 case ISD::SETCC:
41689 case ISD::TRUNCATE:
41690 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41691 case ISD::AND:
41692 case ISD::XOR:
41693 case ISD::OR:
41694 return DAG.getNode(
41695 Src.getOpcode(), DL, SExtVT,
41696 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
41697 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
41698 }
41699 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41699)
;
41700}
41701
41702// Try to match patterns such as
41703// (i16 bitcast (v16i1 x))
41704// ->
41705// (i16 movmsk (16i8 sext (v16i1 x)))
41706// before the illegal vector is scalarized on subtargets that don't have legal
41707// vxi1 types.
41708static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
41709 const SDLoc &DL,
41710 const X86Subtarget &Subtarget) {
41711 EVT SrcVT = Src.getValueType();
41712 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
41713 return SDValue();
41714
41715 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
41716 // legalization destroys the v4i32 type.
41717 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
41718 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
41719 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
41720 DAG.getBitcast(MVT::v4f32, V));
41721 return DAG.getZExtOrTrunc(V, DL, VT);
41722 }
41723 }
41724
41725 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
41726 // movmskb even with avx512. This will be better than truncating to vXi1 and
41727 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
41728 // vpcmpeqb/vpcmpgtb.
41729 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
41730 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
41731 Src.getOperand(0).getValueType() == MVT::v32i8 ||
41732 Src.getOperand(0).getValueType() == MVT::v64i8);
41733
41734 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
41735 // directly with vpmovmskb/vmovmskps/vmovmskpd.
41736 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
41737 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
41738 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
41739 EVT CmpVT = Src.getOperand(0).getValueType();
41740 EVT EltVT = CmpVT.getVectorElementType();
41741 if (CmpVT.getSizeInBits() <= 256 &&
41742 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
41743 PreferMovMsk = true;
41744 }
41745
41746 // With AVX512 vxi1 types are legal and we prefer using k-regs.
41747 // MOVMSK is supported in SSE2 or later.
41748 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
41749 return SDValue();
41750
41751 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
41752 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
41753 // v8i16 and v16i16.
41754 // For these two cases, we can shuffle the upper element bytes to a
41755 // consecutive sequence at the start of the vector and treat the results as
41756 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
41757 // for v16i16 this is not the case, because the shuffle is expensive, so we
41758 // avoid sign-extending to this type entirely.
41759 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
41760 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
41761 MVT SExtVT;
41762 bool PropagateSExt = false;
41763 switch (SrcVT.getSimpleVT().SimpleTy) {
41764 default:
41765 return SDValue();
41766 case MVT::v2i1:
41767 SExtVT = MVT::v2i64;
41768 break;
41769 case MVT::v4i1:
41770 SExtVT = MVT::v4i32;
41771 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
41772 // sign-extend to a 256-bit operation to avoid truncation.
41773 if (Subtarget.hasAVX() &&
41774 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
41775 SExtVT = MVT::v4i64;
41776 PropagateSExt = true;
41777 }
41778 break;
41779 case MVT::v8i1:
41780 SExtVT = MVT::v8i16;
41781 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
41782 // sign-extend to a 256-bit operation to match the compare.
41783 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
41784 // 256-bit because the shuffle is cheaper than sign extending the result of
41785 // the compare.
41786 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
41787 checkBitcastSrcVectorSize(Src, 512, true))) {
41788 SExtVT = MVT::v8i32;
41789 PropagateSExt = true;
41790 }
41791 break;
41792 case MVT::v16i1:
41793 SExtVT = MVT::v16i8;
41794 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
41795 // it is not profitable to sign-extend to 256-bit because this will
41796 // require an extra cross-lane shuffle which is more expensive than
41797 // truncating the result of the compare to 128-bits.
41798 break;
41799 case MVT::v32i1:
41800 SExtVT = MVT::v32i8;
41801 break;
41802 case MVT::v64i1:
41803 // If we have AVX512F but not AVX512BW, and the input is truncated from
41804 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
41805 if (Subtarget.hasAVX512()) {
41806 if (Subtarget.hasBWI())
41807 return SDValue();
41808 SExtVT = MVT::v64i8;
41809 break;
41810 }
41811 // Split if this is a <64 x i8> comparison result.
41812 if (checkBitcastSrcVectorSize(Src, 512, false)) {
41813 SExtVT = MVT::v64i8;
41814 break;
41815 }
41816 return SDValue();
41817 };
41818
41819 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
41820 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41821
41822 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
41823 V = getPMOVMSKB(DL, V, DAG, Subtarget);
41824 } else {
41825 if (SExtVT == MVT::v8i16)
41826 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
41827 DAG.getUNDEF(MVT::v8i16));
41828 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
41829 }
41830
41831 EVT IntVT =
41832 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
41833 V = DAG.getZExtOrTrunc(V, DL, IntVT);
41834 return DAG.getBitcast(VT, V);
41835}
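The transform implemented above relies on the fact that bit i of the bitcast result equals the sign bit of lane i after sign-extending each i1 element, which is exactly what (P)MOVMSK collects. A standalone scalar check of that equivalence for the v16i1 case (plain C++, no LLVM types):

#include <cassert>
#include <cstdint>

int main() {
  bool X[16] = {true,  false, false, true,  true,  true,  false, false,
                false, false, false, false, true,  false, true,  false};

  // Direct bitcast model: element i becomes bit i of the i16.
  uint16_t Bitcast = 0;
  for (unsigned I = 0; I != 16; ++I)
    Bitcast |= (uint16_t)X[I] << I;

  // Sign-extend each i1 to i8 and collect the MSBs, as PMOVMSKB would.
  uint16_t Movmsk = 0;
  for (unsigned I = 0; I != 16; ++I) {
    uint8_t Lane = X[I] ? 0xFF : 0x00;           // sext i1 -> i8
    Movmsk |= (uint16_t)(Lane >> 7) << I;
  }

  assert(Bitcast == Movmsk);
  return 0;
}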
41836
41837// Convert a vXi1 constant build vector to the same width scalar integer.
41838static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
41839 EVT SrcVT = Op.getValueType();
41840 assert(SrcVT.getVectorElementType() == MVT::i1 &&
41841        "Expected a vXi1 vector");
41842 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
41843        "Expected a constant build vector");
41844
41845 APInt Imm(SrcVT.getVectorNumElements(), 0);
41846 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
41847 SDValue In = Op.getOperand(Idx);
41848 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
41849 Imm.setBit(Idx);
41850 }
41851 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
41852 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
41853}
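A minimal standalone sketch of the packing rule used by combinevXi1ConstantToInteger: element i of the constant vXi1 vector becomes bit i of the scalar, with undef elements treated as zero. packBoolVector is a made-up name used only for illustration:

#include <cassert>
#include <cstdint>

// Element i of the constant vXi1 vector becomes bit i of the scalar;
// undef elements are simply left as zero.
static uint16_t packBoolVector(const bool Elts[], unsigned NumElts) {
  uint16_t Imm = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    if (Elts[I])
      Imm |= (uint16_t)((uint16_t)1 << I);
  return Imm;
}

int main() {
  const bool Elts[8] = {true, false, true, true, false, false, false, true};
  assert(packBoolVector(Elts, 8) == 0x8D);
  return 0;
}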
41854
41855static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
41856 TargetLowering::DAGCombinerInfo &DCI,
41857 const X86Subtarget &Subtarget) {
41858 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
41859
41860 if (!DCI.isBeforeLegalizeOps())
41861 return SDValue();
41862
41863 // Only do this if we have k-registers.
41864 if (!Subtarget.hasAVX512())
41865 return SDValue();
41866
41867 EVT DstVT = N->getValueType(0);
41868 SDValue Op = N->getOperand(0);
41869 EVT SrcVT = Op.getValueType();
41870
41871 if (!Op.hasOneUse())
41872 return SDValue();
41873
41874 // Look for logic ops.
41875 if (Op.getOpcode() != ISD::AND &&
41876 Op.getOpcode() != ISD::OR &&
41877 Op.getOpcode() != ISD::XOR)
41878 return SDValue();
41879
41880 // Make sure we have a bitcast between mask registers and a scalar type.
41881 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
41882 DstVT.isScalarInteger()) &&
41883 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
41884 SrcVT.isScalarInteger()))
41885 return SDValue();
41886
41887 SDValue LHS = Op.getOperand(0);
41888 SDValue RHS = Op.getOperand(1);
41889
41890 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
41891 LHS.getOperand(0).getValueType() == DstVT)
41892 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
41893 DAG.getBitcast(DstVT, RHS));
41894
41895 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
41896 RHS.getOperand(0).getValueType() == DstVT)
41897 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41898 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
41899
41900 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
41901 // Most of these have to move a constant from the scalar domain anyway.
41902 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
41903 RHS = combinevXi1ConstantToInteger(RHS, DAG);
41904 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41905 DAG.getBitcast(DstVT, LHS), RHS);
41906 }
41907
41908 return SDValue();
41909}
41910
41911static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
41912 const X86Subtarget &Subtarget) {
41913 SDLoc DL(BV);
41914 unsigned NumElts = BV->getNumOperands();
41915 SDValue Splat = BV->getSplatValue();
41916
41917 // Build MMX element from integer GPR or SSE float values.
41918 auto CreateMMXElement = [&](SDValue V) {
41919 if (V.isUndef())
41920 return DAG.getUNDEF(MVT::x86mmx);
41921 if (V.getValueType().isFloatingPoint()) {
41922 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
41923 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
41924 V = DAG.getBitcast(MVT::v2i64, V);
41925 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
41926 }
41927 V = DAG.getBitcast(MVT::i32, V);
41928 } else {
41929 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
41930 }
41931 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
41932 };
41933
41934 // Convert build vector ops to MMX data in the bottom elements.
41935 SmallVector<SDValue, 8> Ops;
41936
41937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41938
41939 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
41940 if (Splat) {
41941 if (Splat.isUndef())
41942 return DAG.getUNDEF(MVT::x86mmx);
41943
41944 Splat = CreateMMXElement(Splat);
41945
41946 if (Subtarget.hasSSE1()) {
41947 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
41948 if (NumElts == 8)
41949 Splat = DAG.getNode(
41950 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41951 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
41952 TLI.getPointerTy(DAG.getDataLayout())),
41953 Splat, Splat);
41954
41955 // Use PSHUFW to repeat 16-bit elements.
41956 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
41957 return DAG.getNode(
41958 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41959 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
41960 TLI.getPointerTy(DAG.getDataLayout())),
41961 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
41962 }
41963 Ops.append(NumElts, Splat);
41964 } else {
41965 for (unsigned i = 0; i != NumElts; ++i)
41966 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
41967 }
41968
41969 // Use tree of PUNPCKLs to build up general MMX vector.
41970 while (Ops.size() > 1) {
41971 unsigned NumOps = Ops.size();
41972 unsigned IntrinOp =
41973 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
41974 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
41975 : Intrinsic::x86_mmx_punpcklbw));
41976 SDValue Intrin = DAG.getTargetConstant(
41977 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
41978 for (unsigned i = 0; i != NumOps; i += 2)
41979 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
41980 Ops[i], Ops[i + 1]);
41981 Ops.resize(NumOps / 2);
41982 }
41983
41984 return Ops[0];
41985}
41986
41987// Recursive function that attempts to find if a bool vector node was originally
41988// a vector/float/double that got truncated/extended/bitcast to/from a scalar
41989// integer. If so, replace the scalar ops with bool vector equivalents back down
41990// the chain.
41991static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
41992 SelectionDAG &DAG,
41993 const X86Subtarget &Subtarget) {
41994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41995 unsigned Opc = V.getOpcode();
41996 switch (Opc) {
41997 case ISD::BITCAST: {
41998 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
41999 SDValue Src = V.getOperand(0);
42000 EVT SrcVT = Src.getValueType();
42001 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
42002 return DAG.getBitcast(VT, Src);
42003 break;
42004 }
42005 case ISD::TRUNCATE: {
42006 // If we find a suitable source, a truncated scalar becomes a subvector.
42007 SDValue Src = V.getOperand(0);
42008 EVT NewSrcVT =
42009 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
42010 if (TLI.isTypeLegal(NewSrcVT))
42011 if (SDValue N0 =
42012 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42013 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
42014 DAG.getIntPtrConstant(0, DL));
42015 break;
42016 }
42017 case ISD::ANY_EXTEND:
42018 case ISD::ZERO_EXTEND: {
42019 // If we find a suitable source, an extended scalar becomes a subvector.
42020 SDValue Src = V.getOperand(0);
42021 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
42022 Src.getScalarValueSizeInBits());
42023 if (TLI.isTypeLegal(NewSrcVT))
42024 if (SDValue N0 =
42025 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42026 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42027 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
42028 : DAG.getConstant(0, DL, VT),
42029 N0, DAG.getIntPtrConstant(0, DL));
42030 break;
42031 }
42032 case ISD::OR: {
42033 // If we find suitable sources, we can just move an OR to the vector domain.
42034 SDValue Src0 = V.getOperand(0);
42035 SDValue Src1 = V.getOperand(1);
42036 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42037 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
42038 return DAG.getNode(Opc, DL, VT, N0, N1);
42039 break;
42040 }
42041 case ISD::SHL: {
42042 // If we find a suitable source, a SHL becomes a KSHIFTL.
42043 SDValue Src0 = V.getOperand(0);
42044 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
42045 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
42046 break;
42047
42048 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
42049 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42050 return DAG.getNode(
42051 X86ISD::KSHIFTL, DL, VT, N0,
42052 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
42053 break;
42054 }
42055 }
42056 return SDValue();
42057}
42058
42059static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
42060 TargetLowering::DAGCombinerInfo &DCI,
42061 const X86Subtarget &Subtarget) {
42062 SDValue N0 = N->getOperand(0);
42063 EVT VT = N->getValueType(0);
42064 EVT SrcVT = N0.getValueType();
42065 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42066
42067 // Try to match patterns such as
42068 // (i16 bitcast (v16i1 x))
42069 // ->
42070 // (i16 movmsk (16i8 sext (v16i1 x)))
42071 // before the setcc result is scalarized on subtargets that don't have legal
42072 // vxi1 types.
42073 if (DCI.isBeforeLegalize()) {
42074 SDLoc dl(N);
42075 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
42076 return V;
42077
42078 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42079 // type, widen both sides to avoid a trip through memory.
42080 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
42081 Subtarget.hasAVX512()) {
42082 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
42083 N0 = DAG.getBitcast(MVT::v8i1, N0);
42084 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
42085 DAG.getIntPtrConstant(0, dl));
42086 }
42087
42088 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42089 // type, widen both sides to avoid a trip through memory.
42090 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
42091 Subtarget.hasAVX512()) {
42092 // Use zeros for the widening if we already have some zeroes. This can
42093 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
42094 // stream of this.
42095 // FIXME: It might make sense to detect a concat_vectors with a mix of
42096 // zeroes and undef and turn it into insert_subvector for i1 vectors as
42097 // a separate combine. What we can't do is canonicalize the operands of
42098 // such a concat or we'll get into a loop with SimplifyDemandedBits.
42099 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
42100 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
42101 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
42102 SrcVT = LastOp.getValueType();
42103 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42104 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
42105 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
42106 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42107 N0 = DAG.getBitcast(MVT::i8, N0);
42108 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42109 }
42110 }
42111
42112 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42113 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
42114 Ops[0] = N0;
42115 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42116 N0 = DAG.getBitcast(MVT::i8, N0);
42117 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42118 }
42119 } else {
42120 // If we're bitcasting from iX to vXi1, see if the integer originally
42121 // began as a vXi1 and whether we can remove the bitcast entirely.
42122 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
42123 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
42124 if (SDValue V =
42125 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
42126 return V;
42127 }
42128 }
42129
42130 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
42131 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
42132 // due to insert_subvector legalization on KNL. By promoting the copy to i16
42133 // we can help with known bits propagation from the vXi1 domain to the
42134 // scalar domain.
42135 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
42136 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42137 N0.getOperand(0).getValueType() == MVT::v16i1 &&
42138 isNullConstant(N0.getOperand(1)))
42139 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
42140 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
42141
42142 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
42143 // and the vbroadcast_load are both integer or both fp. In some cases this
42144 // will remove the bitcast entirely.
42145 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
42146 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
42147 auto *BCast = cast<MemIntrinsicSDNode>(N0);
42148 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
42149 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
42150 // Don't swap i8/i16 since we don't have fp types of that size.
42151 if (MemSize >= 32) {
42152 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
42153 : MVT::getIntegerVT(MemSize);
42154 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
42155 : MVT::getIntegerVT(SrcVTSize);
42156 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
42157
42158 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
42159 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
42160 SDValue ResNode =
42161 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
42162 MemVT, BCast->getMemOperand());
42163 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
42164 return DAG.getBitcast(VT, ResNode);
42165 }
42166 }
42167
42168 // Since MMX types are special and don't usually play with other vector types,
42169 // it's better to handle them early to be sure we emit efficient code by
42170 // avoiding store-load conversions.
42171 if (VT == MVT::x86mmx) {
42172 // Detect MMX constant vectors.
42173 APInt UndefElts;
42174 SmallVector<APInt, 1> EltBits;
42175 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
42176 SDLoc DL(N0);
42177 // Handle zero-extension of i32 with MOVD.
42178 if (EltBits[0].countLeadingZeros() >= 32)
42179 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
42180 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
42181 // Else, bitcast to a double.
42182 // TODO - investigate supporting sext 32-bit immediates on x86_64.
42183 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
42184 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
42185 }
42186
42187 // Detect bitcasts to x86mmx low word.
42188 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42189 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
42190 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
42191 bool LowUndef = true, AllUndefOrZero = true;
42192 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
42193 SDValue Op = N0.getOperand(i);
42194 LowUndef &= Op.isUndef() || (i >= e/2);
42195 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
42196 }
42197 if (AllUndefOrZero) {
42198 SDValue N00 = N0.getOperand(0);
42199 SDLoc dl(N00);
42200 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
42201 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
42202 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
42203 }
42204 }
42205
42206 // Detect bitcasts of 64-bit build vectors and convert to a
42207 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
42208 // lowest element.
42209 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42210 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
42211 SrcVT == MVT::v8i8))
42212 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
42213
42214 // Detect bitcasts between element or subvector extraction to x86mmx.
42215 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
42216 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
42217 isNullConstant(N0.getOperand(1))) {
42218 SDValue N00 = N0.getOperand(0);
42219 if (N00.getValueType().is128BitVector())
42220 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
42221 DAG.getBitcast(MVT::v2i64, N00));
42222 }
42223
42224 // Detect bitcasts from FP_TO_SINT to x86mmx.
42225 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
42226 SDLoc DL(N0);
42227 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
42228 DAG.getUNDEF(MVT::v2i32));
42229 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
42230 DAG.getBitcast(MVT::v2i64, Res));
42231 }
42232 }
42233
42234 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
42235 // most of these to scalar anyway.
42236 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
42237 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
42238 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
42239 return combinevXi1ConstantToInteger(N0, DAG);
42240 }
42241
42242 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42243 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42244 isa<ConstantSDNode>(N0)) {
42245 auto *C = cast<ConstantSDNode>(N0);
42246 if (C->isAllOnes())
42247 return DAG.getConstant(1, SDLoc(N0), VT);
42248 if (C->isZero())
42249 return DAG.getConstant(0, SDLoc(N0), VT);
42250 }
42251
42252 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
42253 // Turn it into a sign bit compare that produces a k-register. This avoids
42254 // a trip through a GPR.
42255 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42256 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42257 isPowerOf2_32(VT.getVectorNumElements())) {
42258 unsigned NumElts = VT.getVectorNumElements();
42259 SDValue Src = N0;
42260
42261 // Peek through truncate.
42262 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
42263 Src = N0.getOperand(0);
42264
42265 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
42266 SDValue MovmskIn = Src.getOperand(0);
42267 MVT MovmskVT = MovmskIn.getSimpleValueType();
42268 unsigned MovMskElts = MovmskVT.getVectorNumElements();
42269
42270 // We allow extra bits of the movmsk to be used since they are known zero.
42271 // We can't convert a VPMOVMSKB without avx512bw.
42272 if (MovMskElts <= NumElts &&
42273 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
42274 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
42275 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
42276 SDLoc dl(N);
42277 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
42278 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
42279 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
42280 if (EVT(CmpVT) == VT)
42281 return Cmp;
42282
42283 // Pad with zeroes up to original VT to replace the zeroes that were
42284 // being used from the MOVMSK.
42285 unsigned NumConcats = NumElts / MovMskElts;
42286 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
42287 Ops[0] = Cmp;
42288 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
42289 }
42290 }
42291 }
42292
42293 // Try to remove bitcasts from input and output of mask arithmetic to
42294 // remove GPR<->K-register crossings.
42295 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
42296 return V;
42297
42298 // Convert a bitcasted integer logic operation that has one bitcasted
42299 // floating-point operand into a floating-point logic operation. This may
42300 // create a load of a constant, but that is cheaper than materializing the
42301 // constant in an integer register and transferring it to an SSE register or
42302 // transferring the SSE operand to integer register and back.
42303 unsigned FPOpcode;
42304 switch (N0.getOpcode()) {
42305 case ISD::AND: FPOpcode = X86ISD::FAND; break;
42306 case ISD::OR: FPOpcode = X86ISD::FOR; break;
42307 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
42308 default: return SDValue();
42309 }
42310
42311 // Check if we have a bitcast from another integer type as well.
42312 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
42313 (Subtarget.hasSSE2() && VT == MVT::f64) ||
42314 (Subtarget.hasFP16() && VT == MVT::f16) ||
42315 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
42316 TLI.isTypeLegal(VT))))
42317 return SDValue();
42318
42319 SDValue LogicOp0 = N0.getOperand(0);
42320 SDValue LogicOp1 = N0.getOperand(1);
42321 SDLoc DL0(N0);
42322
42323 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
42324 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
42325 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
42326 LogicOp0.getOperand(0).getValueType() == VT &&
42327 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
42328 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
42329 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42330 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
42331 }
42332 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
42333 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
42334 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
42335 LogicOp1.getOperand(0).getValueType() == VT &&
42336 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
42337 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
42338 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42339 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
42340 }
42341
42342 return SDValue();
42343}
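A sketch of the final logic-op transform above for the f32 case (illustrative operands, not taken from the source):

// (f32 bitcast (i32 and (i32 bitcast (f32 X)), C))
//   --> (f32 X86ISD::FAND (f32 X), (f32 bitcast C))
// The bitwise AND now stays in an SSE register; C may become a constant-pool
// load, which the comment above argues is still cheaper than a GPR round trip.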
42344
42345 // (mul (zext a), (sext b))
42346static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
42347 SDValue &Op1) {
42348 Op0 = Mul.getOperand(0);
42349 Op1 = Mul.getOperand(1);
42350
42351 // Operand 1 should be a sign extend.
42352 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
42353 std::swap(Op0, Op1);
42354
42355 auto IsFreeTruncation = [](SDValue &Op) -> bool {
42356 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
42357 Op.getOpcode() == ISD::SIGN_EXTEND) &&
42358 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
42359 return true;
42360
42361 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
42362 return (BV && BV->isConstant());
42363 };
42364
42365 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
42366 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
42367 // signed value, so we just check its number of significant bits.
42368 if ((IsFreeTruncation(Op0) &&
42369 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
42370 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
42371 return true;
42372
42373 return false;
42374}
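An example of a multiply that detectExtMul accepts (illustrative):

// Op0 = (zero_extend (vXi8 A))   // known to fit in 8 unsigned bits
// Op1 = (sign_extend (vXi8 B))   // at most 8 significant bits
// (mul Op0, Op1) can then be lowered via VPDPBUSD, which multiplies unsigned
// bytes from the first source by signed bytes from the second.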
42375
42376 // Given an ABS node, detect the following pattern:
42377// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
42378// This is useful as it is the input into a SAD pattern.
42379static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
42380 SDValue AbsOp1 = Abs->getOperand(0);
42381 if (AbsOp1.getOpcode() != ISD::SUB)
42382 return false;
42383
42384 Op0 = AbsOp1.getOperand(0);
42385 Op1 = AbsOp1.getOperand(1);
42386
42387 // Check if the operands of the sub are zero-extended from vectors of i8.
42388 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
42389 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
42390 Op1.getOpcode() != ISD::ZERO_EXTEND ||
42391 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
42392 return false;
42393
42394 return true;
42395}
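The DAG shape detectZextAbsDiff matches, written out (illustrative):

// (abs (sub (zero_extend (vXi8 A)), (zero_extend (vXi8 B))))
// Op0/Op1 are set to the two zero_extend nodes; their vXi8 sources become the
// PSADBW operands in createPSADBW below.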
42396
42397static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
42398 unsigned &LogBias, const SDLoc &DL,
42399 const X86Subtarget &Subtarget) {
42400 // Extend or truncate to MVT::i8 first.
42401 MVT Vi8VT =
42402 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
42403 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
42404 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
42405
42406 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
42407 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
42408 // The src A, B element type is i8, but the dst C element type is i32.
42409 // When we calculate the number of reduction stages we use the src vector
42410 // type vXi8, so we need a log-bias of 2 to avoid 2 extra stages.
42411 LogBias = 2;
42412
42413 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
42414 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
42415 RegSize = std::max(512u, RegSize);
42416
42417 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
42418 // fill in the missing vector elements with 0.
42419 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
42420 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
42421 Ops[0] = LHS;
42422 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42423 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42424 Ops[0] = RHS;
42425 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42426
42427 // Actually build the DotProduct, split as 256/512 bits for
42428 // AVXVNNI/AVX512VNNI.
42429 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42430 ArrayRef<SDValue> Ops) {
42431 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
42432 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
42433 };
42434 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
42435 SDValue Zero = DAG.getConstant(0, DL, DpVT);
42436
42437 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
42438 DpBuilder, false);
42439}
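A numeric sketch of the log-bias returned above (assuming a v16i8 input; the numbers are only illustrative):

// Full add reduction of 16 lanes: log2(16) = 4 shuffle+add stages.
// One VPDPBUSD already sums 4 byte products into each i32 lane: log2(4) = 2.
// Stages left for the caller to emit: 4 - 2 = 2, hence LogBias = 2.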
42440
42441// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
42442// to these zexts.
42443static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
42444 const SDValue &Zext1, const SDLoc &DL,
42445 const X86Subtarget &Subtarget) {
42446 // Find the appropriate width for the PSADBW.
42447 EVT InVT = Zext0.getOperand(0).getValueType();
42448 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
42449
42450 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
42451 // fill in the missing vector elements with 0.
42452 unsigned NumConcat = RegSize / InVT.getSizeInBits();
42453 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
42454 Ops[0] = Zext0.getOperand(0);
42455 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42456 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42457 Ops[0] = Zext1.getOperand(0);
42458 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42459
42460 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
42461 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42462 ArrayRef<SDValue> Ops) {
42463 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
42464 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
42465 };
42466 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
42467 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
42468 PSADBWBuilder);
42469}
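What a single PSADBW lane computes, with made-up byte values:

// For each 64-bit chunk, PSADBW sums |A[i] - B[i]| over the 8 byte pairs and
// stores the result in the low bits of that i64 lane, e.g.
//   A = {1, 5, 0, ...}, B = {4, 2, 7, ...}  ->  |1-4| + |5-2| + |0-7| + ...
// so a <k x i8> absolute-difference sum collapses to k/8 partial sums that the
// callers then fold together.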
42470
42471 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
42472// PHMINPOSUW.
42473static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
42474 const X86Subtarget &Subtarget) {
42475 // Bail without SSE41.
42476 if (!Subtarget.hasSSE41())
42477 return SDValue();
42478
42479 EVT ExtractVT = Extract->getValueType(0);
42480 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
42481 return SDValue();
42482
42483 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
42484 ISD::NodeType BinOp;
42485 SDValue Src = DAG.matchBinOpReduction(
42486 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
42487 if (!Src)
42488 return SDValue();
42489
42490 EVT SrcVT = Src.getValueType();
42491 EVT SrcSVT = SrcVT.getScalarType();
42492 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
42493 return SDValue();
42494
42495 SDLoc DL(Extract);
42496 SDValue MinPos = Src;
42497
42498 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
42499 while (SrcVT.getSizeInBits() > 128) {
42500 SDValue Lo, Hi;
42501 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
42502 SrcVT = Lo.getValueType();
42503 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
42504 }
42505 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
42506 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
42507 "Unexpected value type");
42508
42509 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
42510 // to flip the value accordingly.
42511 SDValue Mask;
42512 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
42513 if (BinOp == ISD::SMAX)
42514 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
42515 else if (BinOp == ISD::SMIN)
42516 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
42517 else if (BinOp == ISD::UMAX)
42518 Mask = DAG.getAllOnesConstant(DL, SrcVT);
42519
42520 if (Mask)
42521 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42522
42523 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
42524 // shuffling each upper element down and inserting zeros. This means that the
42525 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
42526 // ready for the PHMINPOS.
42527 if (ExtractVT == MVT::i8) {
42528 SDValue Upper = DAG.getVectorShuffle(
42529 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
42530 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
42531 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
42532 }
42533
42534 // Perform the PHMINPOS on a v8i16 vector.
42535 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
42536 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
42537 MinPos = DAG.getBitcast(SrcVT, MinPos);
42538
42539 if (Mask)
42540 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42541
42542 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
42543 DAG.getIntPtrConstant(0, DL));
42544}
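A sketch of the XOR trick used above, shown for the UMAX case (illustrative):

// umax(a, b) == ~umin(~a, ~b), so XORing the vector with all-ones converts a
// UMAX reduction into the UMIN reduction that PHMINPOSUW implements; the
// second XOR after PHMINPOS undoes the inversion on the reduced value. The
// SMIN/SMAX masks play the same role via the sign bit.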
42545
42546// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
42547static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
42548 const X86Subtarget &Subtarget) {
42549 // Bail without SSE2.
42550 if (!Subtarget.hasSSE2())
42551 return SDValue();
42552
42553 EVT ExtractVT = Extract->getValueType(0);
42554 unsigned BitWidth = ExtractVT.getSizeInBits();
42555 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
42556 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
42557 return SDValue();
42558
42559 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
42560 ISD::NodeType BinOp;
42561 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
42562 if (!Match && ExtractVT == MVT::i1)
42563 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
42564 if (!Match)
42565 return SDValue();
42566
42567 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
42568 // which we can't support here for now.
42569 if (Match.getScalarValueSizeInBits() != BitWidth)
42570 return SDValue();
42571
42572 SDValue Movmsk;
42573 SDLoc DL(Extract);
42574 EVT MatchVT = Match.getValueType();
42575 unsigned NumElts = MatchVT.getVectorNumElements();
42576 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
42577 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42578
42579 if (ExtractVT == MVT::i1) {
42580 // Special case for (pre-legalization) vXi1 reductions.
42581 if (NumElts > 64 || !isPowerOf2_32(NumElts))
42582 return SDValue();
42583 if (TLI.isTypeLegal(MatchVT)) {
42584 // If this is a legal AVX512 predicate type then we can just bitcast.
42585 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42586 Movmsk = DAG.getBitcast(MovmskVT, Match);
42587 } else {
42588 // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
42589 if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
42590 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
42591 ISD::CondCode::SETEQ) {
42592 EVT VecSVT = Match.getOperand(0).getValueType().getScalarType();
42593 if (VecSVT != MVT::i8) {
42594 NumElts *= VecSVT.getSizeInBits() / 8;
42595 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts);
42596 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
42597 Match = DAG.getSetCC(
42598 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
42599 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
42600 }
42601 }
42602
42603 // Use combineBitcastvxi1 to create the MOVMSK.
42604 while (NumElts > MaxElts) {
42605 SDValue Lo, Hi;
42606 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42607 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42608 NumElts /= 2;
42609 }
42610 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42611 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
42612 }
42613 if (!Movmsk)
42614 return SDValue();
42615 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
42616 } else {
42617 // FIXME: Better handling of k-registers or 512-bit vectors?
42618 unsigned MatchSizeInBits = Match.getValueSizeInBits();
42619 if (!(MatchSizeInBits == 128 ||
42620 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
42621 return SDValue();
42622
42623 // Make sure this isn't a vector of 1 element. The perf win from using
42624 // MOVMSK diminishes with fewer elements in the reduction, but it is
42625 // generally better to get the comparison over to the GPRs as soon as
42626 // possible to reduce the number of vector ops.
42627 if (Match.getValueType().getVectorNumElements() < 2)
42628 return SDValue();
42629
42630 // Check that we are extracting a reduction of all sign bits.
42631 if (DAG.ComputeNumSignBits(Match) != BitWidth)
42632 return SDValue();
42633
42634 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
42635 SDValue Lo, Hi;
42636 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42637 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42638 MatchSizeInBits = Match.getValueSizeInBits();
42639 }
42640
42641 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
42642 MVT MaskSrcVT;
42643 if (64 == BitWidth || 32 == BitWidth)
42644 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
42645 MatchSizeInBits / BitWidth);
42646 else
42647 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
42648
42649 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
42650 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
42651 NumElts = MaskSrcVT.getVectorNumElements();
42652 }
42653 assert((NumElts <= 32 || NumElts == 64) &&
42654 "Not expecting more than 64 elements");
42655
42656 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
42657 if (BinOp == ISD::XOR) {
42658 // parity -> (PARITY(MOVMSK X))
42659 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
42660 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
42661 }
42662
42663 SDValue CmpC;
42664 ISD::CondCode CondCode;
42665 if (BinOp == ISD::OR) {
42666 // any_of -> MOVMSK != 0
42667 CmpC = DAG.getConstant(0, DL, CmpVT);
42668 CondCode = ISD::CondCode::SETNE;
42669 } else {
42670 // all_of -> MOVMSK == ((1 << NumElts) - 1)
42671 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
42672 DL, CmpVT);
42673 CondCode = ISD::CondCode::SETEQ;
42674 }
42675
42676 // The setcc produces an i8 of 0/1, so extend that to the result width and
42677 // negate to get the final 0/-1 mask value.
42678 EVT SetccVT =
42679 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
42680 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
42681 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
42682 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
42683 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
42684}
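The comparison constants used above, spelled out for a v16i8 reduction (illustrative):

// Movmsk = MOVMSK(v16i8 X)               // 16 sign bits packed into an i32
// any_of: setcc(Movmsk, 0, SETNE)        // at least one lane was set
// all_of: setcc(Movmsk, 0xFFFF, SETEQ)   // (1 << 16) - 1, every lane set
// parity: PARITY(Movmsk)                 // XOR reduction of the lanes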
42685
42686static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
42687 const X86Subtarget &Subtarget) {
42688 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
42689 return SDValue();
42690
42691 EVT ExtractVT = Extract->getValueType(0);
42692 // Verify the type we're extracting is i32, as the output element type of
42693 // vpdpbusd is i32.
42694 if (ExtractVT != MVT::i32)
42695 return SDValue();
42696
42697 EVT VT = Extract->getOperand(0).getValueType();
42698 if (!isPowerOf2_32(VT.getVectorNumElements()))
42699 return SDValue();
42700
42701 // Match shuffle + add pyramid.
42702 ISD::NodeType BinOp;
42703 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42704
42705 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
42706 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
42707 // before adding into the accumulator.
42708 // TODO:
42709 // We also need to verify that the multiply has at least 2x the number of bits
42710 // of the input. We shouldn't match
42711 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
42712 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
42713 // Root = Root.getOperand(0);
42714
42715 // If there was a match, we want Root to be a mul.
42716 if (!Root || Root.getOpcode() != ISD::MUL)
42717 return SDValue();
42718
42719 // Check whether we have an extend and mul pattern
42720 SDValue LHS, RHS;
42721 if (!detectExtMul(DAG, Root, LHS, RHS))
42722 return SDValue();
42723
42724 // Create the dot product instruction.
42725 SDLoc DL(Extract);
42726 unsigned StageBias;
42727 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
42728
42729 // If the original vector was wider than 4 elements, sum over the results
42730 // in the DP vector.
42731 unsigned Stages = Log2_32(VT.getVectorNumElements());
42732 EVT DpVT = DP.getValueType();
42733
42734 if (Stages > StageBias) {
42735 unsigned DpElems = DpVT.getVectorNumElements();
42736
42737 for (unsigned i = Stages - StageBias; i > 0; --i) {
42738 SmallVector<int, 16> Mask(DpElems, -1);
42739 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42740 Mask[j] = MaskEnd + j;
42741
42742 SDValue Shuffle =
42743 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
42744 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
42745 }
42746 }
42747
42748 // Return the lowest ExtractSizeInBits bits.
42749 EVT ResVT =
42750 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42751 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
42752 DP = DAG.getBitcast(ResVT, DP);
42753 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
42754 Extract->getOperand(1));
42755}
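One folding step of the loop above, assuming an 8 x i32 DP vector (illustrative):

// i = 1, MaskEnd = 1:  Mask = {1, -1, -1, -1, -1, -1, -1, -1}
// DP = add(DP, shuffle(DP, undef, Mask)) adds lane 1 into lane 0, leaving the
// reduced value in element 0 for the final EXTRACT_VECTOR_ELT.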
42756
42757static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
42758 const X86Subtarget &Subtarget) {
42759 // PSADBW is only supported on SSE2 and up.
42760 if (!Subtarget.hasSSE2())
42761 return SDValue();
42762
42763 EVT ExtractVT = Extract->getValueType(0);
42764 // Verify the type we're extracting is either i32 or i64.
42765 // FIXME: Could support other types, but this is what we have coverage for.
42766 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
42767 return SDValue();
42768
42769 EVT VT = Extract->getOperand(0).getValueType();
42770 if (!isPowerOf2_32(VT.getVectorNumElements()))
42771 return SDValue();
42772
42773 // Match shuffle + add pyramid.
42774 ISD::NodeType BinOp;
42775 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42776
42777 // The operand is expected to be zero extended from i8
42778 // (verified in detectZextAbsDiff).
42779 // In order to convert to i64 and above, additional any/zero/sign
42780 // extend is expected.
42781 // The zero extend from 32 bits has no mathematical effect on the result.
42782 // Also, the sign extend is basically a zero extend
42783 // (it extends the sign bit, which is zero).
42784 // So it is correct to skip the sign/zero extend instruction.
42785 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
42786 Root.getOpcode() == ISD::ZERO_EXTEND ||
42787 Root.getOpcode() == ISD::ANY_EXTEND))
42788 Root = Root.getOperand(0);
42789
42790 // If there was a match, we want Root to be a select that is the root of an
42791 // abs-diff pattern.
42792 if (!Root || Root.getOpcode() != ISD::ABS)
42793 return SDValue();
42794
42795 // Check whether we have an abs-diff pattern feeding into the select.
42796 SDValue Zext0, Zext1;
42797 if (!detectZextAbsDiff(Root, Zext0, Zext1))
42798 return SDValue();
42799
42800 // Create the SAD instruction.
42801 SDLoc DL(Extract);
42802 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
42803
42804 // If the original vector was wider than 8 elements, sum over the results
42805 // in the SAD vector.
42806 unsigned Stages = Log2_32(VT.getVectorNumElements());
42807 EVT SadVT = SAD.getValueType();
42808 if (Stages > 3) {
42809 unsigned SadElems = SadVT.getVectorNumElements();
42810
42811 for(unsigned i = Stages - 3; i > 0; --i) {
42812 SmallVector<int, 16> Mask(SadElems, -1);
42813 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42814 Mask[j] = MaskEnd + j;
42815
42816 SDValue Shuffle =
42817 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
42818 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
42819 }
42820 }
42821
42822 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
42823 // Return the lowest ExtractSizeInBits bits.
42824 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42825 SadVT.getSizeInBits() / ExtractSizeInBits);
42826 SAD = DAG.getBitcast(ResVT, SAD);
42827 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
42828 Extract->getOperand(1));
42829}
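An end-to-end sketch of the pattern this combine replaces, for a v16i8 input (illustrative):

// Input:  i32 add reduction of (abs (sub (zext (v16i8 A)), (zext (v16i8 B))))
// Step 1: PSADBW A, B   -> v2i64 holding two per-8-byte partial sums
// Step 2: Stages - 3 = log2(16) - 3 = 1 shuffle+add fold sums the two halves
// Step 3: bitcast + extract element 0 yields the scalar result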
42830
42831// Attempt to peek through a target shuffle and extract the scalar from the
42832// source.
42833static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
42834 TargetLowering::DAGCombinerInfo &DCI,
42835 const X86Subtarget &Subtarget) {
42836 if (DCI.isBeforeLegalizeOps())
42837 return SDValue();
42838
42839 SDLoc dl(N);
42840 SDValue Src = N->getOperand(0);
42841 SDValue Idx = N->getOperand(1);
42842
42843 EVT VT = N->getValueType(0);
42844 EVT SrcVT = Src.getValueType();
42845 EVT SrcSVT = SrcVT.getVectorElementType();
42846 unsigned SrcEltBits = SrcSVT.getSizeInBits();
42847 unsigned NumSrcElts = SrcVT.getVectorNumElements();
42848
42849 // Don't attempt this for boolean mask vectors or unknown extraction indices.
42850 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
42851 return SDValue();
42852
42853 const APInt &IdxC = N->getConstantOperandAPInt(1);
42854 if (IdxC.uge(NumSrcElts))
42855 return SDValue();
42856
42857 SDValue SrcBC = peekThroughBitcasts(Src);
42858
42859 // Handle extract(bitcast(broadcast(scalar_value))).
42860 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
42861 SDValue SrcOp = SrcBC.getOperand(0);
42862 EVT SrcOpVT = SrcOp.getValueType();
42863 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
42864 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
42865 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
42866 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
42867 // TODO support non-zero offsets.
42868 if (Offset == 0) {
42869 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
42870 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
42871 return SrcOp;
42872 }
42873 }
42874 }
42875
42876 // If we're extracting a single element from a broadcast load and there are
42877 // no other users, just create a single load.
42878 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
42879 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
42880 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
42881 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
42882 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
42883 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
42884 MemIntr->getBasePtr(),
42885 MemIntr->getPointerInfo(),
42886 MemIntr->getOriginalAlign(),
42887 MemIntr->getMemOperand()->getFlags());
42888 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42889 return Load;
42890 }
42891 }
42892
42893 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
42894 // TODO: Move to DAGCombine?
42895 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
42896 SrcBC.getValueType().isInteger() &&
42897 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
42898 SrcBC.getScalarValueSizeInBits() ==
42899 SrcBC.getOperand(0).getValueSizeInBits()) {
42900 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
42901 if (IdxC.ult(Scale)) {
42902 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
42903 SDValue Scl = SrcBC.getOperand(0);
42904 EVT SclVT = Scl.getValueType();
42905 if (Offset) {
42906 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
42907 DAG.getShiftAmountConstant(Offset, SclVT, dl));
42908 }
42909 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
42910 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
42911 return Scl;
42912 }
42913 }
42914
42915 // Handle extract(truncate(x)) for 0'th index.
42916 // TODO: Treat this as a faux shuffle?
42917 // TODO: When can we use this for general indices?
42918 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
42919 (SrcVT.getSizeInBits() % 128) == 0) {
42920 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
42921 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
42922 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
42923 Idx);
42924 }
42925
42926 // We can only legally extract other elements from 128-bit vectors and in
42927 // certain circumstances, depending on SSE-level.
42928 // TODO: Investigate float/double extraction if it will be just stored.
42929 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
42930 unsigned Idx) {
42931 EVT VecSVT = VecVT.getScalarType();
42932 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
42933 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
42934 VecSVT == MVT::i64)) {
42935 unsigned EltSizeInBits = VecSVT.getSizeInBits();
42936 unsigned NumEltsPerLane = 128 / EltSizeInBits;
42937 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
42938 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
42939 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
42940 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
42941 Idx &= (NumEltsPerLane - 1);
42942 }
42943 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
42944 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
42945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
42946 DAG.getBitcast(VecVT, Vec),
42947 DAG.getIntPtrConstant(Idx, dl));
42948 }
42949 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
42950 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
42951 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
42952 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
42953 DAG.getTargetConstant(Idx, dl, MVT::i8));
42954 }
42955 return SDValue();
42956 };
42957
42958 // Resolve the target shuffle inputs and mask.
42959 SmallVector<int, 16> Mask;
42960 SmallVector<SDValue, 2> Ops;
42961 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
42962 return SDValue();
42963
42964 // Shuffle inputs must be the same size as the result.
42965 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
42966 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
42967 }))
42968 return SDValue();
42969
42970 // Attempt to narrow/widen the shuffle mask to the correct size.
42971 if (Mask.size() != NumSrcElts) {
42972 if ((NumSrcElts % Mask.size()) == 0) {
42973 SmallVector<int, 16> ScaledMask;
42974 int Scale = NumSrcElts / Mask.size();
42975 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
42976 Mask = std::move(ScaledMask);
42977 } else if ((Mask.size() % NumSrcElts) == 0) {
42978 // Simplify Mask based on demanded element.
42979 int ExtractIdx = (int)IdxC.getZExtValue();
42980 int Scale = Mask.size() / NumSrcElts;
42981 int Lo = Scale * ExtractIdx;
42982 int Hi = Scale * (ExtractIdx + 1);
42983 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
42984 if (i < Lo || Hi <= i)
42985 Mask[i] = SM_SentinelUndef;
42986
42987 SmallVector<int, 16> WidenedMask;
42988 while (Mask.size() > NumSrcElts &&
42989 canWidenShuffleElements(Mask, WidenedMask))
42990 Mask = std::move(WidenedMask);
42991 }
42992 }
42993
42994 // If narrowing/widening failed, see if we can extract+zero-extend.
42995 int ExtractIdx;
42996 EVT ExtractVT;
42997 if (Mask.size() == NumSrcElts) {
42998 ExtractIdx = Mask[IdxC.getZExtValue()];
42999 ExtractVT = SrcVT;
43000 } else {
43001 unsigned Scale = Mask.size() / NumSrcElts;
43002 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
43003 return SDValue();
43004 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
43005 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
43006 return SDValue();
43007 ExtractIdx = Mask[ScaledIdx];
43008 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
43009 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
43010 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
43011 "Failed to widen vector type");
43012 }
43013
43014 // If the shuffle source element is undef/zero then we can just accept it.
43015 if (ExtractIdx == SM_SentinelUndef)
43016 return DAG.getUNDEF(VT);
43017
43018 if (ExtractIdx == SM_SentinelZero)
43019 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
43020 : DAG.getConstant(0, dl, VT);
43021
43022 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
43023 ExtractIdx = ExtractIdx % Mask.size();
43024 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
43025 return DAG.getZExtOrTrunc(V, dl, VT);
43026
43027 return SDValue();
43028}
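A small example of the peek-through above (illustrative, assuming SSE4.1 so the non-zero-index extract is legal):

// (i32 extract_elt (v4i32 shuffle<2,3,0,1> X, undef), 0)
//   --> (i32 extract_elt X, 2)
// The shuffle mask is resolved, the demanded lane is traced back to its real
// source, and GetLegalExtract emits the extract directly from X.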
43029
43030/// Extracting a scalar FP value from vector element 0 is free, so extract each
43031/// operand first, then perform the math as a scalar op.
43032static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
43033 const X86Subtarget &Subtarget) {
43034 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
43035 SDValue Vec = ExtElt->getOperand(0);
43036 SDValue Index = ExtElt->getOperand(1);
43037 EVT VT = ExtElt->getValueType(0);
43038 EVT VecVT = Vec.getValueType();
43039
43040 // TODO: If this is a unary/expensive/expand op, allow extraction from a
43041 // non-zero element because the shuffle+scalar op will be cheaper?
43042 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
43043 return SDValue();
43044
43045 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
43046 // extract, the condition code), so deal with those as a special-case.
43047 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
43048 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
43049 if (OpVT != MVT::f32 && OpVT != MVT::f64)
43050 return SDValue();
43051
43052 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
43053 SDLoc DL(ExtElt);
43054 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43055 Vec.getOperand(0), Index);
43056 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43057 Vec.getOperand(1), Index);
43058 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
43059 }
43060
43061 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
43062 VT != MVT::f64)
43063 return SDValue();
43064
43065 // Vector FP selects don't fit the pattern of FP math ops (because the
43066 // condition has a different type and we have to change the opcode), so deal
43067 // with those here.
43068 // FIXME: This is restricted to pre type legalization by ensuring the setcc
43069 // has i1 elements. If we loosen this we need to convert vector bool to a
43070 // scalar bool.
43071 if (Vec.getOpcode() == ISD::VSELECT &&
43072 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
43073 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
43074 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
43075 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
43076 SDLoc DL(ExtElt);
43077 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
43078 Vec.getOperand(0).getValueType().getScalarType(),
43079 Vec.getOperand(0), Index);
43080 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43081 Vec.getOperand(1), Index);
43082 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43083 Vec.getOperand(2), Index);
43084 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
43085 }
43086
43087 // TODO: This switch could include FNEG and the x86-specific FP logic ops
43088 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
43089 // missed load folding and fma+fneg combining.
43090 switch (Vec.getOpcode()) {
43091 case ISD::FMA: // Begin 3 operands
43092 case ISD::FMAD:
43093 case ISD::FADD: // Begin 2 operands
43094 case ISD::FSUB:
43095 case ISD::FMUL:
43096 case ISD::FDIV:
43097 case ISD::FREM:
43098 case ISD::FCOPYSIGN:
43099 case ISD::FMINNUM:
43100 case ISD::FMAXNUM:
43101 case ISD::FMINNUM_IEEE:
43102 case ISD::FMAXNUM_IEEE:
43103 case ISD::FMAXIMUM:
43104 case ISD::FMINIMUM:
43105 case X86ISD::FMAX:
43106 case X86ISD::FMIN:
43107 case ISD::FABS: // Begin 1 operand
43108 case ISD::FSQRT:
43109 case ISD::FRINT:
43110 case ISD::FCEIL:
43111 case ISD::FTRUNC:
43112 case ISD::FNEARBYINT:
43113 case ISD::FROUND:
43114 case ISD::FFLOOR:
43115 case X86ISD::FRCP:
43116 case X86ISD::FRSQRT: {
43117 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
43118 SDLoc DL(ExtElt);
43119 SmallVector<SDValue, 4> ExtOps;
43120 for (SDValue Op : Vec->ops())
43121 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
43122 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
43123 }
43124 default:
43125 return SDValue();
43126 }
43127 llvm_unreachable("All opcodes should return within switch");
43128}
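One instance of the scalarization performed above (illustrative):

// (f32 extract_elt (v4f32 fadd X, Y), 0)
//   --> (f32 fadd (extract_elt X, 0), (extract_elt Y, 0))
// Extracting element 0 of an FP vector is free, so when the vector fadd has no
// other users it is traded for a scalar fadd on the extracted operands.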
43129
43130/// Try to convert a vector reduction sequence composed of binops and shuffles
43131/// into horizontal ops.
43132static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
43133 const X86Subtarget &Subtarget) {
43134 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
43135
43136 // We need at least SSE2 to do anything here.
43137 if (!Subtarget.hasSSE2())
43138 return SDValue();
43139
43140 ISD::NodeType Opc;
43141 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
43142 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
43143 if (!Rdx)
43144 return SDValue();
43145
43146 SDValue Index = ExtElt->getOperand(1);
43147 assert(isNullConstant(Index) &&
43148 "Reduction doesn't end in an extract from index 0");
43149
43150 EVT VT = ExtElt->getValueType(0);
43151 EVT VecVT = Rdx.getValueType();
43152 if (VecVT.getScalarType() != VT)
43153 return SDValue();
43154
43155 SDLoc DL(ExtElt);
43156 unsigned NumElts = VecVT.getVectorNumElements();
43157 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
43158
43159 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
43160 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
43161 if (V.getValueType() == MVT::v4i8) {
43162 if (ZeroExtend && Subtarget.hasSSE41()) {
43163 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
43164 DAG.getConstant(0, DL, MVT::v4i32),
43165 DAG.getBitcast(MVT::i32, V),
43166 DAG.getIntPtrConstant(0, DL));
43167 return DAG.getBitcast(MVT::v16i8, V);
43168 }
43169 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
43170 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
43171 : DAG.getUNDEF(MVT::v4i8));
43172 }
43173 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
43174 DAG.getUNDEF(MVT::v8i8));
43175 };
43176
43177 // vXi8 mul reduction - promote to vXi16 mul reduction.
43178 if (Opc == ISD::MUL) {
43179 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
43180 return SDValue();
43181 if (VecVT.getSizeInBits() >= 128) {
43182 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
43183 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43184 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43185 Lo = DAG.getBitcast(WideVT, Lo);
43186 Hi = DAG.getBitcast(WideVT, Hi);
43187 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
43188 while (Rdx.getValueSizeInBits() > 128) {
43189 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43190 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
43191 }
43192 } else {
43193 Rdx = WidenToV16I8(Rdx, false);
43194 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
43195 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
43196 }
43197 if (NumElts >= 8)
43198 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43199 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43200 {4, 5, 6, 7, -1, -1, -1, -1}));
43201 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43202 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43203 {2, 3, -1, -1, -1, -1, -1, -1}));
43204 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43205 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43206 {1, -1, -1, -1, -1, -1, -1, -1}));
43207 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43208 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43209 }
43210
43211 // vXi8 add reduction - sub 128-bit vector.
43212 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
43213 Rdx = WidenToV16I8(Rdx, true);
43214 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43215 DAG.getConstant(0, DL, MVT::v16i8));
43216 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43217 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43218 }
43219
43220 // Must be a >=128-bit vector with pow2 elements.
43221 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
43222 return SDValue();
43223
43224 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
43225 if (VT == MVT::i8) {
43226 while (Rdx.getValueSizeInBits() > 128) {
43227 SDValue Lo, Hi;
43228 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43229 VecVT = Lo.getValueType();
43230 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43231 }
43232 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
43233
43234 SDValue Hi = DAG.getVectorShuffle(
43235 MVT::v16i8, DL, Rdx, Rdx,
43236 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
43237 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
43238 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43239 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
43240 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43242 }
43243
43244 // See if we can use vXi8 PSADBW add reduction for larger zext types.
43245 // If the source vector values are 0-255, then we can use PSADBW to
43246 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
43247 // TODO: See if it's worth avoiding vXi16/i32 truncations?
43248 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
43249 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
43250 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
43251 Subtarget.hasAVX512())) {
43252 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
43253 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
43254 if (ByteVT.getSizeInBits() < 128)
43255 Rdx = WidenToV16I8(Rdx, true);
43256
43257 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43258 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43259 ArrayRef<SDValue> Ops) {
43260 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43261 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
43262 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
43263 };
43264 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
43265 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
43266
43267 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
43268 while (Rdx.getValueSizeInBits() > 128) {
43269 SDValue Lo, Hi;
43270 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43271 VecVT = Lo.getValueType();
43272 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43273 }
43274 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
43275
43276 if (NumElts > 8) {
43277 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
43278 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
43279 }
43280
43281 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
43282 Rdx = DAG.getBitcast(VecVT, Rdx);
43283 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43284 }
43285
43286 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
43287 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
43288 return SDValue();
43289
43290 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
43291
43292 // 256-bit horizontal instructions operate on 128-bit chunks rather than
43293 // across the whole vector, so we need an extract + hop preliminary stage.
43294 // This is the only step where the operands of the hop are not the same value.
43295 // TODO: We could extend this to handle 512-bit or even longer vectors.
43296 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
43297 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
43298 unsigned NumElts = VecVT.getVectorNumElements();
43299 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
43300 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
43301 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
43302 VecVT = Rdx.getValueType();
43303 }
43304 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
43305 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
43306 return SDValue();
43307
43308 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
43309 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
43310 for (unsigned i = 0; i != ReductionSteps; ++i)
43311 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
43312
43313 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43314}
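The (F)HADD tail of the combine above, for a v4f32 FADD reduction (illustrative, assuming SSE3 and a target where horizontal ops are deemed profitable):

// Rdx = HADDPS Rdx, Rdx   // stage 1: {a0+a1, a2+a3, a0+a1, a2+a3}
// Rdx = HADDPS Rdx, Rdx   // stage 2: {a0+a1+a2+a3, ...}
// return extract_elt Rdx, 0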
43315
43316/// Detect vector gather/scatter index generation and convert it from being a
43317/// bunch of shuffles and extracts into a somewhat faster sequence.
43318/// For i686, the best sequence is apparently storing the value and loading
43319/// scalars back, while for x64 we should use 64-bit extracts and shifts.
43320static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
43321 TargetLowering::DAGCombinerInfo &DCI,
43322 const X86Subtarget &Subtarget) {
43323 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
43324 return NewOp;
43325
43326 SDValue InputVector = N->getOperand(0);
43327 SDValue EltIdx = N->getOperand(1);
43328 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
43329
43330 EVT SrcVT = InputVector.getValueType();
43331 EVT VT = N->getValueType(0);
43332 SDLoc dl(InputVector);
43333 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
43334 unsigned NumSrcElts = SrcVT.getVectorNumElements();
43335
43336 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
43337 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43338
43339 // Integer Constant Folding.
43340 if (CIdx && VT.isInteger()) {
43341 APInt UndefVecElts;
43342 SmallVector<APInt, 16> EltBits;
43343 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
43344 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
43345 EltBits, true, false)) {
43346 uint64_t Idx = CIdx->getZExtValue();
43347 if (UndefVecElts[Idx])
43348 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43349 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
43350 dl, VT);
43351 }
43352 }
43353
43354 if (IsPextr) {
43355 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43356 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43357 APInt::getAllOnes(VT.getSizeInBits()), DCI))
43358 return SDValue(N, 0);
43359
43360 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
43361 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
43362 InputVector.getOpcode() == X86ISD::PINSRW) &&
43363 InputVector.getOperand(2) == EltIdx) {
43364       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
43365              "Vector type mismatch");
43366 SDValue Scl = InputVector.getOperand(1);
43367 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
43368 return DAG.getZExtOrTrunc(Scl, dl, VT);
43369 }
43370
43371 // TODO - Remove this once we can handle the implicit zero-extension of
43372 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
43373 // combineBasicSADPattern.
43374 return SDValue();
43375 }
43376
43377 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
43378 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43379 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
43380 SDValue MMXSrc = InputVector.getOperand(0);
43381
43382 // The bitcast source is a direct mmx result.
43383 if (MMXSrc.getValueType() == MVT::x86mmx)
43384 return DAG.getBitcast(VT, InputVector);
43385 }
43386
43387 // Detect mmx to i32 conversion through a v2i32 elt extract.
43388 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43389 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
43390 SDValue MMXSrc = InputVector.getOperand(0);
43391
43392 // The bitcast source is a direct mmx result.
43393 if (MMXSrc.getValueType() == MVT::x86mmx)
43394 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
43395 }
43396
43397 // Check whether this extract is the root of a sum of absolute differences
43398 // pattern. This has to be done here because we really want it to happen
43399 // pre-legalization.
43400 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
43401 return SAD;
43402
43403 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
43404 return VPDPBUSD;
43405
43406 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
43407 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
43408 return Cmp;
43409
43410 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
43411 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
43412 return MinMax;
43413
43414 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
43415 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
43416 return V;
43417
43418 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
43419 return V;
43420
43421 // Attempt to extract an i1 element by using MOVMSK to extract the signbits
43422 // and then testing the relevant element.
43423 //
43424 // Note that we only combine extracts on the *same* result number, i.e.
43425 // t0 = merge_values a0, a1, a2, a3
43426 // i1 = extract_vector_elt t0, Constant:i64<2>
43427 // i1 = extract_vector_elt t0, Constant:i64<3>
43428 // but not
43429 // i1 = extract_vector_elt t0:1, Constant:i64<2>
43430 // since the latter would need its own MOVMSK.
43431 if (SrcVT.getScalarType() == MVT::i1) {
43432 bool IsVar = !CIdx;
43433 SmallVector<SDNode *, 16> BoolExtracts;
43434 unsigned ResNo = InputVector.getResNo();
43435 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
43436 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
43437 Use->getOperand(0).getResNo() == ResNo &&
43438 Use->getValueType(0) == MVT::i1) {
43439 BoolExtracts.push_back(Use);
43440 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
43441 return true;
43442 }
43443 return false;
43444 };
43445 // TODO: Can we drop the oneuse check for constant extracts?
43446 if (all_of(InputVector->uses(), IsBoolExtract) &&
43447 (IsVar || BoolExtracts.size() > 1)) {
43448 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
43449 if (SDValue BC =
43450 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
43451 for (SDNode *Use : BoolExtracts) {
43452 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
43453 // Mask = 1 << MaskIdx
43454 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
43455 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
43456 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
43457 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
43458 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
43459 DCI.CombineTo(Use, Res);
43460 }
43461 return SDValue(N, 0);
43462 }
43463 }
43464 }
43465
43466 // If this extract is from a loaded vector value and will be used as an
43467 // integer, that requires a potentially expensive XMM -> GPR transfer.
43468 // Additionally, if we can convert to a scalar integer load, that will likely
43469 // be folded into a subsequent integer op.
43470 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
43471 // to a single-use of the loaded vector. For the reasons above, we
43472 // expect this to be profitable even if it creates an extra load.
43473 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
43474 return Use->getOpcode() == ISD::STORE ||
43475 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
43476 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
43477 });
43478 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
43479 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
43480 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
43481 !LikelyUsedAsVector) {
43482 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43483 SDValue NewPtr =
43484 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
43485 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
43486 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
43487 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
43488 SDValue Load =
43489 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
43490 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
43491 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
43492 return Load;
43493 }
43494
43495 return SDValue();
43496}
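// A minimal standalone sketch (not LLVM code) of the scalar idiom produced
// for the vXi1 extracts above once combineBitcastvxi1 has packed the lane
// sign bits into an integer (MOVMSK-style). 'Bits' and 'MaskIdx' are
// hypothetical names standing in for the BC value and the extract index.
static bool extractBoolFromMovmskSketch(unsigned Bits, unsigned MaskIdx) {
  unsigned Mask = 1u << MaskIdx;    // Mask = 1 << MaskIdx
  return (Bits & Mask) == Mask;     // ((movmsk X) & Mask) == Mask
}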
43497
43498// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
43499// This is more or less the reverse of combineBitcastvxi1.
43500static SDValue combineToExtendBoolVectorInReg(
43501 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
43502 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
43503 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
43504 Opcode != ISD::ANY_EXTEND)
43505 return SDValue();
43506 if (!DCI.isBeforeLegalizeOps())
43507 return SDValue();
43508 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
43509 return SDValue();
43510
43511 EVT SVT = VT.getScalarType();
43512 EVT InSVT = N0.getValueType().getScalarType();
43513 unsigned EltSizeInBits = SVT.getSizeInBits();
43514
43515 // Input type must be extending a bool vector (bit-casted from a scalar
43516 // integer) to legal integer types.
43517 if (!VT.isVector())
43518 return SDValue();
43519 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
43520 return SDValue();
43521 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
43522 return SDValue();
43523
43524 SDValue N00 = N0.getOperand(0);
43525 EVT SclVT = N00.getValueType();
43526 if (!SclVT.isScalarInteger())
43527 return SDValue();
43528
43529 SDValue Vec;
43530 SmallVector<int> ShuffleMask;
43531 unsigned NumElts = VT.getVectorNumElements();
43532   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
43533
43534 // Broadcast the scalar integer to the vector elements.
43535 if (NumElts > EltSizeInBits) {
43536 // If the scalar integer is greater than the vector element size, then we
43537 // must split it down into sub-sections for broadcasting. For example:
43538 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
43539 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
43540     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
43541 unsigned Scale = NumElts / EltSizeInBits;
43542 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
43543 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
43544 Vec = DAG.getBitcast(VT, Vec);
43545
43546 for (unsigned i = 0; i != Scale; ++i)
43547 ShuffleMask.append(EltSizeInBits, i);
43548 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
43549 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
43550 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
43551 // If we have register broadcast instructions, use the scalar size as the
43552 // element type for the shuffle. Then cast to the wider element type. The
43553 // widened bits won't be used, and this might allow the use of a broadcast
43554 // load.
43555     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
43556 unsigned Scale = EltSizeInBits / NumElts;
43557 EVT BroadcastVT =
43558 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
43559 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
43560 ShuffleMask.append(NumElts * Scale, 0);
43561 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
43562 Vec = DAG.getBitcast(VT, Vec);
43563 } else {
43564 // For a smaller scalar integer, we can simply any-extend it to the vector
43565 // element size (we don't care about the upper bits) and broadcast it to all
43566 // elements.
43567 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
43568 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
43569 ShuffleMask.append(NumElts, 0);
43570 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
43571 }
43572
43573 // Now, mask the relevant bit in each element.
43574 SmallVector<SDValue, 32> Bits;
43575 for (unsigned i = 0; i != NumElts; ++i) {
43576 int BitIdx = (i % EltSizeInBits);
43577 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
43578 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
43579 }
43580 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
43581 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
43582
43583 // Compare against the bitmask and extend the result.
43584 EVT CCVT = VT.changeVectorElementType(MVT::i1);
43585 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
43586 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
43587
43588 // For SEXT, this is now done, otherwise shift the result down for
43589 // zero-extension.
43590 if (Opcode == ISD::SIGN_EXTEND)
43591 return Vec;
43592 return DAG.getNode(ISD::SRL, DL, VT, Vec,
43593 DAG.getConstant(EltSizeInBits - 1, DL, VT));
43594}
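// A minimal standalone sketch (not LLVM code) of the per-lane logic of the
// broadcast + mask + compare expansion above, written for an 8-bit element
// type. 'EltAfterBroadcast' stands for the lane value after the shuffle stage
// and 'LaneIdx' for the lane position i; both names are illustrative only.
static unsigned char extendBoolLaneSketch(unsigned char EltAfterBroadcast,
                                          unsigned LaneIdx, bool IsZeroExtend) {
  unsigned char Bit = (unsigned char)(1u << (LaneIdx % 8));            // BitMask element
  unsigned char Ext = ((EltAfterBroadcast & Bit) == Bit) ? 0xFF : 0x00; // setcc + sext
  return IsZeroExtend ? (unsigned char)(Ext >> 7) : Ext;               // srl for zero-extension
}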
43595
43596/// If a vector select has an operand that is -1 or 0, try to simplify the
43597/// select to a bitwise logic operation.
43598/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
43599static SDValue
43600combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
43601 TargetLowering::DAGCombinerInfo &DCI,
43602 const X86Subtarget &Subtarget) {
43603 SDValue Cond = N->getOperand(0);
43604 SDValue LHS = N->getOperand(1);
43605 SDValue RHS = N->getOperand(2);
43606 EVT VT = LHS.getValueType();
43607 EVT CondVT = Cond.getValueType();
43608 SDLoc DL(N);
43609 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43610
43611 if (N->getOpcode() != ISD::VSELECT)
43612 return SDValue();
43613
43614   assert(CondVT.isVector() && "Vector select expects a vector selector!");
43615
43616 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
43617 // TODO: Can we assert that both operands are not zeros (because that should
43618 // get simplified at node creation time)?
43619 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
43620 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
43621
43622 // If both inputs are 0/undef, create a complete zero vector.
43623 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
43624 if (TValIsAllZeros && FValIsAllZeros) {
43625 if (VT.isFloatingPoint())
43626 return DAG.getConstantFP(0.0, DL, VT);
43627 return DAG.getConstant(0, DL, VT);
43628 }
43629
43630 // To use the condition operand as a bitwise mask, it must have elements that
43631 // are the same size as the select elements. I.e., the condition operand must
43632 // have already been promoted from the IR select condition type <N x i1>.
43633 // Don't check if the types themselves are equal because that excludes
43634 // vector floating-point selects.
43635 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
43636 return SDValue();
43637
43638 // Try to invert the condition if true value is not all 1s and false value is
43639 // not all 0s. Only do this if the condition has one use.
43640 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
43641 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
43642 // Check if the selector will be produced by CMPP*/PCMP*.
43643 Cond.getOpcode() == ISD::SETCC &&
43644 // Check if SETCC has already been promoted.
43645 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
43646 CondVT) {
43647 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
43648
43649 if (TValIsAllZeros || FValIsAllOnes) {
43650 SDValue CC = Cond.getOperand(2);
43651 ISD::CondCode NewCC = ISD::getSetCCInverse(
43652 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
43653 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
43654 NewCC);
43655 std::swap(LHS, RHS);
43656 TValIsAllOnes = FValIsAllOnes;
43657 FValIsAllZeros = TValIsAllZeros;
43658 }
43659 }
43660
43661 // Cond value must be 'sign splat' to be converted to a logical op.
43662 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
43663 return SDValue();
43664
43665 // vselect Cond, 111..., 000... -> Cond
43666 if (TValIsAllOnes && FValIsAllZeros)
43667 return DAG.getBitcast(VT, Cond);
43668
43669 if (!TLI.isTypeLegal(CondVT))
43670 return SDValue();
43671
43672 // vselect Cond, 111..., X -> or Cond, X
43673 if (TValIsAllOnes) {
43674 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43675 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
43676 return DAG.getBitcast(VT, Or);
43677 }
43678
43679 // vselect Cond, X, 000... -> and Cond, X
43680 if (FValIsAllZeros) {
43681 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
43682 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
43683 return DAG.getBitcast(VT, And);
43684 }
43685
43686 // vselect Cond, 000..., X -> andn Cond, X
43687 if (TValIsAllZeros) {
43688 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43689 SDValue AndN;
43690 // The canonical form differs for i1 vectors - x86andnp is not used
43691 if (CondVT.getScalarType() == MVT::i1)
43692 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
43693 CastRHS);
43694 else
43695 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
43696 return DAG.getBitcast(VT, AndN);
43697 }
43698
43699 return SDValue();
43700}
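// A minimal standalone sketch (not LLVM code) of the bitwise identities
// behind the OR/AND/ANDN folds above; they hold because Cond was verified to
// be a "sign splat" (every lane is all-ones or all-zero).
static unsigned vselectLaneSketch(unsigned Cond, unsigned T, unsigned F) {
  // Generic lane select with an all-ones/zero mask:
  //   vselect Cond, T, F  ==  (Cond & T) | (~Cond & F)
  // T == ~0u reduces this to (Cond | F); F == 0 reduces it to (Cond & T);
  // T == 0 reduces it to (~Cond & F), i.e. the ANDNP form.
  return (Cond & T) | (~Cond & F);
}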
43701
43702/// If both arms of a vector select are concatenated vectors, split the select,
43703/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
43704/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
43705/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
43706static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
43707 const X86Subtarget &Subtarget) {
43708 unsigned Opcode = N->getOpcode();
43709 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
43710 return SDValue();
43711
43712 // TODO: Split 512-bit vectors too?
43713 EVT VT = N->getValueType(0);
43714 if (!VT.is256BitVector())
43715 return SDValue();
43716
43717 // TODO: Split as long as any 2 of the 3 operands are concatenated?
43718 SDValue Cond = N->getOperand(0);
43719 SDValue TVal = N->getOperand(1);
43720 SDValue FVal = N->getOperand(2);
43721 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
43722 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
43723 !collectConcatOps(TVal.getNode(), CatOpsT) ||
43724 !collectConcatOps(FVal.getNode(), CatOpsF))
43725 return SDValue();
43726
43727 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
43728 ArrayRef<SDValue> Ops) {
43729 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
43730 };
43731 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
43732 makeBlend, /*CheckBWI*/ false);
43733}
43734
43735static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
43736 SDValue Cond = N->getOperand(0);
43737 SDValue LHS = N->getOperand(1);
43738 SDValue RHS = N->getOperand(2);
43739 SDLoc DL(N);
43740
43741 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
43742 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
43743 if (!TrueC || !FalseC)
43744 return SDValue();
43745
43746 // Don't do this for crazy integer types.
43747 EVT VT = N->getValueType(0);
43748 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
43749 return SDValue();
43750
43751 // We're going to use the condition bit in math or logic ops. We could allow
43752 // this with a wider condition value (post-legalization it becomes an i8),
43753 // but if nothing is creating selects that late, it doesn't matter.
43754 if (Cond.getValueType() != MVT::i1)
43755 return SDValue();
43756
43757 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
43758 // 3, 5, or 9 with i32/i64, so those get transformed too.
43759 // TODO: For constants that overflow or do not differ by power-of-2 or small
43760 // multiplier, convert to 'and' + 'add'.
43761 const APInt &TrueVal = TrueC->getAPIntValue();
43762 const APInt &FalseVal = FalseC->getAPIntValue();
43763
43764 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
43765 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
43766 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
43767 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
43768 if (CC == ISD::SETEQ || CC == ISD::SETNE)
43769 return SDValue();
43770 }
43771
43772 bool OV;
43773 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
43774 if (OV)
43775 return SDValue();
43776
43777 APInt AbsDiff = Diff.abs();
43778 if (AbsDiff.isPowerOf2() ||
43779 ((VT == MVT::i32 || VT == MVT::i64) &&
43780 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
43781
43782 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
43783 // of the condition can usually be folded into a compare predicate, but even
43784 // without that, the sequence should be cheaper than a CMOV alternative.
43785 if (TrueVal.slt(FalseVal)) {
43786 Cond = DAG.getNOT(DL, Cond, MVT::i1);
43787 std::swap(TrueC, FalseC);
43788 }
43789
43790 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
43791 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
43792
43793 // Multiply condition by the difference if non-one.
43794 if (!AbsDiff.isOne())
43795 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
43796
43797 // Add the base if non-zero.
43798 if (!FalseC->isZero())
43799 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
43800
43801 return R;
43802 }
43803
43804 return SDValue();
43805}
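// A minimal standalone sketch (not LLVM code) of the scalar identity used
// above, assuming Cond is 0 or 1; the code additionally flips Cond via getNOT
// when TrueVal < FalseVal so that the multiplier is non-negative.
static long long selectOfTwoConstantsSketch(bool Cond, long long TC, long long FC) {
  // select Cond, TC, FC  -->  (zext(Cond) * (TC - FC)) + FC
  return (long long)Cond * (TC - FC) + FC;
}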
43806
43807/// If this is a *dynamic* select (non-constant condition) and we can match
43808/// this node with one of the variable blend instructions, restructure the
43809/// condition so that blends can use the high (sign) bit of each element.
43810/// This function will also call SimplifyDemandedBits on already created
43811/// BLENDV to perform additional simplifications.
43812static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
43813 TargetLowering::DAGCombinerInfo &DCI,
43814 const X86Subtarget &Subtarget) {
43815 SDValue Cond = N->getOperand(0);
43816 if ((N->getOpcode() != ISD::VSELECT &&
43817 N->getOpcode() != X86ISD::BLENDV) ||
43818 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
43819 return SDValue();
43820
43821 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43822 unsigned BitWidth = Cond.getScalarValueSizeInBits();
43823 EVT VT = N->getValueType(0);
43824
43825 // We can only handle the cases where VSELECT is directly legal on the
43826 // subtarget. We custom lower VSELECT nodes with constant conditions and
43827 // this makes it hard to see whether a dynamic VSELECT will correctly
43828 // lower, so we both check the operation's status and explicitly handle the
43829 // cases where a *dynamic* blend will fail even though a constant-condition
43830 // blend could be custom lowered.
43831 // FIXME: We should find a better way to handle this class of problems.
43832 // Potentially, we should combine constant-condition vselect nodes
43833 // pre-legalization into shuffles and not mark as many types as custom
43834 // lowered.
43835 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
43836 return SDValue();
43837 // FIXME: We don't support i16-element blends currently. We could and
43838 // should support them by making *all* the bits in the condition be set
43839 // rather than just the high bit and using an i8-element blend.
43840 if (VT.getVectorElementType() == MVT::i16)
43841 return SDValue();
43842 // Dynamic blending was only available from SSE4.1 onward.
43843 if (VT.is128BitVector() && !Subtarget.hasSSE41())
43844 return SDValue();
43845 // Byte blends are only available in AVX2
43846 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
43847 return SDValue();
43848 // There are no 512-bit blend instructions that use sign bits.
43849 if (VT.is512BitVector())
43850 return SDValue();
43851
43852 // Don't optimize before the condition has been transformed to a legal type
43853 // and don't ever optimize vector selects that map to AVX512 mask-registers.
43854 if (BitWidth < 8 || BitWidth > 64)
43855 return SDValue();
43856
43857 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
43858 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
43859 UI != UE; ++UI)
43860 if ((UI->getOpcode() != ISD::VSELECT &&
43861 UI->getOpcode() != X86ISD::BLENDV) ||
43862 UI.getOperandNo() != 0)
43863 return false;
43864
43865 return true;
43866 };
43867
43868 APInt DemandedBits(APInt::getSignMask(BitWidth));
43869
43870 if (OnlyUsedAsSelectCond(Cond)) {
43871 KnownBits Known;
43872 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
43873 !DCI.isBeforeLegalizeOps());
43874 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
43875 return SDValue();
43876
43877 // If we changed the computation somewhere in the DAG, this change will
43878 // affect all users of Cond. Update all the nodes so that we do not use
43879 // the generic VSELECT anymore. Otherwise, we may perform wrong
43880 // optimizations as we messed with the actual expectation for the vector
43881 // boolean values.
43882 for (SDNode *U : Cond->uses()) {
43883 if (U->getOpcode() == X86ISD::BLENDV)
43884 continue;
43885
43886 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
43887 Cond, U->getOperand(1), U->getOperand(2));
43888 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
43889 DCI.AddToWorklist(U);
43890 }
43891 DCI.CommitTargetLoweringOpt(TLO);
43892 return SDValue(N, 0);
43893 }
43894
43895 // Otherwise we can still at least try to simplify multiple use bits.
43896 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
43897 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
43898 N->getOperand(1), N->getOperand(2));
43899
43900 return SDValue();
43901}
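// A minimal standalone sketch (not LLVM code) of the per-element behaviour
// assumed here for X86ISD::BLENDV (PBLENDVB-style): only the sign bit of each
// mask element picks between the two value operands, which is why the combine
// above runs SimplifyDemandedBits with just the sign bit demanded.
static unsigned char blendvLaneSketch(unsigned char MaskByte, unsigned char T,
                                      unsigned char F) {
  return (MaskByte & 0x80) ? T : F; // sign bit set -> take the "true" operand
}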
43902
43903// Try to match:
43904 // (or (and M, (sub 0, X)), (pandn M, X))
43905// which is a special case of:
43906// (select M, (sub 0, X), X)
43907// Per:
43908// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
43909// We know that, if fNegate is 0 or 1:
43910// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
43911//
43912// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
43913// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
43914// ( M ? -X : X) == ((X ^ M ) + (M & 1))
43915// This lets us transform our vselect to:
43916// (add (xor X, M), (and M, 1))
43917// And further to:
43918// (sub (xor X, M), M)
43919static SDValue combineLogicBlendIntoConditionalNegate(
43920 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
43921 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
43922 EVT MaskVT = Mask.getValueType();
43923   assert(MaskVT.isInteger() &&
43924          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
43925          "Mask must be zero/all-bits");
43926
43927 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
43928 return SDValue();
43929 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
43930 return SDValue();
43931
43932 auto IsNegV = [](SDNode *N, SDValue V) {
43933 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
43934 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
43935 };
43936
43937 SDValue V;
43938 if (IsNegV(Y.getNode(), X))
43939 V = X;
43940 else if (IsNegV(X.getNode(), Y))
43941 V = Y;
43942 else
43943 return SDValue();
43944
43945 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
43946 SDValue SubOp2 = Mask;
43947
43948 // If the negate was on the false side of the select, then
43949 // the operands of the SUB need to be swapped. PR 27251.
43950 // This is because the pattern being matched above is
43951 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
43952 // but if the pattern matched was
43953 // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
43954 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
43955 // pattern also needs to be a negation of the replacement pattern above.
43956 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
43957 // sub accomplishes the negation of the replacement pattern.
43958 if (V == Y)
43959 std::swap(SubOp1, SubOp2);
43960
43961 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
43962 return DAG.getBitcast(VT, Res);
43963}
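// A minimal standalone sketch (not LLVM code) of the scalar form of the
// conditional-negate identity used above, assuming M is either 0 or all-ones
// as the mask assertion requires.
static int conditionalNegateSketch(int X, int M) {
  // (M ? -X : X) == (X ^ M) - M   when M is 0 or -1.
  return (X ^ M) - M;
}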
43964
43965/// Do target-specific dag combines on SELECT and VSELECT nodes.
43966static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
43967 TargetLowering::DAGCombinerInfo &DCI,
43968 const X86Subtarget &Subtarget) {
43969 SDLoc DL(N);
43970 SDValue Cond = N->getOperand(0);
43971 SDValue LHS = N->getOperand(1);
43972 SDValue RHS = N->getOperand(2);
43973
43974 // Try simplification again because we use this function to optimize
43975 // BLENDV nodes that are not handled by the generic combiner.
43976 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
43977 return V;
43978
43979 EVT VT = LHS.getValueType();
43980 EVT CondVT = Cond.getValueType();
43981 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43982 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
43983
43984 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
43985 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
43986 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
43987 if (CondVT.isVector() && CondVT.isInteger() &&
43988 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
43989 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
43990 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
43991 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
43992 DL, DAG, Subtarget))
43993 return V;
43994
43995 // Convert vselects with constant condition into shuffles.
43996 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
43997 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
43998 SmallVector<int, 64> Mask;
43999 if (createShuffleMaskFromVSELECT(Mask, Cond,
44000 N->getOpcode() == X86ISD::BLENDV))
44001 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
44002 }
44003
44004 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
44005 // by forcing the unselected elements to zero.
44006 // TODO: Can we handle more shuffles with this?
44007 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
44008 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
44009 LHS.hasOneUse() && RHS.hasOneUse()) {
44010 MVT SimpleVT = VT.getSimpleVT();
44011 SmallVector<SDValue, 1> LHSOps, RHSOps;
44012 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
44013 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
44014 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
44015 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
44016 int NumElts = VT.getVectorNumElements();
44017 for (int i = 0; i != NumElts; ++i) {
44018 // getConstVector sets negative shuffle mask values as undef, so ensure
44019 // we hardcode SM_SentinelZero values to zero (0x80).
44020 if (CondMask[i] < NumElts) {
44021 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
44022 RHSMask[i] = 0x80;
44023 } else {
44024 LHSMask[i] = 0x80;
44025 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
44026 }
44027 }
44028 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
44029 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
44030 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
44031 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
44032 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
44033 }
44034 }
44035
44036 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
44037 // instructions match the semantics of the common C idiom x<y?x:y but not
44038 // x<=y?x:y, because of how they handle negative zero (which can be
44039 // ignored in unsafe-math mode).
44040 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
44041 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
44042 VT != MVT::f80 && VT != MVT::f128 &&
44043 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
44044 (Subtarget.hasSSE2() ||
44045 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
44046 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44047
44048 unsigned Opcode = 0;
44049 // Check for x CC y ? x : y.
44050 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
44051 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
44052 switch (CC) {
44053 default: break;
44054 case ISD::SETULT:
44055 // Converting this to a min would handle NaNs incorrectly, and swapping
44056 // the operands would cause it to handle comparisons between positive
44057 // and negative zero incorrectly.
44058 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44059 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44060 !(DAG.isKnownNeverZeroFloat(LHS) ||
44061 DAG.isKnownNeverZeroFloat(RHS)))
44062 break;
44063 std::swap(LHS, RHS);
44064 }
44065 Opcode = X86ISD::FMIN;
44066 break;
44067 case ISD::SETOLE:
44068 // Converting this to a min would handle comparisons between positive
44069 // and negative zero incorrectly.
44070 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44071 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44072 break;
44073 Opcode = X86ISD::FMIN;
44074 break;
44075 case ISD::SETULE:
44076 // Converting this to a min would handle both negative zeros and NaNs
44077 // incorrectly, but we can swap the operands to fix both.
44078 std::swap(LHS, RHS);
44079       LLVM_FALLTHROUGH;
44080 case ISD::SETOLT:
44081 case ISD::SETLT:
44082 case ISD::SETLE:
44083 Opcode = X86ISD::FMIN;
44084 break;
44085
44086 case ISD::SETOGE:
44087 // Converting this to a max would handle comparisons between positive
44088 // and negative zero incorrectly.
44089 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44090 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44091 break;
44092 Opcode = X86ISD::FMAX;
44093 break;
44094 case ISD::SETUGT:
44095 // Converting this to a max would handle NaNs incorrectly, and swapping
44096 // the operands would cause it to handle comparisons between positive
44097 // and negative zero incorrectly.
44098 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44099 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44100 !(DAG.isKnownNeverZeroFloat(LHS) ||
44101 DAG.isKnownNeverZeroFloat(RHS)))
44102 break;
44103 std::swap(LHS, RHS);
44104 }
44105 Opcode = X86ISD::FMAX;
44106 break;
44107 case ISD::SETUGE:
44108 // Converting this to a max would handle both negative zeros and NaNs
44109 // incorrectly, but we can swap the operands to fix both.
44110 std::swap(LHS, RHS);
44111       LLVM_FALLTHROUGH;
44112 case ISD::SETOGT:
44113 case ISD::SETGT:
44114 case ISD::SETGE:
44115 Opcode = X86ISD::FMAX;
44116 break;
44117 }
44118 // Check for x CC y ? y : x -- a min/max with reversed arms.
44119 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
44120 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
44121 switch (CC) {
44122 default: break;
44123 case ISD::SETOGE:
44124 // Converting this to a min would handle comparisons between positive
44125 // and negative zero incorrectly, and swapping the operands would
44126 // cause it to handle NaNs incorrectly.
44127 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44128 !(DAG.isKnownNeverZeroFloat(LHS) ||
44129 DAG.isKnownNeverZeroFloat(RHS))) {
44130 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44131 break;
44132 std::swap(LHS, RHS);
44133 }
44134 Opcode = X86ISD::FMIN;
44135 break;
44136 case ISD::SETUGT:
44137 // Converting this to a min would handle NaNs incorrectly.
44138 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44139 break;
44140 Opcode = X86ISD::FMIN;
44141 break;
44142 case ISD::SETUGE:
44143 // Converting this to a min would handle both negative zeros and NaNs
44144 // incorrectly, but we can swap the operands to fix both.
44145 std::swap(LHS, RHS);
44146       LLVM_FALLTHROUGH;
44147 case ISD::SETOGT:
44148 case ISD::SETGT:
44149 case ISD::SETGE:
44150 Opcode = X86ISD::FMIN;
44151 break;
44152
44153 case ISD::SETULT:
44154 // Converting this to a max would handle NaNs incorrectly.
44155 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44156 break;
44157 Opcode = X86ISD::FMAX;
44158 break;
44159 case ISD::SETOLE:
44160 // Converting this to a max would handle comparisons between positive
44161 // and negative zero incorrectly, and swapping the operands would
44162 // cause it to handle NaNs incorrectly.
44163 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44164 !DAG.isKnownNeverZeroFloat(LHS) &&
44165 !DAG.isKnownNeverZeroFloat(RHS)) {
44166 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44167 break;
44168 std::swap(LHS, RHS);
44169 }
44170 Opcode = X86ISD::FMAX;
44171 break;
44172 case ISD::SETULE:
44173 // Converting this to a max would handle both negative zeros and NaNs
44174 // incorrectly, but we can swap the operands to fix both.
44175 std::swap(LHS, RHS);
44176       LLVM_FALLTHROUGH;
44177 case ISD::SETOLT:
44178 case ISD::SETLT:
44179 case ISD::SETLE:
44180 Opcode = X86ISD::FMAX;
44181 break;
44182 }
44183 }
44184
44185 if (Opcode)
44186 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
44187 }
44188
44189 // Some mask scalar intrinsics rely on checking if only one bit is set
44190 // and implement it in C code like this:
44191 // A[0] = (U & 1) ? A[0] : W[0];
44192 // This creates some redundant instructions that break pattern matching.
44193 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
44194 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
44195 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
44196 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44197 SDValue AndNode = Cond.getOperand(0);
44198 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
44199 isNullConstant(Cond.getOperand(1)) &&
44200 isOneConstant(AndNode.getOperand(1))) {
44201 // LHS and RHS swapped due to
44202 // setcc outputting 1 when AND resulted in 0 and vice versa.
44203 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
44204 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
44205 }
44206 }
44207
44208 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
44209 // lowering on KNL. In this case we convert it to
44210 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
44211 // The same applies to all vectors of i8 and i16 without BWI.
44212 // Make sure we extend these even before type legalization gets a chance to
44213 // split wide vectors.
44214 // Since SKX these selects have a proper lowering.
44215 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
44216 CondVT.getVectorElementType() == MVT::i1 &&
44217 (VT.getVectorElementType() == MVT::i8 ||
44218 VT.getVectorElementType() == MVT::i16)) {
44219 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
44220 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
44221 }
44222
44223 // AVX512 - Extend select with zero to merge with target shuffle.
44224 // select(mask, extract_subvector(shuffle(x)), zero) -->
44225 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
44226 // TODO - support non target shuffles as well.
44227 if (Subtarget.hasAVX512() && CondVT.isVector() &&
44228 CondVT.getVectorElementType() == MVT::i1) {
44229 auto SelectableOp = [&TLI](SDValue Op) {
44230 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44231 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
44232 isNullConstant(Op.getOperand(1)) &&
44233 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
44234 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
44235 };
44236
44237 bool SelectableLHS = SelectableOp(LHS);
44238 bool SelectableRHS = SelectableOp(RHS);
44239 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
44240 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
44241
44242 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
44243 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
44244 : RHS.getOperand(0).getValueType();
44245 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
44246 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
44247 VT.getSizeInBits());
44248 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
44249 VT.getSizeInBits());
44250 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
44251 DAG.getUNDEF(SrcCondVT), Cond,
44252 DAG.getIntPtrConstant(0, DL));
44253 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
44254 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
44255 }
44256 }
44257
44258 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
44259 return V;
44260
44261 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
44262 Cond.hasOneUse()) {
44263 EVT CondVT = Cond.getValueType();
44264 SDValue Cond0 = Cond.getOperand(0);
44265 SDValue Cond1 = Cond.getOperand(1);
44266 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44267
44268 // Canonicalize min/max:
44269 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
44270 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
44271 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
44272 // the need for an extra compare against zero. e.g.
44273 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
44274 // subl %esi, %edi
44275 // testl %edi, %edi
44276 // movl $0, %eax
44277 // cmovgl %edi, %eax
44278 // =>
44279 // xorl %eax, %eax
44280 // subl %esi, %edi
44281 // cmovsl %eax, %edi
44282 //
44283 // We can also canonicalize
44284 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
44285 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
44286 // This allows the use of a test instruction for the compare.
44287 if (LHS == Cond0 && RHS == Cond1) {
44288 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
44289 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
44290 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
44291 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44292 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44293 }
44294 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
44295 ISD::CondCode NewCC = ISD::SETUGE;
44296 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44297 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44298 }
44299 }
44300
44301 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
44302 // fold eq + gt/lt nested selects into ge/le selects
44303 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
44304 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
44305 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
44306 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
44307 // .. etc ..
44308 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
44309 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
44310 SDValue InnerSetCC = RHS.getOperand(0);
44311 ISD::CondCode InnerCC =
44312 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
44313 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
44314 Cond0 == InnerSetCC.getOperand(0) &&
44315 Cond1 == InnerSetCC.getOperand(1)) {
44316 ISD::CondCode NewCC;
44317 switch (CC == ISD::SETEQ ? InnerCC : CC) {
44318 case ISD::SETGT: NewCC = ISD::SETGE; break;
44319 case ISD::SETLT: NewCC = ISD::SETLE; break;
44320 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
44321 case ISD::SETULT: NewCC = ISD::SETULE; break;
44322 default: NewCC = ISD::SETCC_INVALID; break;
44323 }
44324 if (NewCC != ISD::SETCC_INVALID) {
44325 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
44326 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
44327 }
44328 }
44329 }
44330 }
44331
44332 // Check if the first operand is all zeros and Cond type is vXi1.
44334 // If this is an AVX512 target, we can improve the use of zero masking by
44334 // swapping the operands and inverting the condition.
44335 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
44336 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
44337 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
44338 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
44339 // Invert the cond to not(cond) : xor(op,allones)=not(op)
44340 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
44341 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
44342 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
44343 }
44344
44345 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
44346 // get split by legalization.
44347 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
44348 CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
44349 TLI.isTypeLegal(VT.getScalarType())) {
44350 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
44351 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
44352 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
44353 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
44354 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
44355 }
44356 }
44357
44358 // Early exit check
44359 if (!TLI.isTypeLegal(VT))
44360 return SDValue();
44361
44362 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
44363 return V;
44364
44365 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
44366 return V;
44367
44368 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
44369 return V;
44370
44371 // select(~Cond, X, Y) -> select(Cond, Y, X)
44372 if (CondVT.getScalarType() != MVT::i1) {
44373 if (SDValue CondNot = IsNOT(Cond, DAG))
44374 return DAG.getNode(N->getOpcode(), DL, VT,
44375 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
44376 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
44377 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
44378 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
44379 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
44380 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
44381 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
44382 }
44383 }
44384
44385 // Try to optimize vXi1 selects if both operands are either all constants or
44386 // bitcasts from scalar integer type. In that case we can convert the operands
44387 // to integer and use an integer select which will be converted to a CMOV.
44388 // We need to take a little bit of care to avoid creating an i64 type after
44389 // type legalization.
44390 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
44391 VT.getVectorElementType() == MVT::i1 &&
44392 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
44393 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
44394 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
44395 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
44396
44397 if ((LHSIsConst ||
44398 (LHS.getOpcode() == ISD::BITCAST &&
44399 LHS.getOperand(0).getValueType() == IntVT)) &&
44400 (RHSIsConst ||
44401 (RHS.getOpcode() == ISD::BITCAST &&
44402 RHS.getOperand(0).getValueType() == IntVT))) {
44403 if (LHSIsConst)
44404 LHS = combinevXi1ConstantToInteger(LHS, DAG);
44405 else
44406 LHS = LHS.getOperand(0);
44407
44408 if (RHSIsConst)
44409 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44410 else
44411 RHS = RHS.getOperand(0);
44412
44413 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
44414 return DAG.getBitcast(VT, Select);
44415 }
44416 }
44417
44418 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
44419 // single bits, then invert the predicate and swap the select operands.
44420 // This can lower using a vector shift bit-hack rather than mask and compare.
44421 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
44422 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
44423 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
44424 Cond.getOperand(0).getOpcode() == ISD::AND &&
44425 isNullOrNullSplat(Cond.getOperand(1)) &&
44426 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
44427 Cond.getOperand(0).getValueType() == VT) {
44428 // The 'and' mask must be composed of power-of-2 constants.
44429 SDValue And = Cond.getOperand(0);
44430 auto *C = isConstOrConstSplat(And.getOperand(1));
44431 if (C && C->getAPIntValue().isPowerOf2()) {
44432 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
44433 SDValue NotCond =
44434 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
44435 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
44436 }
44437
44438 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
44439 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
44440 // 16-bit lacks a proper blendv.
44441 unsigned EltBitWidth = VT.getScalarSizeInBits();
44442 bool CanShiftBlend =
44443 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
44444 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
44445 (Subtarget.hasXOP()));
44446 if (CanShiftBlend &&
44447 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
44448 return C->getAPIntValue().isPowerOf2();
44449 })) {
44450 // Create a left-shift constant to get the mask bits over to the sign-bit.
44451 SDValue Mask = And.getOperand(1);
44452 SmallVector<int, 32> ShlVals;
44453 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
44454 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
44455 ShlVals.push_back(EltBitWidth - 1 -
44456 MaskVal->getAPIntValue().exactLogBase2());
44457 }
44458 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
44459 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
44460 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
44461 SDValue NewCond =
44462 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
44463 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
44464 }
44465 }
44466
44467 return SDValue();
44468}
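// A minimal standalone sketch (not LLVM code) of the scalar semantics assumed
// for X86ISD::FMIN in the SSE min/max matching inside combineSelect above
// (mirroring MINPS/MINSS): the second operand is returned whenever the
// ordered less-than test fails, including for NaNs and for +0.0 vs -0.0,
// which is why the code is so careful about operand order and swapping.
static float x86FminSketch(float A, float B) {
  return A < B ? A : B; // returns B for unordered (NaN) inputs and for (+0.0, -0.0)
}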
44469
44470/// Combine:
44471/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
44472/// to:
44473/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
44474/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
44475/// Note that this is only legal for some op/cc combinations.
44476static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
44477 SelectionDAG &DAG,
44478 const X86Subtarget &Subtarget) {
44479 // This combine only operates on CMP-like nodes.
44480 if (!(Cmp.getOpcode() == X86ISD::CMP ||
44481 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
44482 return SDValue();
44483
44484 // Can't replace the cmp if it has more uses than the one we're looking at.
44485 // FIXME: We would like to be able to handle this, but would need to make sure
44486 // all uses were updated.
44487 if (!Cmp.hasOneUse())
44488 return SDValue();
44489
44490 // This only applies to variations of the common case:
44491 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
44492 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
44493 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
44494 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
44495 // Using the proper condcodes (see below), overflow is checked for.
44496
44497 // FIXME: We can generalize both constraints:
44498 // - XOR/OR/AND (if they were made to survive AtomicExpand)
44499 // - LHS != 1
44500 // if the result is compared.
44501
44502 SDValue CmpLHS = Cmp.getOperand(0);
44503 SDValue CmpRHS = Cmp.getOperand(1);
44504 EVT CmpVT = CmpLHS.getValueType();
44505
44506 if (!CmpLHS.hasOneUse())
44507 return SDValue();
44508
44509 unsigned Opc = CmpLHS.getOpcode();
44510 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
44511 return SDValue();
44512
44513 SDValue OpRHS = CmpLHS.getOperand(2);
44514 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
44515 if (!OpRHSC)
44516 return SDValue();
44517
44518 APInt Addend = OpRHSC->getAPIntValue();
44519 if (Opc == ISD::ATOMIC_LOAD_SUB)
44520 Addend = -Addend;
44521
44522 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
44523 if (!CmpRHSC)
44524 return SDValue();
44525
44526 APInt Comparison = CmpRHSC->getAPIntValue();
44527 APInt NegAddend = -Addend;
44528
44529 // See if we can adjust the CC to make the comparison match the negated
44530 // addend.
44531 if (Comparison != NegAddend) {
44532 APInt IncComparison = Comparison + 1;
44533 if (IncComparison == NegAddend) {
44534 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
44535 Comparison = IncComparison;
44536 CC = X86::COND_AE;
44537 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
44538 Comparison = IncComparison;
44539 CC = X86::COND_L;
44540 }
44541 }
44542 APInt DecComparison = Comparison - 1;
44543 if (DecComparison == NegAddend) {
44544 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
44545 Comparison = DecComparison;
44546 CC = X86::COND_A;
44547 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
44548 Comparison = DecComparison;
44549 CC = X86::COND_LE;
44550 }
44551 }
44552 }
44553
44554 // If the addend is the negation of the comparison value, then we can do
44555 // a full comparison by emitting the atomic arithmetic as a locked sub.
44556 if (Comparison == NegAddend) {
44557 // The CC is fine, but we need to rewrite the LHS of the comparison as an
44558 // atomic sub.
44559 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
44560 auto AtomicSub = DAG.getAtomic(
44561 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
44562 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
44563 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
44564 AN->getMemOperand());
44565 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
44566 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44567 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44568 return LockOp;
44569 }
44570
44571 // We can handle comparisons with zero in a number of cases by manipulating
44572 // the CC used.
44573 if (!Comparison.isZero())
44574 return SDValue();
44575
44576 if (CC == X86::COND_S && Addend == 1)
44577 CC = X86::COND_LE;
44578 else if (CC == X86::COND_NS && Addend == 1)
44579 CC = X86::COND_G;
44580 else if (CC == X86::COND_G && Addend == -1)
44581 CC = X86::COND_GE;
44582 else if (CC == X86::COND_LE && Addend == -1)
44583 CC = X86::COND_L;
44584 else
44585 return SDValue();
44586
44587 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
44588 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44589 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44590 return LockOp;
44591}
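// A minimal standalone sketch (not LLVM code) of the unsigned half of the
// strict/non-strict rewrites used above when nudging Comparison toward
// -Addend; the signed COND_L/COND_LE cases follow the same pattern with
// signed bounds.
static bool isAboveSketch(unsigned A, unsigned C)        { return A > C;  } // COND_A
static bool isAboveOrEqualSketch(unsigned A, unsigned C) { return A >= C; } // COND_AE
// For C != UINT_MAX:  isAboveSketch(A, C)        == isAboveOrEqualSketch(A, C + 1)
// For C != 0:         isAboveOrEqualSketch(A, C) == isAboveSketch(A, C - 1)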
44592
44593// Check whether a boolean test is testing a boolean value generated by
44594// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
44595// code.
44596//
44597// Simplify the following patterns:
44598// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
44599// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
44600// to (Op EFLAGS Cond)
44601//
44602// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
44603// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
44604// to (Op EFLAGS !Cond)
44605//
44606// where Op could be BRCOND or CMOV.
44607//
44608static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
44609 // This combine only operates on CMP-like nodes.
44610 if (!(Cmp.getOpcode() == X86ISD::CMP ||
44611 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
44612 return SDValue();
44613
44614 // Quit if not used as a boolean value.
44615 if (CC != X86::COND_E && CC != X86::COND_NE)
44616 return SDValue();
44617
44618 // Check CMP operands. One of them should be 0 or 1 and the other should be
44619 // a SetCC or extended from it.
44620 SDValue Op1 = Cmp.getOperand(0);
44621 SDValue Op2 = Cmp.getOperand(1);
44622
44623 SDValue SetCC;
44624 const ConstantSDNode* C = nullptr;
44625 bool needOppositeCond = (CC == X86::COND_E);
44626 bool checkAgainstTrue = false; // Is it a comparison against 1?
44627
44628 if ((C = dyn_cast<ConstantSDNode>(Op1)))
44629 SetCC = Op2;
44630 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
44631 SetCC = Op1;
44632 else // Quit if neither operand is a constant.
44633 return SDValue();
44634
44635 if (C->getZExtValue() == 1) {
44636 needOppositeCond = !needOppositeCond;
44637 checkAgainstTrue = true;
44638 } else if (C->getZExtValue() != 0)
44639 // Quit if the constant is neither 0 nor 1.
44640 return SDValue();
44641
44642 bool truncatedToBoolWithAnd = false;
44643 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
44644 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
44645 SetCC.getOpcode() == ISD::TRUNCATE ||
44646 SetCC.getOpcode() == ISD::AND) {
44647 if (SetCC.getOpcode() == ISD::AND) {
44648 int OpIdx = -1;
44649 if (isOneConstant(SetCC.getOperand(0)))
44650 OpIdx = 1;
44651 if (isOneConstant(SetCC.getOperand(1)))
44652 OpIdx = 0;
44653 if (OpIdx < 0)
44654 break;
44655 SetCC = SetCC.getOperand(OpIdx);
44656 truncatedToBoolWithAnd = true;
44657 } else
44658 SetCC = SetCC.getOperand(0);
44659 }
44660
44661 switch (SetCC.getOpcode()) {
44662 case X86ISD::SETCC_CARRY:
44663 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
44664 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
44665 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
44666 // truncated to i1 using 'and'.
44667 if (checkAgainstTrue && !truncatedToBoolWithAnd)
44668 break;
44669 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
44670 "Invalid use of SETCC_CARRY!");
44671 LLVM_FALLTHROUGH;
44672 case X86ISD::SETCC:
44673 // Set the condition code or opposite one if necessary.
44674 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
44675 if (needOppositeCond)
44676 CC = X86::GetOppositeBranchCondition(CC);
44677 return SetCC.getOperand(1);
44678 case X86ISD::CMOV: {
44679 // Check whether false/true value has canonical one, i.e. 0 or 1.
44680 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
44681 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
44682 // Quit if true value is not a constant.
44683 if (!TVal)
44684 return SDValue();
44685 // Quit if false value is not a constant.
44686 if (!FVal) {
44687 SDValue Op = SetCC.getOperand(0);
44688 // Skip 'zext' or 'trunc' node.
44689 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
44690 Op.getOpcode() == ISD::TRUNCATE)
44691 Op = Op.getOperand(0);
44692 // A special case for rdrand/rdseed, where 0 is set if false cond is
44693 // found.
44694 if ((Op.getOpcode() != X86ISD::RDRAND &&
44695 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
44696 return SDValue();
44697 }
44698 // Quit if false value is not the constant 0 or 1.
44699 bool FValIsFalse = true;
44700 if (FVal && FVal->getZExtValue() != 0) {
44701 if (FVal->getZExtValue() != 1)
44702 return SDValue();
44703 // If FVal is 1, opposite cond is needed.
44704 needOppositeCond = !needOppositeCond;
44705 FValIsFalse = false;
44706 }
44707 // Quit if TVal is not the constant opposite of FVal.
44708 if (FValIsFalse && TVal->getZExtValue() != 1)
44709 return SDValue();
44710 if (!FValIsFalse && TVal->getZExtValue() != 0)
44711 return SDValue();
44712 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
44713 if (needOppositeCond)
44714 CC = X86::GetOppositeBranchCondition(CC);
44715 return SetCC.getOperand(3);
44716 }
44717 }
44718
44719 return SDValue();
44720}
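A hedged source-level sketch of how a (CMP (SETCC ...), 0/1) pattern can reach this helper: a boolean produced by one comparison is compared against 0/1 again before feeding a branch or CMOV. Whether earlier passes already fold it depends on the pipeline; this is illustrative only.

// Illustrative only: the fold above lets the consumer test the EFLAGS of the
// original compare directly, with Cond or !Cond, instead of re-testing the
// materialized boolean.
int selectOnRetest(int X, int Y, int A, int B) {
  bool Flag = (X < Y);            // materializes a SETCC
  return (Flag == true) ? A : B;  // re-tests the boolean against 1
}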
44721
44722/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
44723/// Match:
44724/// (X86or (X86setcc) (X86setcc))
44725/// (X86cmp (and (X86setcc) (X86setcc)), 0)
44726static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
44727 X86::CondCode &CC1, SDValue &Flags,
44728 bool &isAnd) {
44729 if (Cond->getOpcode() == X86ISD::CMP) {
44730 if (!isNullConstant(Cond->getOperand(1)))
44731 return false;
44732
44733 Cond = Cond->getOperand(0);
44734 }
44735
44736 isAnd = false;
44737
44738 SDValue SetCC0, SetCC1;
44739 switch (Cond->getOpcode()) {
44740 default: return false;
44741 case ISD::AND:
44742 case X86ISD::AND:
44743 isAnd = true;
44744 LLVM_FALLTHROUGH;
44745 case ISD::OR:
44746 case X86ISD::OR:
44747 SetCC0 = Cond->getOperand(0);
44748 SetCC1 = Cond->getOperand(1);
44749 break;
44750 };
44751
44752 // Make sure we have SETCC nodes, using the same flags value.
44753 if (SetCC0.getOpcode() != X86ISD::SETCC ||
44754 SetCC1.getOpcode() != X86ISD::SETCC ||
44755 SetCC0->getOperand(1) != SetCC1->getOperand(1))
44756 return false;
44757
44758 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
44759 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
44760 Flags = SetCC0->getOperand(1);
44761 return true;
44762}
44763
44764// When legalizing carry, we create carries via add X, -1
44765// If that comes from an actual carry, via setcc, we use the
44766// carry directly.
44767static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
44768 if (EFLAGS.getOpcode() == X86ISD::ADD) {
44769 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
44770 bool FoundAndLSB = false;
44771 SDValue Carry = EFLAGS.getOperand(0);
44772 while (Carry.getOpcode() == ISD::TRUNCATE ||
44773 Carry.getOpcode() == ISD::ZERO_EXTEND ||
44774 (Carry.getOpcode() == ISD::AND &&
44775 isOneConstant(Carry.getOperand(1)))) {
44776 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
44777 Carry = Carry.getOperand(0);
44778 }
44779 if (Carry.getOpcode() == X86ISD::SETCC ||
44780 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
44781 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
44782 uint64_t CarryCC = Carry.getConstantOperandVal(0);
44783 SDValue CarryOp1 = Carry.getOperand(1);
44784 if (CarryCC == X86::COND_B)
44785 return CarryOp1;
44786 if (CarryCC == X86::COND_A) {
44787 // Try to convert COND_A into COND_B in an attempt to facilitate
44788 // materializing "setb reg".
44789 //
44790 // Do not flip "e > c", where "c" is a constant, because Cmp
44791 // instruction cannot take an immediate as its first operand.
44792 //
44793 if (CarryOp1.getOpcode() == X86ISD::SUB &&
44794 CarryOp1.getNode()->hasOneUse() &&
44795 CarryOp1.getValueType().isInteger() &&
44796 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
44797 SDValue SubCommute =
44798 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
44799 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
44800 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
44801 }
44802 }
44803 // If this is a check of the z flag of an add with 1, switch to the
44804 // C flag.
44805 if (CarryCC == X86::COND_E &&
44806 CarryOp1.getOpcode() == X86ISD::ADD &&
44807 isOneConstant(CarryOp1.getOperand(1)))
44808 return CarryOp1;
44809 } else if (FoundAndLSB) {
44810 SDLoc DL(Carry);
44811 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
44812 if (Carry.getOpcode() == ISD::SRL) {
44813 BitNo = Carry.getOperand(1);
44814 Carry = Carry.getOperand(0);
44815 }
44816 return getBT(Carry, BitNo, DL, DAG);
44817 }
44818 }
44819 }
44820
44821 return SDValue();
44822}
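A small sketch, under the assumption that carry legalization produces the (add setcc, -1) shape described above, of a source pattern whose carry bit can be routed straight from EFLAGS to its consumer:

// Illustrative only: the carry-out of the low addition is computed with an
// unsigned compare (a SETCC of COND_B) and then added into the high word;
// that is the kind of carry the combine above can feed to an ADC-style
// consumer instead of re-materializing it as a 0/1 value.
unsigned long long addCarryInto(unsigned long long ALo, unsigned long long BLo,
                                unsigned long long &Hi) {
  unsigned long long Lo = ALo + BLo;
  bool Carry = Lo < ALo; // carry-out of the low half
  Hi += Carry;           // carry consumed by the high half
  return Lo;
}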
44823
44824 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
44825/// to avoid the inversion.
44826static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
44827 SelectionDAG &DAG,
44828 const X86Subtarget &Subtarget) {
44829 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
44830 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
44831 EFLAGS.getOpcode() != X86ISD::TESTP)
44832 return SDValue();
44833
44834 // PTEST/TESTP sets EFLAGS as:
44835 // TESTZ: ZF = (Op0 & Op1) == 0
44836 // TESTC: CF = (~Op0 & Op1) == 0
44837 // TESTNZC: ZF == 0 && CF == 0
44838 EVT VT = EFLAGS.getValueType();
44839 SDValue Op0 = EFLAGS.getOperand(0);
44840 SDValue Op1 = EFLAGS.getOperand(1);
44841 EVT OpVT = Op0.getValueType();
44842
44843 // TEST*(~X,Y) == TEST*(X,Y)
44844 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
44845 X86::CondCode InvCC;
44846 switch (CC) {
44847 case X86::COND_B:
44848 // testc -> testz.
44849 InvCC = X86::COND_E;
44850 break;
44851 case X86::COND_AE:
44852 // !testc -> !testz.
44853 InvCC = X86::COND_NE;
44854 break;
44855 case X86::COND_E:
44856 // testz -> testc.
44857 InvCC = X86::COND_B;
44858 break;
44859 case X86::COND_NE:
44860 // !testz -> !testc.
44861 InvCC = X86::COND_AE;
44862 break;
44863 case X86::COND_A:
44864 case X86::COND_BE:
44865 // testnzc -> testnzc (no change).
44866 InvCC = CC;
44867 break;
44868 default:
44869 InvCC = X86::COND_INVALID;
44870 break;
44871 }
44872
44873 if (InvCC != X86::COND_INVALID) {
44874 CC = InvCC;
44875 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44876 DAG.getBitcast(OpVT, NotOp0), Op1);
44877 }
44878 }
44879
44880 if (CC == X86::COND_E || CC == X86::COND_NE) {
44881 // TESTZ(X,~Y) == TESTC(Y,X)
44882 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
44883 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44884 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44885 DAG.getBitcast(OpVT, NotOp1), Op0);
44886 }
44887
44888 if (Op0 == Op1) {
44889 SDValue BC = peekThroughBitcasts(Op0);
44890 EVT BCVT = BC.getValueType();
44891 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
44892 "Unexpected vector type");
44893
44894 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
44895 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
44896 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44897 DAG.getBitcast(OpVT, BC.getOperand(0)),
44898 DAG.getBitcast(OpVT, BC.getOperand(1)));
44899 }
44900
44901 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
44902 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
44903 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44904 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44905 DAG.getBitcast(OpVT, BC.getOperand(0)),
44906 DAG.getBitcast(OpVT, BC.getOperand(1)));
44907 }
44908
44909 // If every element is an all-sign value, see if we can use MOVMSK to
44910 // more efficiently extract the sign bits and compare that.
44911 // TODO: Handle TESTC with comparison inversion.
44912 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
44913 // MOVMSK combines to make sure it's never worse than PTEST?
44914 unsigned EltBits = BCVT.getScalarSizeInBits();
44915 if (DAG.ComputeNumSignBits(BC) == EltBits) {
44916 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
44917 APInt SignMask = APInt::getSignMask(EltBits);
44918 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44919 if (SDValue Res =
44920 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
44921 // For vXi16 cases we need to use pmovmskb and extract every other
44922 // sign bit.
44923 SDLoc DL(EFLAGS);
44924 if (EltBits == 16) {
44925 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
44926 Res = DAG.getBitcast(MovmskVT, Res);
44927 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44928 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
44929 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
44930 } else {
44931 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44932 }
44933 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
44934 DAG.getConstant(0, DL, MVT::i32));
44935 }
44936 }
44937 }
44938
44939 // TESTZ(-1,X) == TESTZ(X,X)
44940 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
44941 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
44942
44943 // TESTZ(X,-1) == TESTZ(X,X)
44944 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
44945 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
44946
44947 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
44948 // TODO: Add COND_NE handling?
44949 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
44950 SDValue Src0 = peekThroughBitcasts(Op0);
44951 SDValue Src1 = peekThroughBitcasts(Op1);
44952 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
44953 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
44954 peekThroughBitcasts(Src0.getOperand(1)), true);
44955 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
44956 peekThroughBitcasts(Src1.getOperand(1)), true);
44957 if (Src0 && Src1)
44958 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44959 DAG.getBitcast(MVT::v4i64, Src0),
44960 DAG.getBitcast(MVT::v4i64, Src1));
44961 }
44962 }
44963 }
44964
44965 return SDValue();
44966}
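The TESTZ/TESTC identity exploited above (testz(~X, Y) == testc(X, Y)) can be seen at the intrinsics level; the following SSE4.1 sketch is an assumed way such a pattern might be written, not code from this file:

#include <immintrin.h>

// Illustrative only, requires SSE4.1. (~X & Y) == 0 holds exactly when every
// bit set in Y is also set in X; spelling it with an explicit NOT plus testz
// gives the combine above the chance to drop the NOT and flip to testc.
bool isSubsetOf(__m128i Y, __m128i X) {
  __m128i NotX = _mm_xor_si128(X, _mm_set1_epi32(-1));
  return _mm_testz_si128(NotX, Y) != 0;
}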
44967
44968// Attempt to simplify the MOVMSK input based on the comparison type.
44969static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
44970 SelectionDAG &DAG,
44971 const X86Subtarget &Subtarget) {
44972 // Handle eq/ne against zero (any_of).
44973 // Handle eq/ne against -1 (all_of).
44974 if (!(CC == X86::COND_E || CC == X86::COND_NE))
44975 return SDValue();
44976 if (EFLAGS.getValueType() != MVT::i32)
44977 return SDValue();
44978 unsigned CmpOpcode = EFLAGS.getOpcode();
44979 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
44980 return SDValue();
44981 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
44982 if (!CmpConstant)
44983 return SDValue();
44984 const APInt &CmpVal = CmpConstant->getAPIntValue();
44985
44986 SDValue CmpOp = EFLAGS.getOperand(0);
44987 unsigned CmpBits = CmpOp.getValueSizeInBits();
44988 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
44989
44990 // Peek through any truncate.
44991 if (CmpOp.getOpcode() == ISD::TRUNCATE)
44992 CmpOp = CmpOp.getOperand(0);
44993
44994 // Bail if we don't find a MOVMSK.
44995 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
44996 return SDValue();
44997
44998 SDValue Vec = CmpOp.getOperand(0);
44999 MVT VecVT = Vec.getSimpleValueType();
45000 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
45001 "Unexpected MOVMSK operand");
45002 unsigned NumElts = VecVT.getVectorNumElements();
45003 unsigned NumEltBits = VecVT.getScalarSizeInBits();
45004
45005 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
45006 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
45007 NumElts <= CmpBits && CmpVal.isMask(NumElts);
45008 if (!IsAnyOf && !IsAllOf)
45009 return SDValue();
45010
45011 // See if we can peek through to a vector with a wider element type, if the
45012 // signbits extend down to all the sub-elements as well.
45013 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
45014 // potential SimplifyDemandedBits/Elts cases.
45015 // If we looked through a truncate that discarded bits, we can't do this
45016 // transform.
45017 // FIXME: We could do this transform for truncates that discarded bits by
45018 // inserting an AND mask between the new MOVMSK and the CMP.
45019 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
45020 SDValue BC = peekThroughBitcasts(Vec);
45021 MVT BCVT = BC.getSimpleValueType();
45022 unsigned BCNumElts = BCVT.getVectorNumElements();
45023 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
45024 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
45025 BCNumEltBits > NumEltBits &&
45026 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
45027 SDLoc DL(EFLAGS);
45028 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
45029 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45030 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
45031 DAG.getConstant(CmpMask, DL, MVT::i32));
45032 }
45033 }
45034
45035 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
45036 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
45037 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
45038 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
45039 if (VecVT.is256BitVector() && NumElts <= CmpBits) {
45040 SmallVector<SDValue> Ops;
45041 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
45042 Ops.size() == 2) {
45043 SDLoc DL(EFLAGS);
45044 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
45045 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
45046 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
45047 DAG.getBitcast(SubVT, Ops[0]),
45048 DAG.getBitcast(SubVT, Ops[1]));
45049 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
45050 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45051 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
45052 DAG.getConstant(CmpMask, DL, MVT::i32));
45053 }
45054 }
45055
45056 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
45057 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
45058 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
45059 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
45060 if (IsAllOf && Subtarget.hasSSE41()) {
45061 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
45062 SDValue BC = peekThroughBitcasts(Vec);
45063 // Ensure MOVMSK was testing every signbit of BC.
45064 if (BC.getValueType().getVectorNumElements() <= NumElts) {
45065 if (BC.getOpcode() == X86ISD::PCMPEQ) {
45066 SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
45067 BC.getOperand(0), BC.getOperand(1));
45068 V = DAG.getBitcast(TestVT, V);
45069 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45070 }
45071 // Check for 256-bit split vector cases.
45072 if (BC.getOpcode() == ISD::AND &&
45073 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
45074 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
45075 SDValue LHS = BC.getOperand(0);
45076 SDValue RHS = BC.getOperand(1);
45077 LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
45078 LHS.getOperand(0), LHS.getOperand(1));
45079 RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
45080 RHS.getOperand(0), RHS.getOperand(1));
45081 LHS = DAG.getBitcast(TestVT, LHS);
45082 RHS = DAG.getBitcast(TestVT, RHS);
45083 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
45084 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45085 }
45086 }
45087 }
45088
45089 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
45090 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
45091 // sign bits prior to the comparison with zero unless we know that
45092 // the vXi16 splats the sign bit down to the lower i8 half.
45093 // TODO: Handle all_of patterns.
45094 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
45095 SDValue VecOp0 = Vec.getOperand(0);
45096 SDValue VecOp1 = Vec.getOperand(1);
45097 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
45098 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
45099 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
45100 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
45101 SDLoc DL(EFLAGS);
45102 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
45103 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45104 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
45105 if (!SignExt0) {
45106 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
45107 DAG.getConstant(0xAAAA, DL, MVT::i16));
45108 }
45109 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45110 DAG.getConstant(0, DL, MVT::i16));
45111 }
45112 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
45113 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
45114 if (CmpBits >= 16 && Subtarget.hasInt256() &&
45115 (IsAnyOf || (SignExt0 && SignExt1))) {
45116 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
45117 SDLoc DL(EFLAGS);
45118 SDValue Result = peekThroughBitcasts(Src);
45119 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
45120 Result.getValueType().getVectorNumElements() <= NumElts) {
45121 SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
45122 Result.getOperand(0), Result.getOperand(1));
45123 V = DAG.getBitcast(MVT::v4i64, V);
45124 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45125 }
45126 Result = DAG.getBitcast(MVT::v32i8, Result);
45127 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45128 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
45129 if (!SignExt0 || !SignExt1) {
45130 assert(IsAnyOf &&
45131 "Only perform v16i16 signmasks for any_of patterns");
45132 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
45133 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
45134 }
45135 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45136 DAG.getConstant(CmpMask, DL, MVT::i32));
45137 }
45138 }
45139 }
45140
45141 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
45142 SmallVector<int, 32> ShuffleMask;
45143 SmallVector<SDValue, 2> ShuffleInputs;
45144 if (NumElts <= CmpBits &&
45145 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
45146 ShuffleMask, DAG) &&
45147 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
45148 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
45149 unsigned NumShuffleElts = ShuffleMask.size();
45150 APInt DemandedElts = APInt::getZero(NumShuffleElts);
45151 for (int M : ShuffleMask) {
45152 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
45153 DemandedElts.setBit(M);
45154 }
45155 if (DemandedElts.isAllOnes()) {
45156 SDLoc DL(EFLAGS);
45157 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
45158 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45159 Result =
45160 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
45161 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45162 EFLAGS.getOperand(1));
45163 }
45164 }
45165
45166 return SDValue();
45167}
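The any_of / all_of idioms this MOVMSK combine recognizes look roughly like the following SSE2 intrinsics sketch; it is illustrative only and says nothing about what the folded code will be:

#include <emmintrin.h>

// any_of: MOVMSK compared against zero under COND_E/COND_NE.
bool anyByteNegative(__m128i V) {
  return _mm_movemask_epi8(V) != 0;
}

// all_of: MOVMSK of a PCMPEQ compared against the all-lanes mask (0xFFFF for
// 16 x i8), the case that can become a PTEST on SSE4.1 targets.
bool allBytesEqual(__m128i A, __m128i B) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(A, B)) == 0xFFFF;
}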
45168
45169/// Optimize an EFLAGS definition used according to the condition code \p CC
45170/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
45171/// uses of chain values.
45172static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
45173 SelectionDAG &DAG,
45174 const X86Subtarget &Subtarget) {
45175 if (CC == X86::COND_B)
45176 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
45177 return Flags;
45178
45179 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
45180 return R;
45181
45182 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
45183 return R;
45184
45185 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
45186 return R;
45187
45188 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
45189}
45190
45191/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
45192static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
45193 TargetLowering::DAGCombinerInfo &DCI,
45194 const X86Subtarget &Subtarget) {
45195 SDLoc DL(N);
45196
45197 SDValue FalseOp = N->getOperand(0);
45198 SDValue TrueOp = N->getOperand(1);
45199 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
45200 SDValue Cond = N->getOperand(3);
45201
45202 // cmov X, X, ?, ? --> X
45203 if (TrueOp == FalseOp)
45204 return TrueOp;
45205
45206 // Try to simplify the EFLAGS and condition code operands.
45207 // We can't always do this as FCMOV only supports a subset of X86 cond.
45208 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
45209 if (!(FalseOp.getValueType() == MVT::f80 ||
45210 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
45211 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
45212 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
45213 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
45214 Flags};
45215 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45216 }
45217 }
45218
45219 // If this is a select between two integer constants, try to do some
45220 // optimizations. Note that the operands are ordered the opposite of SELECT
45221 // operands.
45222 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
45223 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
45224 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
45225 // larger than FalseC (the false value).
45226 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
45227 CC = X86::GetOppositeBranchCondition(CC);
45228 std::swap(TrueC, FalseC);
45229 std::swap(TrueOp, FalseOp);
45230 }
45231
45232 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
45233 // This is efficient for any integer data type (including i8/i16) and
45234 // shift amount.
45235 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
45236 Cond = getSETCC(CC, Cond, DL, DAG);
45237
45238 // Zero extend the condition if needed.
45239 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
45240
45241 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
45242 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
45243 DAG.getConstant(ShAmt, DL, MVT::i8));
45244 return Cond;
45245 }
45246
45247 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
45248 // for any integer data type, including i8/i16.
45249 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
45250 Cond = getSETCC(CC, Cond, DL, DAG);
45251
45252 // Zero extend the condition if needed.
45253 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
45254 FalseC->getValueType(0), Cond);
45255 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45256 SDValue(FalseC, 0));
45257 return Cond;
45258 }
45259
45260 // Optimize cases that will turn into an LEA instruction. This requires
45261 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
45262 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
45263 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
45264 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
45265 "Implicit constant truncation");
45266
45267 bool isFastMultiplier = false;
45268 if (Diff.ult(10)) {
45269 switch (Diff.getZExtValue()) {
45270 default: break;
45271 case 1: // result = add base, cond
45272 case 2: // result = lea base( , cond*2)
45273 case 3: // result = lea base(cond, cond*2)
45274 case 4: // result = lea base( , cond*4)
45275 case 5: // result = lea base(cond, cond*4)
45276 case 8: // result = lea base( , cond*8)
45277 case 9: // result = lea base(cond, cond*8)
45278 isFastMultiplier = true;
45279 break;
45280 }
45281 }
45282
45283 if (isFastMultiplier) {
45284 Cond = getSETCC(CC, Cond, DL, DAG);
45285 // Zero extend the condition if needed.
45286 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
45287 Cond);
45288 // Scale the condition by the difference.
45289 if (Diff != 1)
45290 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
45291 DAG.getConstant(Diff, DL, Cond.getValueType()));
45292
45293 // Add the base if non-zero.
45294 if (FalseC->getAPIntValue() != 0)
45295 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45296 SDValue(FalseC, 0));
45297 return Cond;
45298 }
45299 }
45300 }
45301 }
45302
45303 // Handle these cases:
45304 // (select (x != c), e, c) -> (select (x != c), e, x),
45305 // (select (x == c), c, e) -> (select (x == c), x, e)
45306 // where c is an integer constant, and the "select" is the combination
45307 // of CMOV and CMP.
45308 //
45309 // The rationale for this change is that the conditional-move from a constant
45310 // needs two instructions, however, conditional-move from a register needs
45311 // only one instruction.
45312 //
45313 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
45314 // some instruction-combining opportunities. This opt needs to be
45315 // postponed as late as possible.
45316 //
45317 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
45318 // the DCI.xxxx conditions are provided to postpone the optimization as
45319 // late as possible.
45320
45321 ConstantSDNode *CmpAgainst = nullptr;
45322 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
45323 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
45324 !isa<ConstantSDNode>(Cond.getOperand(0))) {
45325
45326 if (CC == X86::COND_NE &&
45327 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
45328 CC = X86::GetOppositeBranchCondition(CC);
45329 std::swap(TrueOp, FalseOp);
45330 }
45331
45332 if (CC == X86::COND_E &&
45333 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
45334 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
45335 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
45336 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45337 }
45338 }
45339 }
45340
45341 // Fold and/or of setcc's to double CMOV:
45342 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
45343 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
45344 //
45345 // This combine lets us generate:
45346 // cmovcc1 (jcc1 if we don't have CMOV)
45347 // cmovcc2 (same)
45348 // instead of:
45349 // setcc1
45350 // setcc2
45351 // and/or
45352 // cmovne (jne if we don't have CMOV)
45353 // When we can't use the CMOV instruction, it might increase branch
45354 // mispredicts.
45355 // When we can use CMOV, or when there is no mispredict, this improves
45356 // throughput and reduces register pressure.
45357 //
45358 if (CC == X86::COND_NE) {
45359 SDValue Flags;
45360 X86::CondCode CC0, CC1;
45361 bool isAndSetCC;
45362 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
45363 if (isAndSetCC) {
45364 std::swap(FalseOp, TrueOp);
45365 CC0 = X86::GetOppositeBranchCondition(CC0);
45366 CC1 = X86::GetOppositeBranchCondition(CC1);
45367 }
45368
45369 SDValue LOps[] = {FalseOp, TrueOp,
45370 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
45371 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
45372 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
45373 Flags};
45374 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45375 return CMOV;
45376 }
45377 }
45378
45379 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
45380 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
45381 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
45382 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
45383 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
45384 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
45385 SDValue Add = TrueOp;
45386 SDValue Const = FalseOp;
45387 // Canonicalize the condition code for easier matching and output.
45388 if (CC == X86::COND_E)
45389 std::swap(Add, Const);
45390
45391 // We might have replaced the constant in the cmov with the LHS of the
45392 // compare. If so change it to the RHS of the compare.
45393 if (Const == Cond.getOperand(0))
45394 Const = Cond.getOperand(1);
45395
45396 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
45397 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
45398 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
45399 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
45400 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
45401 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
45402 EVT VT = N->getValueType(0);
45403 // This should constant fold.
45404 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
45405 SDValue CMov =
45406 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
45407 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
45408 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
45409 }
45410 }
45411
45412 return SDValue();
45413}
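As a concrete illustration of the constant-select paths above, a tiny example whose two constants differ by 9, one of the LEA-friendly multipliers; the actual instruction choice is of course up to the backend:

// Illustrative only: TrueC - FalseC == 9, so the cmov can be rewritten as
// zext(setcc(X != 0)) * 9 + 4, which fits a single LEA.
int pickConstant(int X) {
  return (X != 0) ? 13 : 4;
}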
45414
45415/// Different mul shrinking modes.
45416enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
45417
45418static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
45419 EVT VT = N->getOperand(0).getValueType();
45420 if (VT.getScalarSizeInBits() != 32)
45421 return false;
45422
45423 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
45424 unsigned SignBits[2] = {1, 1};
45425 bool IsPositive[2] = {false, false};
45426 for (unsigned i = 0; i < 2; i++) {
45427 SDValue Opd = N->getOperand(i);
45428
45429 SignBits[i] = DAG.ComputeNumSignBits(Opd);
45430 IsPositive[i] = DAG.SignBitIsZero(Opd);
45431 }
45432
45433 bool AllPositive = IsPositive[0] && IsPositive[1];
45434 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
45435 // When ranges are from -128 ~ 127, use MULS8 mode.
45436 if (MinSignBits >= 25)
45437 Mode = ShrinkMode::MULS8;
45438 // When ranges are from 0 ~ 255, use MULU8 mode.
45439 else if (AllPositive && MinSignBits >= 24)
45440 Mode = ShrinkMode::MULU8;
45441 // When ranges are from -32768 ~ 32767, use MULS16 mode.
45442 else if (MinSignBits >= 17)
45443 Mode = ShrinkMode::MULS16;
45444 // When ranges are from 0 ~ 65535, use MULU16 mode.
45445 else if (AllPositive && MinSignBits >= 16)
45446 Mode = ShrinkMode::MULU16;
45447 else
45448 return false;
45449 return true;
45450}
45451
45452/// When the operands of vector mul are extended from smaller size values,
45453 /// like i8 and i16, the type of mul may be shrunk to generate more
45454/// efficient code. Two typical patterns are handled:
45455/// Pattern1:
45456/// %2 = sext/zext <N x i8> %1 to <N x i32>
45457/// %4 = sext/zext <N x i8> %3 to <N x i32>
45458 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
45459/// %5 = mul <N x i32> %2, %4
45460///
45461/// Pattern2:
45462/// %2 = zext/sext <N x i16> %1 to <N x i32>
45463/// %4 = zext/sext <N x i16> %3 to <N x i32>
45464/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
45465/// %5 = mul <N x i32> %2, %4
45466///
45467/// There are four mul shrinking modes:
45468/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
45469 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
45470/// generate pmullw+sext32 for it (MULS8 mode).
45471/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
45472/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
45473/// generate pmullw+zext32 for it (MULU8 mode).
45474/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
45475/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
45476/// generate pmullw+pmulhw for it (MULS16 mode).
45477/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
45478/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
45479/// generate pmullw+pmulhuw for it (MULU16 mode).
45480static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
45481 const X86Subtarget &Subtarget) {
45482 // Check for legality
45483 // pmullw/pmulhw require SSE2.
45484 if (!Subtarget.hasSSE2())
45485 return SDValue();
45486
45487 // Check for profitability
45488 // pmulld is supported since SSE41. It is better to use pmulld
45489 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
45490 // the expansion.
45491 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
45492 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
45493 return SDValue();
45494
45495 ShrinkMode Mode;
45496 if (!canReduceVMulWidth(N, DAG, Mode))
45497 return SDValue();
45498
45499 SDLoc DL(N);
45500 SDValue N0 = N->getOperand(0);
45501 SDValue N1 = N->getOperand(1);
45502 EVT VT = N->getOperand(0).getValueType();
45503 unsigned NumElts = VT.getVectorNumElements();
45504 if ((NumElts % 2) != 0)
45505 return SDValue();
45506
45507 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
45508
45509 // Shrink the operands of mul.
45510 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
45511 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
45512
45513 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
45514 // lower part is needed.
45515 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
45516 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
45517 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
45518 : ISD::SIGN_EXTEND,
45519 DL, VT, MulLo);
45520
45521 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
45522 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
45523 // the higher part is also needed.
45524 SDValue MulHi =
45525 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
45526 ReducedVT, NewN0, NewN1);
45527
45528 // Repack the lower part and higher part result of mul into a wider
45529 // result.
45530 // Generate shuffle functioning as punpcklwd.
45531 SmallVector<int, 16> ShuffleMask(NumElts);
45532 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45533 ShuffleMask[2 * i] = i;
45534 ShuffleMask[2 * i + 1] = i + NumElts;
45535 }
45536 SDValue ResLo =
45537 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45538 ResLo = DAG.getBitcast(ResVT, ResLo);
45539 // Generate shuffle functioning as punpckhwd.
45540 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45541 ShuffleMask[2 * i] = i + NumElts / 2;
45542 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
45543 }
45544 SDValue ResHi =
45545 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45546 ResHi = DAG.getBitcast(ResVT, ResHi);
45547 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
45548}
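A plain C++ sketch of Pattern2 from the function comment above (operands sign-extended from i16, MULS16 mode); whether the vectorizer and this combine actually produce pmullw+pmulhw depends on the subtarget and is not guaranteed:

// Illustrative only: each 32-bit product is formed from operands whose scalar
// value ranges fit in i16, so the multiply can be shrunk and the low/high
// halves recombined with the unpack-style shuffles built above.
void mulFromShorts(const short *A, const short *B, int *Out, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = (int)A[i] * (int)B[i];
}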
45549
45550static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
45551 EVT VT, const SDLoc &DL) {
45552
45553 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
45554 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45555 DAG.getConstant(Mult, DL, VT));
45556 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
45557 DAG.getConstant(Shift, DL, MVT::i8));
45558 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45559 N->getOperand(0));
45560 return Result;
45561 };
45562
45563 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
45564 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45565 DAG.getConstant(Mul1, DL, VT));
45566 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
45567 DAG.getConstant(Mul2, DL, VT));
45568 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45569 N->getOperand(0));
45570 return Result;
45571 };
45572
45573 switch (MulAmt) {
45574 default:
45575 break;
45576 case 11:
45577 // mul x, 11 => add ((shl (mul x, 5), 1), x)
45578 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
45579 case 21:
45580 // mul x, 21 => add ((shl (mul x, 5), 2), x)
45581 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
45582 case 41:
45583 // mul x, 41 => add ((shl (mul x, 5), 3), x)
45584 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
45585 case 22:
45586 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
45587 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45588 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
45589 case 19:
45590 // mul x, 19 => add ((shl (mul x, 9), 1), x)
45591 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
45592 case 37:
45593 // mul x, 37 => add ((shl (mul x, 9), 2), x)
45594 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
45595 case 73:
45596 // mul x, 73 => add ((shl (mul x, 9), 3), x)
45597 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
45598 case 13:
45599 // mul x, 13 => add ((shl (mul x, 3), 2), x)
45600 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
45601 case 23:
45602 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
45603 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
45604 case 26:
45605 // mul x, 26 => add ((mul (mul x, 5), 5), x)
45606 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
45607 case 28:
45608 // mul x, 28 => add ((mul (mul x, 9), 3), x)
45609 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
45610 case 29:
45611 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
45612 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45613 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
45614 }
45615
45616 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
45617 // followed by a single LEA.
45618 // First check if this is a sum of two powers of 2 because that's easy. Then
45619 // count the trailing zeros to find the smaller power of 2.
45620 // TODO: We can do this even without LEA at a cost of two shifts and an add.
45621 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
45622 unsigned ScaleShift = countTrailingZeros(MulAmt);
45623 if (ScaleShift >= 1 && ScaleShift < 4) {
45624 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
45625 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45626 DAG.getConstant(ShiftAmt, DL, MVT::i8));
45627 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45628 DAG.getConstant(ScaleShift, DL, MVT::i8));
45629 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
45630 }
45631 }
45632
45633 return SDValue();
45634}
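The decompositions in the switch above are plain arithmetic identities; a few of them spot-checked at compile time with an arbitrary sample value (a sketch, not part of this file):

// mul x, 11 => add ((shl (mul x, 5), 1), x), and friends, evaluated at x = 7.
static_assert(11u * 7u == ((7u * 5u) << 1) + 7u, "mul x, 11");
static_assert(21u * 7u == ((7u * 5u) << 2) + 7u, "mul x, 21");
static_assert(23u * 7u == ((7u * 3u) << 3) - 7u, "mul x, 23");
static_assert(26u * 7u == ((7u * 5u) * 5u) + 7u, "mul x, 26");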
45635
45636 // If the upper 17 bits of one operand are zero and the other operand's
45637 // upper bits are all zero/sign bits, then we can use PMADDWD, which is always
45638 // at least as quick as PMULLD, except on KNL.
45639static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
45640 const X86Subtarget &Subtarget) {
45641 if (!Subtarget.hasSSE2())
45642 return SDValue();
45643
45644 if (Subtarget.isPMADDWDSlow())
45645 return SDValue();
45646
45647 EVT VT = N->getValueType(0);
45648
45649 // Only support vXi32 vectors.
45650 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
45651 return SDValue();
45652
45653 // Make sure the type is legal or can split/widen to a legal type.
45654 // With AVX512 but without BWI, we would need to split v32i16.
45655 unsigned NumElts = VT.getVectorNumElements();
45656 if (NumElts == 1 || !isPowerOf2_32(NumElts))
45657 return SDValue();
45658
45659 EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);
45660
45661 // With AVX512 but without BWI, we would need to split v32i16.
45662 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
45663 return SDValue();
45664
45665 SDValue N0 = N->getOperand(0);
45666 SDValue N1 = N->getOperand(1);
45667
45668 // If we are zero/sign extending two steps without SSE4.1, it's better to
45669 // reduce the vmul width instead.
45670 if (!Subtarget.hasSSE41() &&
45671 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
45672 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45673 (N1.getOpcode() == ISD::ZERO_EXTEND &&
45674 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
45675 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
45676 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45677 (N1.getOpcode() == ISD::SIGN_EXTEND &&
45678 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
45679 return SDValue();
45680
45681 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
45682 // the vmul width instead.
45683 if (!Subtarget.hasSSE41() &&
45684 (N0.getOpcode() == ISD::SIGN_EXTEND &&
45685 N0.getOperand(0).getValueSizeInBits() > 128) &&
45686 (N1.getOpcode() == ISD::SIGN_EXTEND &&
45687 N1.getOperand(0).getValueSizeInBits() > 128))
45688 return SDValue();
45689
45690 // Sign bits must extend down to the lowest i16.
45691 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
45692 DAG.ComputeMaxSignificantBits(N0) > 16)
45693 return SDValue();
45694
45695 // At least one of the elements must be zero in the upper 17 bits, or can be
45696 // safely made zero without altering the final result.
45697 auto GetZeroableOp = [&](SDValue Op) {
45698 APInt Mask17 = APInt::getHighBitsSet(32, 17);
45699 if (DAG.MaskedValueIsZero(Op, Mask17))
45700 return Op;
45701 // Mask off upper 16-bits of sign-extended constants.
45702 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
45703 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
45704 DAG.getConstant(0xFFFF, SDLoc(N), VT));
45705 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
45706 SDValue Src = Op.getOperand(0);
45707 // Convert sext(vXi16) to zext(vXi16).
45708 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
45709 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45710 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
45711 // which will expand the extension.
45712 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
45713 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
45714 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
45715 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45716 }
45717 }
45718 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
45719 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
45720 N->isOnlyUserOf(Op.getNode())) {
45721 SDValue Src = Op.getOperand(0);
45722 if (Src.getScalarValueSizeInBits() == 16)
45723 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
45724 }
45725 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
45726 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
45727 N->isOnlyUserOf(Op.getNode())) {
45728 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
45729 Op.getOperand(1));
45730 }
45731 return SDValue();
45732 };
45733 SDValue ZeroN0 = GetZeroableOp(N0);
45734 SDValue ZeroN1 = GetZeroableOp(N1);
45735 if (!ZeroN0 && !ZeroN1)
45736 return SDValue();
45737 N0 = ZeroN0 ? ZeroN0 : N0;
45738 N1 = ZeroN1 ? ZeroN1 : N1;
45739
45740 // Use SplitOpsAndApply to handle AVX splitting.
45741 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45742 ArrayRef<SDValue> Ops) {
45743 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45744 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
45745 };
45746 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
45747 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
45748 PMADDWDBuilder);
45749}
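A hedged sketch of operands that satisfy the two conditions checked above: one operand has its upper 17 bits known zero (zero-extended from u8) and both have their significant bits confined to the low i16. Whether PMADDWD is actually selected depends on the subtarget; illustrative only:

void mulMixedWidths(const unsigned char *A, const short *B, int *Out, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = (int)A[i] * (int)B[i]; // zext(u8) * sext(i16) -> i32
}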
45750
45751static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
45752 const X86Subtarget &Subtarget) {
45753 if (!Subtarget.hasSSE2())
45754 return SDValue();
45755
45756 EVT VT = N->getValueType(0);
45757
45758 // Only support vXi64 vectors.
45759 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
45760 VT.getVectorNumElements() < 2 ||
45761 !isPowerOf2_32(VT.getVectorNumElements()))
45762 return SDValue();
45763
45764 SDValue N0 = N->getOperand(0);
45765 SDValue N1 = N->getOperand(1);
45766
45767 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
45768 // 32 bits. We can lower with this if the sign bits stretch that far.
45769 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
45770 DAG.ComputeNumSignBits(N1) > 32) {
45771 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45772 ArrayRef<SDValue> Ops) {
45773 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
45774 };
45775 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45776 PMULDQBuilder, /*CheckBWI*/false);
45777 }
45778
45779 // If the upper bits are zero we can use a single pmuludq.
45780 APInt Mask = APInt::getHighBitsSet(64, 32);
45781 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
45782 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45783 ArrayRef<SDValue> Ops) {
45784 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
45785 };
45786 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45787 PMULUDQBuilder, /*CheckBWI*/false);
45788 }
45789
45790 return SDValue();
45791}
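A sketch of the single-pmuludq case above: a 64-bit vector multiply whose operands have their upper 32 bits known zero. Illustrative only; the final lowering is not guaranteed:

// Both operands are zero-extended from u32, so MaskedValueIsZero succeeds on
// the high 32 bits and one widening unsigned multiply per element suffices.
void mulWide(const unsigned *A, const unsigned *B, unsigned long long *Out,
             int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = (unsigned long long)A[i] * B[i];
}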
45792
45793static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
45794 TargetLowering::DAGCombinerInfo &DCI,
45795 const X86Subtarget &Subtarget) {
45796 EVT VT = N->getValueType(0);
45797
45798 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
45799 return V;
45800
45801 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
45802 return V;
45803
45804 if (DCI.isBeforeLegalize() && VT.isVector())
45805 return reduceVMULWidth(N, DAG, Subtarget);
45806
45807 // Optimize a single multiply with constant into two operations in order to
45808 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
45809 if (!MulConstantOptimization)
45810 return SDValue();
45811
45812 // An imul is usually smaller than the alternative sequence.
45813 if (DAG.getMachineFunction().getFunction().hasMinSize())
45814 return SDValue();
45815
45816 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
45817 return SDValue();
45818
45819 if (VT != MVT::i64 && VT != MVT::i32)
45820 return SDValue();
45821
45822 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
45823 if (!C)
45824 return SDValue();
45825 if (isPowerOf2_64(C->getZExtValue()))
45826 return SDValue();
45827
45828 int64_t SignMulAmt = C->getSExtValue();
45829 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
45830 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
45831
45832 SDLoc DL(N);
45833 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
45834 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45835 DAG.getConstant(AbsMulAmt, DL, VT));
45836 if (SignMulAmt < 0)
45837 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45838 NewMul);
45839
45840 return NewMul;
45841 }
45842
45843 uint64_t MulAmt1 = 0;
45844 uint64_t MulAmt2 = 0;
45845 if ((AbsMulAmt % 9) == 0) {
45846 MulAmt1 = 9;
45847 MulAmt2 = AbsMulAmt / 9;
45848 } else if ((AbsMulAmt % 5) == 0) {
45849 MulAmt1 = 5;
45850 MulAmt2 = AbsMulAmt / 5;
45851 } else if ((AbsMulAmt % 3) == 0) {
45852 MulAmt1 = 3;
45853 MulAmt2 = AbsMulAmt / 3;
45854 }
45855
45856 SDValue NewMul;
45857 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
45858 if (MulAmt2 &&
45859 (isPowerOf2_64(MulAmt2) ||
45860 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
45861
45862 if (isPowerOf2_64(MulAmt2) &&
45863 !(SignMulAmt >= 0 && N->hasOneUse() &&
45864 N->use_begin()->getOpcode() == ISD::ADD))
45865 // If the second multiplier is pow2, issue it first. We want the multiply by
45866 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
45867 // is an add. Only do this for positive multiply amounts since the
45868 // negate would prevent it from being used as an address mode anyway.
45869 std::swap(MulAmt1, MulAmt2);
45870
45871 if (isPowerOf2_64(MulAmt1))
45872 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45873 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
45874 else
45875 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45876 DAG.getConstant(MulAmt1, DL, VT));
45877
45878 if (isPowerOf2_64(MulAmt2))
45879 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
45880 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
45881 else
45882 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
45883 DAG.getConstant(MulAmt2, DL, VT));
45884
45885 // Negate the result.
45886 if (SignMulAmt < 0)
45887 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45888 NewMul);
45889 } else if (!Subtarget.slowLEA())
45890 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
45891
45892 if (!NewMul) {
45893 assert(C->getZExtValue() != 0 &&
45894 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
45895 "Both cases that could cause potential overflows should have "
45896 "already been handled.");
45897 if (isPowerOf2_64(AbsMulAmt - 1)) {
45898 // (mul x, 2^N + 1) => (add (shl x, N), x)
45899 NewMul = DAG.getNode(
45900 ISD::ADD, DL, VT, N->getOperand(0),
45901 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45902 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
45903 MVT::i8)));
45904 // To negate, subtract the number from zero
45905 if (SignMulAmt < 0)
45906 NewMul = DAG.getNode(ISD::SUB, DL, VT,
45907 DAG.getConstant(0, DL, VT), NewMul);
45908 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
45909 // (mul x, 2^N - 1) => (sub (shl x, N), x)
45910 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45911 DAG.getConstant(Log2_64(AbsMulAmt + 1),
45912 DL, MVT::i8));
45913 // To negate, reverse the operands of the subtract.
45914 if (SignMulAmt < 0)
45915 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
45916 else
45917 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45918 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
45919 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
45920 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45921 DAG.getConstant(Log2_64(AbsMulAmt - 2),
45922 DL, MVT::i8));
45923 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45924 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45925 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
45926 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
45927 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45928 DAG.getConstant(Log2_64(AbsMulAmt + 2),
45929 DL, MVT::i8));
45930 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45931 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45932 }
45933 }
45934
45935 return NewMul;
45936}
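
[Editor's note] As a reader's aid, the scalar identities that the constant-multiply combine above relies on can be sanity-checked with a small standalone C++ program. This is only an illustrative sketch: the file name and values are made up, and nothing below is LLVM code.

    // mul_decompose_demo.cpp -- standalone illustration, not part of the listing.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t x = 12345;
      // 3/5/9 multiplies map to a single LEA-style step: x*9 == (x<<3)+x.
      assert(x * 9 == (x << 3) + x);
      // Composite amounts split into two cheap factors, e.g. 45 = 9 * 5.
      assert(x * 45 == (x * 9) * 5);
      // Power-of-2 +/- 1 amounts become shift+add / shift+sub.
      assert(x * 17 == (x << 4) + x);   // 2^4 + 1
      assert(x * 31 == (x << 5) - x);   // 2^5 - 1
      // Negative amounts reuse the positive pattern and negate via (0 - y).
      int64_t s = 777;
      assert(s * -9 == 0 - (s * 9));
      return 0;
    }
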
45937
45938// Try to form a MULHU or MULHS node by looking for
45939// (srl (mul ext, ext), 16)
45940// TODO: This is X86 specific because we want to be able to handle wide types
45941// before type legalization. But we can only do it if the vector will be
45942// legalized via widening/splitting. Type legalization can't handle promotion
45943// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
45944// combiner.
45945static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
45946 const X86Subtarget &Subtarget) {
45947 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
45948 "SRL or SRA node is required here!");
45949 SDLoc DL(N);
45950
45951 if (!Subtarget.hasSSE2())
45952 return SDValue();
45953
45954 // The operation feeding into the shift must be a multiply.
45955 SDValue ShiftOperand = N->getOperand(0);
45956 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
45957 return SDValue();
45958
45959 // Input type should be at least vXi32.
45960 EVT VT = N->getValueType(0);
45961 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
45962 return SDValue();
45963
45964 // Need a shift by 16.
45965 APInt ShiftAmt;
45966 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
45967 ShiftAmt != 16)
45968 return SDValue();
45969
45970 SDValue LHS = ShiftOperand.getOperand(0);
45971 SDValue RHS = ShiftOperand.getOperand(1);
45972
45973 unsigned ExtOpc = LHS.getOpcode();
45974 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
45975 RHS.getOpcode() != ExtOpc)
45976 return SDValue();
45977
45978 // Peek through the extends.
45979 LHS = LHS.getOperand(0);
45980 RHS = RHS.getOperand(0);
45981
45982 // Ensure the input types match.
45983 EVT MulVT = LHS.getValueType();
45984 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
45985 return SDValue();
45986
45987 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
45988 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
45989
45990 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
45991 return DAG.getNode(ExtOpc, DL, VT, Mulh);
45992}
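
[Editor's note] For reference, the per-element identity that combineShiftToPMULH recognizes -- shifting the widened product right by 16 yields the high half of a 16x16 multiply, which fits back into the narrow element type -- can be checked on scalars. This is a minimal sketch with made-up values, not LLVM code; signed right shift is assumed to be arithmetic, as it is for the x86 compilers this file targets.

    // pmulh_demo.cpp -- scalar check of the widen-multiply-shift pattern.
    #include <cassert>
    #include <cstdint>

    int main() {
      // Unsigned: (srl (mul (zext a), (zext b)), 16) is the high half of the
      // 16x16->32 product, i.e. what a per-element MULHU produces. It fits in
      // 16 bits, so MULHU on the narrow type plus a zero-extend reproduces it.
      uint16_t a = 0xBEEF, b = 0x1234;
      uint32_t wide = uint32_t(a) * uint32_t(b);
      assert((wide >> 16) == uint32_t(uint16_t(wide >> 16)));

      // Signed: sign-extend, multiply, arithmetic shift by 16 gives the MULHS
      // result, which also fits back into 16 bits.
      int16_t sa = -1234, sb = 4321;
      int32_t swide = int32_t(sa) * int32_t(sb);
      assert((swide >> 16) == int32_t(int16_t(swide >> 16)));
      return 0;
    }
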
45993
45994static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
45995 SDValue N0 = N->getOperand(0);
45996 SDValue N1 = N->getOperand(1);
45997 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
45998 EVT VT = N0.getValueType();
45999
46000 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
46001 // since the result of setcc_c is all zero's or all ones.
46002 if (VT.isInteger() && !VT.isVector() &&
46003 N1C && N0.getOpcode() == ISD::AND &&
46004 N0.getOperand(1).getOpcode() == ISD::Constant) {
46005 SDValue N00 = N0.getOperand(0);
46006 APInt Mask = N0.getConstantOperandAPInt(1);
46007 Mask <<= N1C->getAPIntValue();
46008 bool MaskOK = false;
46009 // We can handle cases concerning bit-widening nodes containing setcc_c if
46010 // we carefully interrogate the mask to make sure we are semantics
46011 // preserving.
46012 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
46013 // of the underlying setcc_c operation if the setcc_c was zero extended.
46014 // Consider the following example:
46015 // zext(setcc_c) -> i32 0x0000FFFF
46016 // c1 -> i32 0x0000FFFF
46017 // c2 -> i32 0x00000001
46018 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
46019 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
46020 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
46021 MaskOK = true;
46022 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
46023 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46024 MaskOK = true;
46025 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
46026 N00.getOpcode() == ISD::ANY_EXTEND) &&
46027 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46028 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
46029 }
46030 if (MaskOK && Mask != 0) {
46031 SDLoc DL(N);
46032 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
46033 }
46034 }
46035
46036 // Hardware support for vector shifts is sparse, which makes us scalarize the
46037 // vector operations in many cases. Also, on sandybridge ADD is faster than
46038 // shl.
46039 // (shl V, 1) -> add V,V
46040 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
46041 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
46042 assert(N0.getValueType().isVector() && "Invalid vector shift type");
46043 // We shift all of the values by one. In many cases we do not have
46044 // hardware support for this operation. This is better expressed as an ADD
46045 // of two values.
46046 if (N1SplatC->isOne())
46047 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
46048 }
46049
46050 return SDValue();
46051}
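
[Editor's note] The mask caveat described in the comment inside combineShiftLeft, and the (shl V, 1) -> add V,V fold, can be reproduced with ordinary scalar code. Illustrative sketch only; the values are chosen to match the comment's example and the file name is invented.

    // shl_mask_demo.cpp -- reproduces the comment's caveat and the shl-by-1 fold.
    #include <cassert>
    #include <cstdint>

    int main() {
      // Caveat: with a zero-extended all-ones i16 value, folding the shift into
      // the mask changes the result.
      uint32_t setcc_c = 0x0000FFFF;   // zext of an all-ones i16
      uint32_t c1 = 0x0000FFFF, c2 = 1;
      assert(((setcc_c & c1) << c2) == 0x0001FFFEu);  // shift applied after AND
      assert((setcc_c & (c1 << c2)) == 0x0000FFFEu);  // shift folded into mask
      // The two differ, which is why the transform checks the mask width (MaskOK).

      // (shl V, 1) -> add V,V: the values are equal, and ADD is often cheaper.
      uint32_t v = 0x1234;
      assert((v << 1) == v + v);
      return 0;
    }
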
46052
46053static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
46054 const X86Subtarget &Subtarget) {
46055 SDValue N0 = N->getOperand(0);
46056 SDValue N1 = N->getOperand(1);
46057 EVT VT = N0.getValueType();
46058 unsigned Size = VT.getSizeInBits();
46059
46060 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46061 return V;
46062
46063 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
46064 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
46065 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
46066 // depending on sign of (SarConst - [56,48,32,24,16])
46067
46068 // sexts in X86 are MOVs. The MOVs have the same code size
46069 // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
46070 // However, the MOVs have 2 advantages over a SHIFT:
46071 // 1. MOVs can write to a register that differs from the source
46072 // 2. MOVs accept memory operands
46073
46074 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
46075 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
46076 N0.getOperand(1).getOpcode() != ISD::Constant)
46077 return SDValue();
46078
46079 SDValue N00 = N0.getOperand(0);
46080 SDValue N01 = N0.getOperand(1);
46081 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
46082 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
46083 EVT CVT = N1.getValueType();
46084
46085 if (SarConst.isNegative())
46086 return SDValue();
46087
46088 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
46089 unsigned ShiftSize = SVT.getSizeInBits();
46090 // Skip types without a corresponding sext/zext and ShlConst values
46091 // that are not one of [56,48,32,24,16].
46092 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
46093 continue;
46094 SDLoc DL(N);
46095 SDValue NN =
46096 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
46097 SarConst = SarConst - (Size - ShiftSize);
46098 if (SarConst == 0)
46099 return NN;
46100 if (SarConst.isNegative())
46101 return DAG.getNode(ISD::SHL, DL, VT, NN,
46102 DAG.getConstant(-SarConst, DL, CVT));
46103 return DAG.getNode(ISD::SRA, DL, VT, NN,
46104 DAG.getConstant(SarConst, DL, CVT));
46105 }
46106 return SDValue();
46107}
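
[Editor's note] The sign-extension rewrite above can also be checked on scalars. This is a minimal sketch, not LLVM code; it assumes two's-complement wrap-around on the narrowing casts and an arithmetic right shift for signed values, both of which hold for the x86 compilers this file targets.

    // sra_of_shl_demo.cpp -- the ashr(shl(x, K), SarConst) rewrite on scalars.
    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t x = 0x123456F5;
      // Size = 32, ShiftSize = 8, so K = 24: shl followed by sar by 24 is a sign
      // extension of the low 8 bits (the SIGN_EXTEND_INREG node built above).
      // The unsigned cast avoids signed-overflow UB on the left shift.
      int32_t shl = int32_t(uint32_t(x) << 24);
      assert((shl >> 24) == int32_t(int8_t(x)));
      // If SarConst exceeds K, the leftover amount remains as an SRA:
      // (x << 24) >> 26 == sext_i8(x) >> 2.
      assert((shl >> 26) == (int32_t(int8_t(x)) >> 2));
      return 0;
    }
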
46108
46109static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
46110 TargetLowering::DAGCombinerInfo &DCI,
46111 const X86Subtarget &Subtarget) {
46112 SDValue N0 = N->getOperand(0);
46113 SDValue N1 = N->getOperand(1);
46114 EVT VT = N0.getValueType();
46115
46116 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46117 return V;
46118
46119 // Only do this on the last DAG combine as it can interfere with other
46120 // combines.
46121 if (!DCI.isAfterLegalizeDAG())
46122 return SDValue();
46123
46124 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
46125 // TODO: This is a generic DAG combine that became an x86-only combine to
46126 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
46127 // and-not ('andn').
46128 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
46129 return SDValue();
46130
46131 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
46132 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
46133 if (!ShiftC || !AndC)
46134 return SDValue();
46135
46136 // If we can shrink the constant mask below 8-bits or 32-bits, then this
46137 // transform should reduce code size. It may also enable secondary transforms
46138 // from improved known-bits analysis or instruction selection.
46139 APInt MaskVal = AndC->getAPIntValue();
46140
46141 // If this can be matched by a zero extend, don't optimize.
46142 if (MaskVal.isMask()) {
46143 unsigned TO = MaskVal.countTrailingOnes();
46144 if (TO >= 8 && isPowerOf2_32(TO))
46145 return SDValue();
46146 }
46147
46148 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
46149 unsigned OldMaskSize = MaskVal.getMinSignedBits();
46150 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
46151 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
46152 (OldMaskSize > 32 && NewMaskSize <= 32)) {
46153 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
46154 SDLoc DL(N);
46155 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
46156 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
46157 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
46158 }
46159 return SDValue();
46160}
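
[Editor's note] The reordering that combineShiftRightLogical performs is a plain bitwise identity. Below is a short standalone check with an example mask where the shrink to 8 bits applies; illustrative only, with made-up values.

    // srl_of_and_demo.cpp -- reordering srl(and(X, C1), C2) on scalars.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xDEADBEEF;
      uint32_t C1 = 0x0000FF00, C2 = 8;
      // srl (and X, C1), C2  ==  and (srl X, C2), (C1 >> C2)
      assert(((x & C1) >> C2) == ((x >> C2) & (C1 >> C2)));
      // The original mask needs a wide immediate, but C1 >> C2 == 0xFF fits in
      // 8 bits, which is the code-size improvement the combine looks for.
      assert((C1 >> C2) == 0xFFu);
      return 0;
    }
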
46161
46162static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
46163 const X86Subtarget &Subtarget) {
46164 unsigned Opcode = N->getOpcode();
46165 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
46166
46167 SDLoc DL(N);
46168 EVT VT = N->getValueType(0);
46169 SDValue N0 = N->getOperand(0);
46170 SDValue N1 = N->getOperand(1);
46171 EVT SrcVT = N0.getValueType();
46172
46173 SDValue BC0 =
46174 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
46175 SDValue BC1 =
46176 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
46177
46178 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
46179 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
46180 // truncation trees that help us avoid lane crossing shuffles.
46181 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
46182 // TODO: We don't handle vXf64 shuffles yet.
46183 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46184 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
46185 SmallVector<SDValue> ShuffleOps;
46186 SmallVector<int> ShuffleMask, ScaledMask;
46187 SDValue Vec = peekThroughBitcasts(BCSrc);
46188 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
46189 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
46190 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
46191 // shuffle to a v4X64 width - we can probably relax this in the future.
46192 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
46193 ShuffleOps[0].getValueType().is256BitVector() &&
46194 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
46195 SDValue Lo, Hi;
46196 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46197 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
46198 Lo = DAG.getBitcast(SrcVT, Lo);
46199 Hi = DAG.getBitcast(SrcVT, Hi);
46200 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
46201 Res = DAG.getBitcast(ShufVT, Res);
46202 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
46203 return DAG.getBitcast(VT, Res);
46204 }
46205 }
46206 }
46207 }
46208
46209 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
46210 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46211 // If either/both ops are a shuffle that can scale to v2x64,
46212 // then see if we can perform this as a v4x32 post shuffle.
46213 SmallVector<SDValue> Ops0, Ops1;
46214 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
46215 bool IsShuf0 =
46216 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46217 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46218 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46219 bool IsShuf1 =
46220 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46221 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
46222 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46223 if (IsShuf0 || IsShuf1) {
46224 if (!IsShuf0) {
46225 Ops0.assign({BC0});
46226 ScaledMask0.assign({0, 1});
46227 }
46228 if (!IsShuf1) {
46229 Ops1.assign({BC1});
46230 ScaledMask1.assign({0, 1});
46231 }
46232
46233 SDValue LHS, RHS;
46234 int PostShuffle[4] = {-1, -1, -1, -1};
46235 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
46236 if (M < 0)
46237 return true;
46238 Idx = M % 2;
46239 SDValue Src = Ops[M / 2];
46240 if (!LHS || LHS == Src) {
46241 LHS = Src;
46242 return true;
46243 }
46244 if (!RHS || RHS == Src) {
46245 Idx += 2;
46246 RHS = Src;
46247 return true;
46248 }
46249 return false;
46250 };
46251 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
46252 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
46253 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
46254 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
46255 LHS = DAG.getBitcast(SrcVT, LHS);
46256 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
46257 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46258 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
46259 Res = DAG.getBitcast(ShufVT, Res);
46260 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
46261 return DAG.getBitcast(VT, Res);
46262 }
46263 }
46264 }
46265
46266 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
46267 if (VT.is256BitVector() && Subtarget.hasInt256()) {
46268 SmallVector<int> Mask0, Mask1;
46269 SmallVector<SDValue> Ops0, Ops1;
46270 SmallVector<int, 2> ScaledMask0, ScaledMask1;
46271 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46272 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46273 !Ops0.empty() && !Ops1.empty() &&
46274 all_of(Ops0,
46275 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
46276 all_of(Ops1,
46277 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
46278 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46279 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
46280 SDValue Op00 = peekThroughBitcasts(Ops0.front());
46281 SDValue Op10 = peekThroughBitcasts(Ops1.front());
46282 SDValue Op01 = peekThroughBitcasts(Ops0.back());
46283 SDValue Op11 = peekThroughBitcasts(Ops1.back());
46284 if ((Op00 == Op11) && (Op01 == Op10)) {
46285 std::swap(Op10, Op11);
46286 ShuffleVectorSDNode::commuteMask(ScaledMask1);
46287 }
46288 if ((Op00 == Op10) && (Op01 == Op11)) {
46289 const int Map[4] = {0, 2, 1, 3};
46290 SmallVector<int, 4> ShuffleMask(
46291 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
46292 Map[ScaledMask1[1]]});
46293 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
46294 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
46295 DAG.getBitcast(SrcVT, Op01));
46296 Res = DAG.getBitcast(ShufVT, Res);
46297 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
46298 return DAG.getBitcast(VT, Res);
46299 }
46300 }
46301 }
46302
46303 return SDValue();
46304}
46305
46306static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
46307 TargetLowering::DAGCombinerInfo &DCI,
46308 const X86Subtarget &Subtarget) {
46309 unsigned Opcode = N->getOpcode();
46310 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
46311 "Unexpected pack opcode");
46312
46313 EVT VT = N->getValueType(0);
46314 SDValue N0 = N->getOperand(0);
46315 SDValue N1 = N->getOperand(1);
46316 unsigned NumDstElts = VT.getVectorNumElements();
46317 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
46318 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
46319 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
46320 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
46321 "Unexpected PACKSS/PACKUS input type");
46322
46323 bool IsSigned = (X86ISD::PACKSS == Opcode);
46324
46325 // Constant Folding.
46326 APInt UndefElts0, UndefElts1;
46327 SmallVector<APInt, 32> EltBits0, EltBits1;
46328 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
46329 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
46330 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
46331 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
46332 unsigned NumLanes = VT.getSizeInBits() / 128;
46333 unsigned NumSrcElts = NumDstElts / 2;
46334 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
46335 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
46336
46337 APInt Undefs(NumDstElts, 0);
46338 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
46339 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
46340 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
46341 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
46342 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
46343 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
46344
46345 if (UndefElts[SrcIdx]) {
46346 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
46347 continue;
46348 }
46349
46350 APInt &Val = EltBits[SrcIdx];
46351 if (IsSigned) {
46352 // PACKSS: Truncate signed value with signed saturation.
46353 // Source values less than dst minint are saturated to minint.
46354 // Source values greater than dst maxint are saturated to maxint.
46355 if (Val.isSignedIntN(DstBitsPerElt))
46356 Val = Val.trunc(DstBitsPerElt);
46357 else if (Val.isNegative())
46358 Val = APInt::getSignedMinValue(DstBitsPerElt);
46359 else
46360 Val = APInt::getSignedMaxValue(DstBitsPerElt);
46361 } else {
46362 // PACKUS: Truncate signed value with unsigned saturation.
46363 // Source values less than zero are saturated to zero.
46364 // Source values greater than dst maxuint are saturated to maxuint.
46365 if (Val.isIntN(DstBitsPerElt))
46366 Val = Val.trunc(DstBitsPerElt);
46367 else if (Val.isNegative())
46368 Val = APInt::getZero(DstBitsPerElt);
46369 else
46370 Val = APInt::getAllOnes(DstBitsPerElt);
46371 }
46372 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
46373 }
46374 }
46375
46376 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
46377 }
46378
46379 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
46380 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
46381 return V;
46382
46383 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
46384 // truncate to create a larger truncate.
46385 if (Subtarget.hasAVX512() &&
46386 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
46387 N0.getOperand(0).getValueType() == MVT::v8i32) {
46388 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
46389 (!IsSigned &&
46390 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
46391 if (Subtarget.hasVLX())
46392 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
46393
46394 // Widen input to v16i32 so we can truncate that.
46395 SDLoc dl(N);
46396 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
46397 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
46398 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
46399 }
46400 }
46401
46402 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
46403 if (VT.is128BitVector()) {
46404 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
46405 SDValue Src0, Src1;
46406 if (N0.getOpcode() == ExtOpc &&
46407 N0.getOperand(0).getValueType().is64BitVector() &&
46408 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
46409 Src0 = N0.getOperand(0);
46410 }
46411 if (N1.getOpcode() == ExtOpc &&
46412 N1.getOperand(0).getValueType().is64BitVector() &&
46413 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
46414 Src1 = N1.getOperand(0);
46415 }
46416 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
46417 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
46418 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
46419 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
46420 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
46421 }
46422
46423 // Try again with pack(*_extend_vector_inreg, undef).
46424 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
46425 : ISD::ZERO_EXTEND_VECTOR_INREG;
46426 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
46427 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
46428 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
46429 DAG);
46430 }
46431
46432 // Attempt to combine as shuffle.
46433 SDValue Op(N, 0);
46434 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46435 return Res;
46436
46437 return SDValue();
46438}
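
[Editor's note] The saturation rules spelled out in the constant-folding loop above can be modeled per element with ordinary scalar code. The sketch below shows the PACKSS/PACKUS element behaviour only; it is not the LLVM implementation, the helper names are invented, and it assumes C++17 for std::clamp.

    // pack_saturate_demo.cpp -- per-element model of PACKSS/PACKUS saturation.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Signed saturation of an i16 source element to i8 (PACKSSWB behaviour).
    static int8_t packss(int16_t v) {
      return static_cast<int8_t>(std::clamp<int16_t>(v, -128, 127));
    }
    // Unsigned saturation of an i16 source element to u8 (PACKUSWB behaviour).
    static uint8_t packus(int16_t v) {
      return static_cast<uint8_t>(std::clamp<int16_t>(v, 0, 255));
    }

    int main() {
      assert(packss(1000) == 127);    // above dst maxint  -> maxint
      assert(packss(-1000) == -128);  // below dst minint  -> minint
      assert(packss(42) == 42);       // in range          -> plain truncate
      assert(packus(-5) == 0);        // negative          -> zero
      assert(packus(300) == 255);     // above dst maxuint -> maxuint
      assert(packus(200) == 200);     // in range          -> plain truncate
      return 0;
    }
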
46439
46440static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
46441 TargetLowering::DAGCombinerInfo &DCI,
46442 const X86Subtarget &Subtarget) {
46443 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
46444 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
46445 "Unexpected horizontal add/sub opcode");
46446
46447 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
46448 MVT VT = N->getSimpleValueType(0);
46449 SDValue LHS = N->getOperand(0);
46450 SDValue RHS = N->getOperand(1);
46451
46452 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
46453 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
46454 LHS.getOpcode() == RHS.getOpcode() &&
46455 LHS.getValueType() == RHS.getValueType() &&
46456 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
46457 SDValue LHS0 = LHS.getOperand(0);
46458 SDValue LHS1 = LHS.getOperand(1);
46459 SDValue RHS0 = RHS.getOperand(0);
46460 SDValue RHS1 = RHS.getOperand(1);
46461 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
46462 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
46463 SDLoc DL(N);
46464 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
46465 LHS0.isUndef() ? LHS1 : LHS0,
46466 RHS0.isUndef() ? RHS1 : RHS0);
46467 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
46468 Res = DAG.getBitcast(ShufVT, Res);
46469 SDValue NewLHS =
46470 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
46471 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
46472 SDValue NewRHS =
46473 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
46474 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
46475 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
46476 DAG.getBitcast(VT, NewRHS));
46477 }
46478 }
46479 }
46480
46481 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
46482 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
46483 return V;
46484
46485 return SDValue();
46486}
46487
46488static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
46489 TargetLowering::DAGCombinerInfo &DCI,
46490 const X86Subtarget &Subtarget) {
46491 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
46492 X86ISD::VSRL == N->getOpcode()) &&
46493 "Unexpected shift opcode");
46494 EVT VT = N->getValueType(0);
46495 SDValue N0 = N->getOperand(0);
46496 SDValue N1 = N->getOperand(1);
46497
46498 // Shift zero -> zero.
46499 if (ISD::isBuildVectorAllZeros(N0.getNode()))
46500 return DAG.getConstant(0, SDLoc(N), VT);
46501
46502 // Detect constant shift amounts.
46503 APInt UndefElts;
46504 SmallVector<APInt, 32> EltBits;
46505 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
46506 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
46507 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
46508 EltBits[0].getZExtValue(), DAG);
46509 }
46510
46511 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46512 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
46513 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
46514 return SDValue(N, 0);
46515
46516 return SDValue();
46517}
46518
46519static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
46520 TargetLowering::DAGCombinerInfo &DCI,
46521 const X86Subtarget &Subtarget) {
46522 unsigned Opcode = N->getOpcode();
46523 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
46524 X86ISD::VSRLI == Opcode) &&
46525 "Unexpected shift opcode");
46526 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
46527 EVT VT = N->getValueType(0);
46528 SDValue N0 = N->getOperand(0);
46529 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46530 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
46531 "Unexpected value type");
46532 assert(N->getOperand(1).getValueType() == MVT::i8 &&
46533 "Unexpected shift amount type");
46534
46535 // (shift undef, X) -> 0
46536 if (N0.isUndef())
46537 return DAG.getConstant(0, SDLoc(N), VT);
46538
46539 // Out of range logical bit shifts are guaranteed to be zero.
46540 // Out of range arithmetic bit shifts splat the sign bit.
46541 unsigned ShiftVal = N->getConstantOperandVal(1);
46542 if (ShiftVal >= NumBitsPerElt) {
46543 if (LogicalShift)
46544 return DAG.getConstant(0, SDLoc(N), VT);
46545 ShiftVal = NumBitsPerElt - 1;
46546 }
46547
46548 // (shift X, 0) -> X
46549 if (!ShiftVal)
46550 return N0;
46551
46552 // (shift 0, C) -> 0
46553 if (ISD::isBuildVectorAllZeros(N0.getNode()))
46554 // N0 is all zeros or undef. We guarantee that the bits shifted into the
46555 // result are all zeros, not undef.
46556 return DAG.getConstant(0, SDLoc(N), VT);
46557
46558 // (VSRAI -1, C) -> -1
46559 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
46560 // N0 is all ones or undef. We guarantee that the bits shifted into the
46561 // result are all ones, not undef.
46562 return DAG.getConstant(-1, SDLoc(N), VT);
46563
46564 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
46565 if (Opcode == N0.getOpcode()) {
46566 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
46567 unsigned NewShiftVal = ShiftVal + ShiftVal2;
46568 if (NewShiftVal >= NumBitsPerElt) {
46569 // Out of range logical bit shifts are guaranteed to be zero.
46570 // Out of range arithmetic bit shifts splat the sign bit.
46571 if (LogicalShift)
46572 return DAG.getConstant(0, SDLoc(N), VT);
46573 NewShiftVal = NumBitsPerElt - 1;
46574 }
46575 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
46576 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
46577 }
46578
46579 // We can decode 'whole byte' logical bit shifts as shuffles.
46580 if (LogicalShift && (ShiftVal % 8) == 0) {
46581 SDValue Op(N, 0);
46582 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46583 return Res;
46584 }
46585
46586 // Constant Folding.
46587 APInt UndefElts;
46588 SmallVector<APInt, 32> EltBits;
46589 if (N->isOnlyUserOf(N0.getNode()) &&
46590 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
46591 assert(EltBits.size() == VT.getVectorNumElements() &&
46592 "Unexpected shift value type");
46593 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
46594 // created an undef input due to no input bits being demanded, but user
46595 // still expects 0 in other bits.
46596 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
46597 APInt &Elt = EltBits[i];
46598 if (UndefElts[i])
46599 Elt = 0;
46600 else if (X86ISD::VSHLI == Opcode)
46601 Elt <<= ShiftVal;
46602 else if (X86ISD::VSRAI == Opcode)
46603 Elt.ashrInPlace(ShiftVal);
46604 else
46605 Elt.lshrInPlace(ShiftVal);
46606 }
46607 // Reset undef elements since they were zeroed above.
46608 UndefElts = 0;
46609 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
46610 }
46611
46612 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46613 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
46614 DCI))
46615 return SDValue(N, 0);
46616
46617 return SDValue();
46618}
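
[Editor's note] A scalar view of the immediate-shift folds above (merging shift amounts, and the out-of-range behaviour the comments describe). Illustrative only: the X86 shift nodes define out-of-range shifts where C++ does not, and signed right shift is assumed to be arithmetic as on x86 compilers.

    // vshift_imm_demo.cpp -- scalar view of the immediate-shift folds.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0x80001234;
      // (shift (shift X, C2), C1) -> (shift X, C1 + C2) while the sum is in range.
      assert(((x >> 3) >> 4) == (x >> 7));
      // For the X86 nodes, out-of-range logical shifts are defined to give 0 and
      // out-of-range arithmetic shifts splat the sign bit; that is why the code
      // clamps to NumBitsPerElt - 1 instead of emitting the over-wide shift.
      int32_t s = static_cast<int32_t>(x);   // negative value
      assert((s >> 31) == -1);               // sign splat == shift by width - 1
      return 0;
    }
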
46619
46620static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
46621 TargetLowering::DAGCombinerInfo &DCI,
46622 const X86Subtarget &Subtarget) {
46623 EVT VT = N->getValueType(0);
46624 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
46625 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
46626 N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
46627 "Unexpected vector insertion");
46628
46629 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
46630 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46631 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46632 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
46633 APInt::getAllOnes(NumBitsPerElt), DCI))
46634 return SDValue(N, 0);
46635 }
46636
46637 // Attempt to combine insertion patterns to a shuffle.
46638 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
46639 SDValue Op(N, 0);
46640 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46641 return Res;
46642 }
46643
46644 return SDValue();
46645}
46646
46647/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
46648/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
46649/// OR -> CMPNEQSS.
46650static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
46651 TargetLowering::DAGCombinerInfo &DCI,
46652 const X86Subtarget &Subtarget) {
46653 unsigned opcode;
46654
46655 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
46656 // we're requiring SSE2 for both.
46657 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
46658 SDValue N0 = N->getOperand(0);
46659 SDValue N1 = N->getOperand(1);
46660 SDValue CMP0 = N0.getOperand(1);
46661 SDValue CMP1 = N1.getOperand(1);
46662 SDLoc DL(N);
46663
46664 // The SETCCs should both refer to the same CMP.
46665 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
46666 return SDValue();
46667
46668 SDValue CMP00 = CMP0->getOperand(0);
46669 SDValue CMP01 = CMP0->getOperand(1);
46670 EVT VT = CMP00.getValueType();
46671
46672 if (VT == MVT::f32 || VT == MVT::f64 ||
46673 (VT == MVT::f16 && Subtarget.hasFP16())) {
46674 bool ExpectingFlags = false;
46675 // Check for any users that want flags:
46676 for (const SDNode *U : N->uses()) {
46677 if (ExpectingFlags)
46678 break;
46679
46680 switch (U->getOpcode()) {
46681 default:
46682 case ISD::BR_CC:
46683 case ISD::BRCOND:
46684 case ISD::SELECT:
46685 ExpectingFlags = true;
46686 break;
46687 case ISD::CopyToReg:
46688 case ISD::SIGN_EXTEND:
46689 case ISD::ZERO_EXTEND:
46690 case ISD::ANY_EXTEND:
46691 break;
46692 }
46693 }
46694
46695 if (!ExpectingFlags) {
46696 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
46697 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
46698
46699 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
46700 X86::CondCode tmp = cc0;
46701 cc0 = cc1;
46702 cc1 = tmp;
46703 }
46704
46705 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
46706 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
46707 // FIXME: need symbolic constants for these magic numbers.
46708 // See X86ATTInstPrinter.cpp:printSSECC().
46709 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
46710 if (Subtarget.hasAVX512()) {
46711 SDValue FSetCC =
46712 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
46713 DAG.getTargetConstant(x86cc, DL, MVT::i8));
46714 // Need to fill with zeros to ensure the bitcast will produce zeroes
46715 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
46716 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
46717 DAG.getConstant(0, DL, MVT::v16i1),
46718 FSetCC, DAG.getIntPtrConstant(0, DL));
46719 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
46720 N->getSimpleValueType(0));
46721 }
46722 SDValue OnesOrZeroesF =
46723 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
46724 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
46725
46726 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
46727 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
46728
46729 if (is64BitFP && !Subtarget.is64Bit()) {
46730 // On a 32-bit target, we cannot bitcast the 64-bit float to a
46731 // 64-bit integer, since that's not a legal type. Since
46732 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
46733 // bits, but can do this little dance to extract the lowest 32 bits
46734 // and work with those going forward.
46735 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
46736 OnesOrZeroesF);
46737 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
46738 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
46739 Vector32, DAG.getIntPtrConstant(0, DL));
46740 IntVT = MVT::i32;
46741 }
46742
46743 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
46744 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
46745 DAG.getConstant(1, DL, IntVT));
46746 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
46747 ANDed);
46748 return OneBitOfTruth;
46749 }
46750 }
46751 }
46752 }
46753 return SDValue();
46754}
46755
46756/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
46757static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
46758 assert(N->getOpcode() == ISD::AND);
46759
46760 MVT VT = N->getSimpleValueType(0);
46761 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
46762 return SDValue();
46763
46764 SDValue X, Y;
46765 SDValue N0 = N->getOperand(0);
46766 SDValue N1 = N->getOperand(1);
46767
46768 auto GetNot = [&VT, &DAG](SDValue V) {
46769 // Basic X = NOT(Y) detection.
46770 if (SDValue Not = IsNOT(V, DAG))
46771 return Not;
46772 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
46773 if (V.getOpcode() == X86ISD::VBROADCAST) {
46774 SDValue Src = V.getOperand(0);
46775 EVT SrcVT = Src.getValueType();
46776 if (!SrcVT.isVector())
46777 return SDValue();
46778 if (SDValue Not = IsNOT(Src, DAG))
46779 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
46780 DAG.getBitcast(SrcVT, Not));
46781 }
46782 return SDValue();
46783 };
46784
46785 if (SDValue Not = GetNot(N0)) {
46786 X = Not;
46787 Y = N1;
46788 } else if (SDValue Not = GetNot(N1)) {
46789 X = Not;
46790 Y = N0;
46791 } else
46792 return SDValue();
46793
46794 X = DAG.getBitcast(VT, X);
46795 Y = DAG.getBitcast(VT, Y);
46796 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
46797}
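
[Editor's note] The ANDNP fold rests on the identity (and (xor X, -1), Y) == (~X & Y), which is what the x86 PANDN/ANDNP instructions compute per lane. A one-line scalar check, purely illustrative:

    // andnp_demo.cpp -- the identity behind the ANDNP fold.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0xF0F0F0F0u, Y = 0x12345678u;
      // (and (xor X, -1), Y) is exactly ~X & Y.
      assert(((X ^ 0xFFFFFFFFu) & Y) == (~X & Y));
      return 0;
    }
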
46798
46799// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
46800// logical operations, like in the example below.
46801// or (and (truncate x, truncate y)),
46802// (xor (truncate z, build_vector (constants)))
46803// Given a target type \p VT, we generate
46804// or (and x, y), (xor z, zext(build_vector (constants)))
46805 // given that x, y and z are of type \p VT. We can do so if each operand is
46806 // either a truncate from VT, a vector of constants, or can itself be
46807 // recursively promoted.
46808static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
46809 unsigned Depth) {
46810 // Limit recursion to avoid excessive compile times.
46811 if (Depth >= SelectionDAG::MaxRecursionDepth)
46812 return SDValue();
46813
46814 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
46815 N->getOpcode() != ISD::OR)
46816 return SDValue();
46817
46818 SDValue N0 = N->getOperand(0);
46819 SDValue N1 = N->getOperand(1);
46820 SDLoc DL(N);
46821
46822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46823 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
46824 return SDValue();
46825
46826 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
46827 N0 = NN0;
46828 else {
46829 // The Left side has to be a trunc.
46830 if (N0.getOpcode() != ISD::TRUNCATE)
46831 return SDValue();
46832
46833 // The type of the truncated inputs.
46834 if (N0.getOperand(0).getValueType() != VT)
46835 return SDValue();
46836
46837 N0 = N0.getOperand(0);
46838 }
46839
46840 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
46841 N1 = NN1;
46842 else {
46843 // The right side has to be a 'trunc' or a constant vector.
46844 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
46845 N1.getOperand(0).getValueType() == VT;
46846 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
46847 return SDValue();
46848
46849 if (RHSTrunc)
46850 N1 = N1.getOperand(0);
46851 else
46852 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
46853 }
46854
46855 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
46856}
46857
46858// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
46859// register. In most cases we actually compare or select YMM-sized registers
46860// and mixing the two types creates horrible code. This method optimizes
46861// some of the transition sequences.
46862// Even with AVX-512 this is still useful for removing casts around logical
46863// operations on vXi1 mask types.
46864static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
46865 const X86Subtarget &Subtarget) {
46866 EVT VT = N->getValueType(0);
46867 assert(VT.isVector() && "Expected vector type");
46868
46869 SDLoc DL(N);
46870 assert((N->getOpcode() == ISD::ANY_EXTEND ||
46871 N->getOpcode() == ISD::ZERO_EXTEND ||
46872 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
46873
46874 SDValue Narrow = N->getOperand(0);
46875 EVT NarrowVT = Narrow.getValueType();
46876
46877 // Generate the wide operation.
46878 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
46879 if (!Op)
46880 return SDValue();
46881 switch (N->getOpcode()) {
46882 default: llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46882)
;
46883 case ISD::ANY_EXTEND:
46884 return Op;
46885 case ISD::ZERO_EXTEND:
46886 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
46887 case ISD::SIGN_EXTEND:
46888 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
46889 Op, DAG.getValueType(NarrowVT));
46890 }
46891}
46892
46893static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
46894 unsigned FPOpcode;
46895 switch (Opcode) {
46896 default: llvm_unreachable("Unexpected input node for FP logic conversion")::llvm::llvm_unreachable_internal("Unexpected input node for FP logic conversion"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 46896)
;
46897 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46898 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46899 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46900 }
46901 return FPOpcode;
46902}
46903
46904/// If both input operands of a logic op are being cast from floating-point
46905/// types or FP compares, try to convert this into a floating-point logic node
46906/// to avoid unnecessary moves from SSE to integer registers.
46907static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
46908 TargetLowering::DAGCombinerInfo &DCI,
46909 const X86Subtarget &Subtarget) {
46910 EVT VT = N->getValueType(0);
46911 SDValue N0 = N->getOperand(0);
46912 SDValue N1 = N->getOperand(1);
46913 SDLoc DL(N);
46914
46915 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
46916 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
46917 return SDValue();
46918
46919 SDValue N00 = N0.getOperand(0);
46920 SDValue N10 = N1.getOperand(0);
46921 EVT N00Type = N00.getValueType();
46922 EVT N10Type = N10.getValueType();
46923
46924 // Ensure that both types are the same and are legal scalar fp types.
46925 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
46926 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
46927 (Subtarget.hasFP16() && N00Type == MVT::f16)))
46928 return SDValue();
46929
46930 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
46931 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
46932 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
46933 return DAG.getBitcast(VT, FPLogic);
46934 }
46935
46936 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
46937 !N1.hasOneUse())
46938 return SDValue();
46939
46940 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
46941 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
46942
46943 // The vector ISA for FP predicates is incomplete before AVX, so converting
46944 // COMIS* to CMPS* may not be a win before AVX.
46945 if (!Subtarget.hasAVX() &&
46946 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
46947 return SDValue();
46948
46949 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
46950 // and vector logic:
46951 // logic (setcc N00, N01), (setcc N10, N11) -->
46952 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
46953 unsigned NumElts = 128 / N00Type.getSizeInBits();
46954 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
46955 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46956 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
46957 SDValue N01 = N0.getOperand(1);
46958 SDValue N11 = N1.getOperand(1);
46959 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
46960 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
46961 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
46962 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
46963 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
46964 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
46965 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
46966 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
46967}
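As an aside (not part of this file): the SETCC path above typically arises from source that combines two scalar FP comparisons with a bitwise '&' or '|'. A minimal hedged C++ sketch of such a trigger, with an illustrative function name:

  // Illustrative only. Both comparisons feed a single integer AND, so the
  // combine above can rewrite COMIS*+SETCC pairs into a vector CMPS* plus FP
  // logic and an extract of element 0, avoiding SSE->GPR moves.
  static bool bothLess(float a, float b, float c, float d) {
    return (a < b) & (c < d); // bitwise '&' keeps two i1 setcc nodes live
  }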
46968
46969// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
46970// to reduce XMM->GPR traffic.
46971static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
46972 unsigned Opc = N->getOpcode();
46973 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
46974 "Unexpected bit opcode");
46975
46976 SDValue N0 = N->getOperand(0);
46977 SDValue N1 = N->getOperand(1);
46978
46979 // Both operands must be single use MOVMSK.
46980 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
46981 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
46982 return SDValue();
46983
46984 SDValue Vec0 = N0.getOperand(0);
46985 SDValue Vec1 = N1.getOperand(0);
46986 EVT VecVT0 = Vec0.getValueType();
46987 EVT VecVT1 = Vec1.getValueType();
46988
46989 // Both MOVMSK operands must be from vectors of the same size and same element
46990 // size, but it's OK for them to differ in fp/int type.
46991 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
46992 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
46993 return SDValue();
46994
46995 SDLoc DL(N);
46996 unsigned VecOpc =
46997 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
46998 SDValue Result =
46999 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
47000 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47001}
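The MOVMSK fold relies on sign-bit extraction distributing over bitwise ops: each lane's sign bit of BITOP(X,Y) equals the BITOP of the per-lane sign bits. A hedged SSE-intrinsics sketch of the two equivalent forms (assumes <immintrin.h>; not part of this file):

  #include <immintrin.h>
  // Two XMM->GPR transfers (MOVMSKPS twice), then a scalar OR.
  int orOfMasks(__m128 x, __m128 y) {
    return _mm_movemask_ps(x) | _mm_movemask_ps(y);
  }
  // Same 4-bit result with a single MOVMSKPS; this is the form the fold keeps.
  int maskOfOr(__m128 x, __m128 y) {
    return _mm_movemask_ps(_mm_or_ps(x, y));
  }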
47002
47003// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
47004// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
47005// handles in InstCombine.
47006static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
47007 unsigned Opc = N->getOpcode();
47008 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47009 "Unexpected bit opcode");
47010
47011 SDValue N0 = N->getOperand(0);
47012 SDValue N1 = N->getOperand(1);
47013 EVT VT = N->getValueType(0);
47014
47015 // Both operands must be single use.
47016 if (!N0.hasOneUse() || !N1.hasOneUse())
47017 return SDValue();
47018
47019 // Search for matching shifts.
47020 SDValue BC0 = peekThroughOneUseBitcasts(N0);
47021 SDValue BC1 = peekThroughOneUseBitcasts(N1);
47022
47023 unsigned BCOpc = BC0.getOpcode();
47024 EVT BCVT = BC0.getValueType();
47025 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
47026 return SDValue();
47027
47028 switch (BCOpc) {
47029 case X86ISD::VSHLI:
47030 case X86ISD::VSRLI:
47031 case X86ISD::VSRAI: {
47032 if (BC0.getOperand(1) != BC1.getOperand(1))
47033 return SDValue();
47034
47035 SDLoc DL(N);
47036 SDValue BitOp =
47037 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
47038 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
47039 return DAG.getBitcast(VT, Shift);
47040 }
47041 }
47042
47043 return SDValue();
47044}
47045
47046/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
47047/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
47048/// with a shift-right to eliminate loading the vector constant mask value.
47049static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
47050 const X86Subtarget &Subtarget) {
47051 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
47052 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
47053 EVT VT = Op0.getValueType();
47054 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
47055 return SDValue();
47056
47057 // Try to convert an "is positive" signbit masking operation into arithmetic
47058 // shift and "andn". This saves a materialization of a -1 vector constant.
47059 // The "is negative" variant should be handled more generally because it only
47060 // requires "and" rather than "andn":
47061 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
47062 //
47063 // This is limited to the original type to avoid producing even more bitcasts.
47064 // If the bitcasts can't be eliminated, then it is unlikely that this fold
47065 // will be profitable.
47066 if (N->getValueType(0) == VT &&
47067 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
47068 SDValue X, Y;
47069 if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
47070 isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
47071 X = Op1.getOperand(0);
47072 Y = Op0;
47073 } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
47074 isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
47075 X = Op0.getOperand(0);
47076 Y = Op1;
47077 }
47078 if (X && Y) {
47079 SDLoc DL(N);
47080 SDValue Sra =
47081 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
47082 VT.getScalarSizeInBits() - 1, DAG);
47083 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
47084 }
47085 }
47086
47087 APInt SplatVal;
47088 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
47089 !SplatVal.isMask())
47090 return SDValue();
47091
47092 // Don't prevent creation of ANDN.
47093 if (isBitwiseNot(Op0))
47094 return SDValue();
47095
47096 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
47097 return SDValue();
47098
47099 unsigned EltBitWidth = VT.getScalarSizeInBits();
47100 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
47101 return SDValue();
47102
47103 SDLoc DL(N);
47104 unsigned ShiftVal = SplatVal.countTrailingOnes();
47105 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
47106 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
47107 return DAG.getBitcast(N->getValueType(0), Shift);
47108}
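The final mask-to-shift rewrite uses a scalar identity: when every element of Op0 is known to be all-zeros or all-ones, AND-ing with a low-bits mask of k set bits equals a logical shift right by (EltBitWidth - k). A small scalar sketch, assuming 0 < k < 32 so the shift amount stays in range (illustrative, not part of this file):

  #include <cstdint>
  // x must be 0 or 0xFFFFFFFF (e.g. a sign-extended compare result).
  uint32_t andWithLowMask(uint32_t x, unsigned k) {
    return x & ((1u << k) - 1u); // needs the mask constant
  }
  uint32_t shiftInstead(uint32_t x, unsigned k) {
    return x >> (32u - k);       // same value, no constant load
  }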
47109
47110// Get the index node from the lowered DAG of a GEP IR instruction with one
47111// indexing dimension.
47112static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
47113 if (Ld->isIndexed())
47114 return SDValue();
47115
47116 SDValue Base = Ld->getBasePtr();
47117
47118 if (Base.getOpcode() != ISD::ADD)
47119 return SDValue();
47120
47121 SDValue ShiftedIndex = Base.getOperand(0);
47122
47123 if (ShiftedIndex.getOpcode() != ISD::SHL)
47124 return SDValue();
47125
47126 return ShiftedIndex.getOperand(0);
47127
47128}
47129
47130static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
47131 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
47132 switch (VT.getSizeInBits()) {
47133 default: return false;
47134 case 64: return Subtarget.is64Bit();
47135 case 32: return true;
47136 }
47137 }
47138 return false;
47139}
47140
47141// This function recognizes cases where the X86 bzhi instruction can replace an
47142// 'and-load' sequence.
47143// In the case of loading an integer value from an array of constants which is
47144// defined as follows:
47145//
47146// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
47147//
47148// and then applying a bitwise 'and' of the result with another input,
47149// this is equivalent to performing bzhi (zero high bits) on the input, with the
47150// same index as the load.
47151static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
47152 const X86Subtarget &Subtarget) {
47153 MVT VT = Node->getSimpleValueType(0);
47154 SDLoc dl(Node);
47155
47156 // Check if subtarget has BZHI instruction for the node's type
47157 if (!hasBZHI(Subtarget, VT))
47158 return SDValue();
47159
47160 // Try matching the pattern for both operands.
47161 for (unsigned i = 0; i < 2; i++) {
47162 SDValue N = Node->getOperand(i);
47163 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
47164
47165 // Bail out if the operand is not a load instruction.
47166 if (!Ld)
47167 return SDValue();
47168
47169 const Value *MemOp = Ld->getMemOperand()->getValue();
47170
47171 if (!MemOp)
47172 return SDValue();
47173
47174 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
47175 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
47176 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
47177
47178 Constant *Init = GV->getInitializer();
47179 Type *Ty = Init->getType();
47180 if (!isa<ConstantDataArray>(Init) ||
47181 !Ty->getArrayElementType()->isIntegerTy() ||
47182 Ty->getArrayElementType()->getScalarSizeInBits() !=
47183 VT.getSizeInBits() ||
47184 Ty->getArrayNumElements() >
47185 Ty->getArrayElementType()->getScalarSizeInBits())
47186 continue;
47187
47188 // Check if the array's constant elements are suitable to our case.
47189 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
47190 bool ConstantsMatch = true;
47191 for (uint64_t j = 0; j < ArrayElementCount; j++) {
47192 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
47193 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
47194 ConstantsMatch = false;
47195 break;
47196 }
47197 }
47198 if (!ConstantsMatch)
47199 continue;
47200
47201 // Do the transformation (For 32-bit type):
47202 // -> (and (load arr[idx]), inp)
47203 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
47204 // that will be replaced with one bzhi instruction.
47205 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
47206 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
47207
47208 // Get the Node which indexes into the array.
47209 SDValue Index = getIndexFromUnindexedLoad(Ld);
47210 if (!Index)
47211 return SDValue();
47212 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
47213
47214 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
47215 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
47216
47217 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
47218 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
47219
47220 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
47221 }
47222 }
47223 }
47224 }
47225 return SDValue();
47226}
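Concretely, the table the combine matches holds (1 << i) - 1 at index i, so the and-load just clears the bits from position idx upward, which is what BZHI does in one instruction. A hedged C++ sketch of the source-level pattern (array name and size are illustrative; the fold additionally requires the table to be a constant global with a definitive initializer, as checked above):

  #include <cstdint>
  // mask_table[i] == (1u << i) - 1
  static const uint32_t mask_table[8] = {0x0, 0x1, 0x3, 0x7,
                                         0xF, 0x1F, 0x3F, 0x7F};
  uint32_t keepLowBits(uint32_t x, unsigned idx) {
    return x & mask_table[idx]; // idx < 8; with BMI2 this can become BZHI
  }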
47227
47228// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
47229// where C is a mask containing the same number of bits as the setcc and
47230// where the setcc will freely zero the upper bits of the k-register. We can
47231// replace the undef in the concat with 0s and remove the AND. This mainly
47232// helps with v2i1/v4i1 setcc being cast to scalar.
47233static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
47234 const X86Subtarget &Subtarget) {
47235 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
47236
47237 EVT VT = N->getValueType(0);
47238
47239 // Make sure this is an AND with constant. We will check the value of the
47240 // constant later.
47241 if (!isa<ConstantSDNode>(N->getOperand(1)))
47242 return SDValue();
47243
47244 // This is implied by the ConstantSDNode.
47245 assert(!VT.isVector() && "Expected scalar VT!");
47246
47247 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
47248 !N->getOperand(0).hasOneUse() ||
47249 !N->getOperand(0).getOperand(0).hasOneUse())
47250 return SDValue();
47251
47252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47253 SDValue Src = N->getOperand(0).getOperand(0);
47254 EVT SrcVT = Src.getValueType();
47255 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
47256 !TLI.isTypeLegal(SrcVT))
47257 return SDValue();
47258
47259 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
47260 return SDValue();
47261
47262 // We only care about the first subvector of the concat, we expect the
47263 // other subvectors to be ignored due to the AND if we make the change.
47264 SDValue SubVec = Src.getOperand(0);
47265 EVT SubVecVT = SubVec.getValueType();
47266
47267 // First subvector should be a setcc with a legal result type. The RHS of the
47268 // AND should be a mask with this many bits.
47269 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
47270 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
47271 return SDValue();
47272
47273 EVT SetccVT = SubVec.getOperand(0).getValueType();
47274 if (!TLI.isTypeLegal(SetccVT) ||
47275 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
47276 return SDValue();
47277
47278 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
47279 return SDValue();
47280
47281 // We passed all the checks. Rebuild the concat_vectors with zeroes
47282 // and cast it back to VT.
47283 SDLoc dl(N);
47284 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
47285 DAG.getConstant(0, dl, SubVecVT));
47286 Ops[0] = SubVec;
47287 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
47288 Ops);
47289 return DAG.getBitcast(VT, Concat);
47290}
47291
47292static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
47293 TargetLowering::DAGCombinerInfo &DCI,
47294 const X86Subtarget &Subtarget) {
47295 SDValue N0 = N->getOperand(0);
47296 SDValue N1 = N->getOperand(1);
47297 EVT VT = N->getValueType(0);
47298 SDLoc dl(N);
47299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47300
47301 // If this is SSE1 only convert to FAND to avoid scalarization.
47302 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47303 return DAG.getBitcast(MVT::v4i32,
47304 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
47305 DAG.getBitcast(MVT::v4f32, N0),
47306 DAG.getBitcast(MVT::v4f32, N1)));
47307 }
47308
47309 // Use a 32-bit and+zext if upper bits known zero.
47310 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
47311 APInt HiMask = APInt::getHighBitsSet(64, 32);
47312 if (DAG.MaskedValueIsZero(N1, HiMask) ||
47313 DAG.MaskedValueIsZero(N0, HiMask)) {
47314 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
47315 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
47316 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
47317 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
47318 }
47319 }
47320
47321 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
47322 // TODO: Support multiple SrcOps.
47323 if (VT == MVT::i1) {
47324 SmallVector<SDValue, 2> SrcOps;
47325 SmallVector<APInt, 2> SrcPartials;
47326 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
47327 SrcOps.size() == 1) {
47328 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
47329 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47330 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
47331 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
47332 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
47333 if (Mask) {
47334 assert(SrcPartials[0].getBitWidth() == NumElts &&
47335 "Unexpected partial reduction mask");
47336 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
47337 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
47338 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
47339 }
47340 }
47341 }
47342
47343 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
47344 return V;
47345
47346 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47347 return R;
47348
47349 if (SDValue R = combineBitOpWithShift(N, DAG))
47350 return R;
47351
47352 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
47353 return FPLogic;
47354
47355 if (DCI.isBeforeLegalizeOps())
47356 return SDValue();
47357
47358 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
47359 return R;
47360
47361 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
47362 return R;
47363
47364 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
47365 return ShiftRight;
47366
47367 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
47368 return R;
47369
47370 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
47371 // This avoids a slow variable shift (moving the shift amount to ECX etc.).
47372 if (isOneConstant(N1) && N0->hasOneUse()) {
47373 SDValue Src = N0;
47374 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
47375 Src.getOpcode() == ISD::TRUNCATE) &&
47376 Src.getOperand(0)->hasOneUse())
47377 Src = Src.getOperand(0);
47378 X86::CondCode X86CC = X86::COND_B;
47379 // Peek through AND(NOT(SRL(X,Y)),1).
47380 if (isBitwiseNot(Src)) {
47381 Src = Src.getOperand(0);
47382 X86CC = X86::COND_AE;
47383 }
47384 if (Src.getOpcode() == ISD::SRL &&
47385 !isa<ConstantSDNode>(Src.getOperand(1))) {
47386 SDValue BitNo = Src.getOperand(1);
47387 Src = Src.getOperand(0);
47388 // Peek through AND(SRL(NOT(X),Y),1).
47389 if (isBitwiseNot(Src)) {
47390 Src = Src.getOperand(0);
47391 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
47392 }
47393 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
47394 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
47395 }
47396 }
47397
47398 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47399 // Attempt to recursively combine a bitmask AND with shuffles.
47400 SDValue Op(N, 0);
47401 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47402 return Res;
47403
47404 // If either operand is a constant mask, then only the elements that aren't
47405 // zero are actually demanded by the other operand.
47406 auto GetDemandedMasks = [&](SDValue Op) {
47407 APInt UndefElts;
47408 SmallVector<APInt> EltBits;
47409 int NumElts = VT.getVectorNumElements();
47410 int EltSizeInBits = VT.getScalarSizeInBits();
47411 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
47412 APInt DemandedElts = APInt::getAllOnes(NumElts);
47413 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
47414 EltBits)) {
47415 DemandedBits.clearAllBits();
47416 DemandedElts.clearAllBits();
47417 for (int I = 0; I != NumElts; ++I)
47418 if (!EltBits[I].isZero()) {
47419 DemandedBits |= EltBits[I];
47420 DemandedElts.setBit(I);
47421 }
47422 }
47423 return std::make_pair(DemandedBits, DemandedElts);
47424 };
47425 std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
47426 std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0);
47427
47428 if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
47429 TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
47430 TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
47431 TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
47432 if (N->getOpcode() != ISD::DELETED_NODE)
47433 DCI.AddToWorklist(N);
47434 return SDValue(N, 0);
47435 }
47436
47437 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Demand0.first,
47438 Demand0.second, DAG);
47439 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Demand1.first,
47440 Demand1.second, DAG);
47441 if (NewN0 || NewN1)
47442 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
47443 NewN1 ? NewN1 : N1);
47444 }
47445
47446 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
47447 if ((VT.getScalarSizeInBits() % 8) == 0 &&
47448 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47449 isa<ConstantSDNode>(N0.getOperand(1))) {
47450 SDValue BitMask = N1;
47451 SDValue SrcVec = N0.getOperand(0);
47452 EVT SrcVecVT = SrcVec.getValueType();
47453
47454 // Check that the constant bitmask masks whole bytes.
47455 APInt UndefElts;
47456 SmallVector<APInt, 64> EltBits;
47457 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
47458 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
47459 llvm::all_of(EltBits, [](const APInt &M) {
47460 return M.isZero() || M.isAllOnes();
47461 })) {
47462 unsigned NumElts = SrcVecVT.getVectorNumElements();
47463 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
47464 unsigned Idx = N0.getConstantOperandVal(1);
47465
47466 // Create a root shuffle mask from the byte mask and the extracted index.
47467 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
47468 for (unsigned i = 0; i != Scale; ++i) {
47469 if (UndefElts[i])
47470 continue;
47471 int VecIdx = Scale * Idx + i;
47472 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
47473 }
47474
47475 if (SDValue Shuffle = combineX86ShufflesRecursively(
47476 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
47477 X86::MaxShuffleCombineDepth,
47478 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
47479 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
47480 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
47481 N0.getOperand(1));
47482 }
47483 }
47484
47485 return SDValue();
47486}
47487
47488// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
47489static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
47490 const X86Subtarget &Subtarget) {
47491 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
47492
47493 MVT VT = N->getSimpleValueType(0);
47494 unsigned EltSizeInBits = VT.getScalarSizeInBits();
47495 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
47496 return SDValue();
47497
47498 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
47499 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
47500 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
47501 return SDValue();
47502
47503 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
47504 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
47505 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
47506 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
47507 return SDValue();
47508
47509 // Attempt to extract constant byte masks.
47510 APInt UndefElts0, UndefElts1;
47511 SmallVector<APInt, 32> EltBits0, EltBits1;
47512 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
47513 false, false))
47514 return SDValue();
47515 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
47516 false, false))
47517 return SDValue();
47518
47519 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
47520 // TODO - add UNDEF elts support.
47521 if (UndefElts0[i] || UndefElts1[i])
47522 return SDValue();
47523 if (EltBits0[i] != ~EltBits1[i])
47524 return SDValue();
47525 }
47526
47527 SDLoc DL(N);
47528
47529 if (useVPTERNLOG(Subtarget, VT)) {
47530 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
47531 // VPTERNLOG is only available as vXi32/64-bit types.
47532 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
47533 MVT OpVT =
47534 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
47535 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
47536 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
47537 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
47538 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
47539 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
47540 DAG, Subtarget);
47541 return DAG.getBitcast(VT, Res);
47542 }
47543
47544 SDValue X = N->getOperand(0);
47545 SDValue Y =
47546 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
47547 DAG.getBitcast(VT, N1.getOperand(0)));
47548 return DAG.getNode(ISD::OR, DL, VT, X, Y);
47549}
47550
47551// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
47552static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
47553 if (N->getOpcode() != ISD::OR)
47554 return false;
47555
47556 SDValue N0 = N->getOperand(0);
47557 SDValue N1 = N->getOperand(1);
47558
47559 // Canonicalize AND to LHS.
47560 if (N1.getOpcode() == ISD::AND)
47561 std::swap(N0, N1);
47562
47563 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
47564 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
47565 return false;
47566
47567 Mask = N1.getOperand(0);
47568 X = N1.getOperand(1);
47569
47570 // Check to see if the mask appeared in both the AND and ANDNP.
47571 if (N0.getOperand(0) == Mask)
47572 Y = N0.getOperand(1);
47573 else if (N0.getOperand(1) == Mask)
47574 Y = N0.getOperand(0);
47575 else
47576 return false;
47577
47578 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
47579 // ANDNP combine allows other combines to happen that prevent matching.
47580 return true;
47581}
47582
47583// Try to fold:
47584// (or (and (m, y), (pandn m, x)))
47585// into:
47586// (vselect m, x, y)
47587// As a special case, try to fold:
47588// (or (and (m, (sub 0, x)), (pandn m, x)))
47589// into:
47590// (sub (xor X, M), M)
47591static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
47592 const X86Subtarget &Subtarget) {
47593 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
47594
47595 EVT VT = N->getValueType(0);
47596 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
47597 (VT.is256BitVector() && Subtarget.hasInt256())))
47598 return SDValue();
47599
47600 SDValue X, Y, Mask;
47601 if (!matchLogicBlend(N, X, Y, Mask))
47602 return SDValue();
47603
47604 // Validate that X, Y, and Mask are bitcasts, and see through them.
47605 Mask = peekThroughBitcasts(Mask);
47606 X = peekThroughBitcasts(X);
47607 Y = peekThroughBitcasts(Y);
47608
47609 EVT MaskVT = Mask.getValueType();
47610 unsigned EltBits = MaskVT.getScalarSizeInBits();
47611
47612 // TODO: Attempt to handle floating point cases as well?
47613 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
47614 return SDValue();
47615
47616 SDLoc DL(N);
47617
47618 // Attempt to combine to conditional negate: (sub (xor X, M), M)
47619 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
47620 DAG, Subtarget))
47621 return Res;
47622
47623 // PBLENDVB is only available on SSE 4.1.
47624 if (!Subtarget.hasSSE41())
47625 return SDValue();
47626
47627 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
47628 if (Subtarget.hasVLX())
47629 return SDValue();
47630
47631 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
47632
47633 X = DAG.getBitcast(BlendVT, X);
47634 Y = DAG.getBitcast(BlendVT, Y);
47635 Mask = DAG.getBitcast(BlendVT, Mask);
47636 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
47637 return DAG.getBitcast(VT, Mask);
47638}
47639
47640// Helper function for combineOrCmpEqZeroToCtlzSrl
47641// Transforms:
47642// seteq(cmp x, 0)
47643// into:
47644// srl(ctlz x), log2(bitsize(x))
47645// Input pattern is checked by caller.
47646static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
47647 SDValue Cmp = Op.getOperand(1);
47648 EVT VT = Cmp.getOperand(0).getValueType();
47649 unsigned Log2b = Log2_32(VT.getSizeInBits());
47650 SDLoc dl(Op);
47651 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
47652 // The result of the shift is true or false, and on X86, the 32-bit
47653 // encoding of shr and lzcnt is more desirable.
47654 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
47655 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
47656 DAG.getConstant(Log2b, dl, MVT::i8));
47657 return Scc;
47658}
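The helper leans on LZCNT semantics: lzcnt of a 32-bit value is 32 only for zero, so shifting the count right by log2(32) = 5 yields exactly the seteq-with-zero result. A hedged scalar sketch (assumes <immintrin.h> and an LZCNT-capable target; not part of this file):

  #include <immintrin.h>
  // _lzcnt_u32(0) == 32, so bit 5 of the count is set only when x == 0.
  unsigned isZero(unsigned x) {
    return _lzcnt_u32(x) >> 5; // 1 if x == 0, otherwise 0
  }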
47659
47660// Try to transform:
47661// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
47662// into:
47663// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
47664// Will also attempt to match more generic cases, eg:
47665// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
47666// Only applies if the target supports the FastLZCNT feature.
47667static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
47668 TargetLowering::DAGCombinerInfo &DCI,
47669 const X86Subtarget &Subtarget) {
47670 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
47671 return SDValue();
47672
47673 auto isORCandidate = [](SDValue N) {
47674 return (N->getOpcode() == ISD::OR && N->hasOneUse());
47675 };
47676
47677 // Check the zero extend is extending to 32-bit or more. The code generated by
47678 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
47679 // instructions to clear the upper bits.
47680 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
47681 !isORCandidate(N->getOperand(0)))
47682 return SDValue();
47683
47684 // Check the node matches: setcc(eq, cmp 0)
47685 auto isSetCCCandidate = [](SDValue N) {
47686 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
47687 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
47688 N->getOperand(1).getOpcode() == X86ISD::CMP &&
47689 isNullConstant(N->getOperand(1).getOperand(1)) &&
47690 N->getOperand(1).getValueType().bitsGE(MVT::i32);
47691 };
47692
47693 SDNode *OR = N->getOperand(0).getNode();
47694 SDValue LHS = OR->getOperand(0);
47695 SDValue RHS = OR->getOperand(1);
47696
47697 // Save nodes matching or(or, setcc(eq, cmp 0)).
47698 SmallVector<SDNode *, 2> ORNodes;
47699 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
47700 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
47701 ORNodes.push_back(OR);
47702 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
47703 LHS = OR->getOperand(0);
47704 RHS = OR->getOperand(1);
47705 }
47706
47707 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
47708 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
47709 !isORCandidate(SDValue(OR, 0)))
47710 return SDValue();
47711
47712 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
47713 // to
47714 // or(srl(ctlz),srl(ctlz)).
47715 // The dag combiner can then fold it into:
47716 // srl(or(ctlz, ctlz)).
47717 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
47718 SDValue Ret, NewRHS;
47719 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
47720 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
47721
47722 if (!Ret)
47723 return SDValue();
47724
47725 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
47726 while (ORNodes.size() > 0) {
47727 OR = ORNodes.pop_back_val();
47728 LHS = OR->getOperand(0);
47729 RHS = OR->getOperand(1);
47730 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
47731 if (RHS->getOpcode() == ISD::OR)
47732 std::swap(LHS, RHS);
47733 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
47734 if (!NewRHS)
47735 return SDValue();
47736 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
47737 }
47738
47739 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
47740}
47741
47742static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
47743 SDValue And1_L, SDValue And1_R, SDLoc DL,
47744 SelectionDAG &DAG) {
47745 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
47746 return SDValue();
47747 SDValue NotOp = And0_L->getOperand(0);
47748 if (NotOp == And1_R)
47749 std::swap(And1_R, And1_L);
47750 if (NotOp != And1_L)
47751 return SDValue();
47752
47753 // (~(NotOp) & And0_R) | (NotOp & And1_R)
47754 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
47755 EVT VT = And1_L->getValueType(0);
47756 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
47757 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
47758 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
47759 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
47760 return Xor1;
47761}
47762
47763/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
47764/// equivalent `((x ^ y) & m) ^ y)` pattern.
47765/// This is typically a better representation for targets without a fused
47766/// "and-not" operation. This function is intended to be called from a
47767/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
47768static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
47769 // Note that masked-merge variants using XOR or ADD expressions are
47770 // normalized to OR by InstCombine so we only check for OR.
47771 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
47772 SDValue N0 = Node->getOperand(0);
47773 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
47774 return SDValue();
47775 SDValue N1 = Node->getOperand(1);
47776 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
47777 return SDValue();
47778
47779 SDLoc DL(Node);
47780 SDValue N00 = N0->getOperand(0);
47781 SDValue N01 = N0->getOperand(1);
47782 SDValue N10 = N1->getOperand(0);
47783 SDValue N11 = N1->getOperand(1);
47784 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
47785 return Result;
47786 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
47787 return Result;
47788 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
47789 return Result;
47790 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
47791 return Result;
47792 return SDValue();
47793}
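The rewrite is the standard masked-merge identity: both expressions pick bits of x where m is set and bits of y elsewhere, but the XOR form needs no 'and-not'. A scalar sketch of the equivalence (illustrative, not part of this file):

  #include <cstdint>
  uint32_t mergeAndNot(uint32_t m, uint32_t x, uint32_t y) {
    return (m & x) | (~m & y); // needs ANDN (BMI) to be cheap
  }
  uint32_t mergeXor(uint32_t m, uint32_t x, uint32_t y) {
    return ((x ^ y) & m) ^ y;  // same bits, plain AND/XOR only
  }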
47794
47795static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
47796 TargetLowering::DAGCombinerInfo &DCI,
47797 const X86Subtarget &Subtarget) {
47798 SDValue N0 = N->getOperand(0);
47799 SDValue N1 = N->getOperand(1);
47800 EVT VT = N->getValueType(0);
47801 SDLoc dl(N);
47802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47803
47804 // If this is SSE1 only convert to FOR to avoid scalarization.
47805 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47806 return DAG.getBitcast(MVT::v4i32,
47807 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
47808 DAG.getBitcast(MVT::v4f32, N0),
47809 DAG.getBitcast(MVT::v4f32, N1)));
47810 }
47811
47812 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
47813 // TODO: Support multiple SrcOps.
47814 if (VT == MVT::i1) {
47815 SmallVector<SDValue, 2> SrcOps;
47816 SmallVector<APInt, 2> SrcPartials;
47817 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
47818 SrcOps.size() == 1) {
47819 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
47820 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47821 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
47822 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
47823 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
47824 if (Mask) {
47825 assert(SrcPartials[0].getBitWidth() == NumElts &&
47826 "Unexpected partial reduction mask");
47827 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
47828 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
47829 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
47830 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
47831 }
47832 }
47833 }
47834
47835 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47836 return R;
47837
47838 if (SDValue R = combineBitOpWithShift(N, DAG))
47839 return R;
47840
47841 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
47842 return FPLogic;
47843
47844 if (DCI.isBeforeLegalizeOps())
47845 return SDValue();
47846
47847 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
47848 return R;
47849
47850 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
47851 return R;
47852
47853 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
47854 return R;
47855
47856 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
47857 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
47858 // iff the upper elements of the non-shifted arg are zero.
47859 // KUNPCK requires 16+ bool vector elements.
47860 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
47861 unsigned NumElts = VT.getVectorNumElements();
47862 unsigned HalfElts = NumElts / 2;
47863 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
47864 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
47865 N1.getConstantOperandAPInt(1) == HalfElts &&
47866 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
47867 return DAG.getNode(
47868 ISD::CONCAT_VECTORS, dl, VT,
47869 extractSubVector(N0, 0, DAG, dl, HalfElts),
47870 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
47871 }
47872 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
47873 N0.getConstantOperandAPInt(1) == HalfElts &&
47874 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
47875 return DAG.getNode(
47876 ISD::CONCAT_VECTORS, dl, VT,
47877 extractSubVector(N1, 0, DAG, dl, HalfElts),
47878 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
47879 }
47880 }
47881
47882 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47883 // Attempt to recursively combine an OR of shuffles.
47884 SDValue Op(N, 0);
47885 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47886 return Res;
47887
47888 // If either operand is a constant mask, then only the elements that aren't
47889 // allones are actually demanded by the other operand.
47890 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
47891 APInt UndefElts;
47892 SmallVector<APInt> EltBits;
47893 int NumElts = VT.getVectorNumElements();
47894 int EltSizeInBits = VT.getScalarSizeInBits();
47895 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
47896 return false;
47897
47898 APInt DemandedElts = APInt::getZero(NumElts);
47899 for (int I = 0; I != NumElts; ++I)
47900 if (!EltBits[I].isAllOnes())
47901 DemandedElts.setBit(I);
47902
47903 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
47904 };
47905 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
47906 if (N->getOpcode() != ISD::DELETED_NODE)
47907 DCI.AddToWorklist(N);
47908 return SDValue(N, 0);
47909 }
47910 }
47911
47912 // We should fold "masked merge" patterns when `andn` is not available.
47913 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
47914 if (SDValue R = foldMaskedMerge(N, DAG))
47915 return R;
47916
47917 return SDValue();
47918}
47919
47920/// Try to turn tests against the signbit in the form of:
47921/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
47922/// into:
47923/// SETGT(X, -1)
47924static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
47925 // This is only worth doing if the output type is i8 or i1.
47926 EVT ResultType = N->getValueType(0);
47927 if (ResultType != MVT::i8 && ResultType != MVT::i1)
47928 return SDValue();
47929
47930 SDValue N0 = N->getOperand(0);
47931 SDValue N1 = N->getOperand(1);
47932
47933 // We should be performing an xor against a truncated shift.
47934 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
47935 return SDValue();
47936
47937 // Make sure we are performing an xor against one.
47938 if (!isOneConstant(N1))
47939 return SDValue();
47940
47941 // SetCC on x86 zero extends so only act on this if it's a logical shift.
47942 SDValue Shift = N0.getOperand(0);
47943 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
47944 return SDValue();
47945
47946 // Make sure we are truncating from one of i16, i32 or i64.
47947 EVT ShiftTy = Shift.getValueType();
47948 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
47949 return SDValue();
47950
47951 // Make sure the shift amount extracts the sign bit.
47952 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
47953 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
47954 return SDValue();
47955
47956 // Create a greater-than comparison against -1.
47957 // N.B. Using SETGE against 0 works but we want a canonical-looking
47958 // comparison; using SETGT matches up with what TranslateX86CC produces.
47959 SDLoc DL(N);
47960 SDValue ShiftOp = Shift.getOperand(0);
47961 EVT ShiftOpTy = ShiftOp.getValueType();
47962 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47963 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
47964 *DAG.getContext(), ResultType);
47965 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
47966 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
47967 if (SetCCResultType != ResultType)
47968 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
47969 return Cond;
47970}
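The scalar identity behind this fold: shifting the sign bit down and flipping it asks "is x non-negative?", which is the same as comparing x > -1. A minimal sketch (illustrative, not part of this file):

  #include <cstdint>
  bool viaShift(int32_t x) {
    return (static_cast<uint32_t>(x) >> 31) ^ 1u; // XOR(TRUNCATE(SRL(x,31)),1)
  }
  bool viaCompare(int32_t x) {
    return x > -1;                                // SETGT(x, -1), what we emit
  }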
47971
47972/// Turn vector tests of the signbit in the form of:
47973/// xor (sra X, elt_size(X)-1), -1
47974/// into:
47975/// pcmpgt X, -1
47976///
47977/// This should be called before type legalization because the pattern may not
47978/// persist after that.
47979static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
47980 const X86Subtarget &Subtarget) {
47981 EVT VT = N->getValueType(0);
47982 if (!VT.isSimple())
47983 return SDValue();
47984
47985 switch (VT.getSimpleVT().SimpleTy) {
47986 default: return SDValue();
47987 case MVT::v16i8:
47988 case MVT::v8i16:
47989 case MVT::v4i32:
47990 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
47991 case MVT::v32i8:
47992 case MVT::v16i16:
47993 case MVT::v8i32:
47994 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
47995 }
47996
47997 // There must be a shift right algebraic before the xor, and the xor must be a
47998 // 'not' operation.
47999 SDValue Shift = N->getOperand(0);
48000 SDValue Ones = N->getOperand(1);
48001 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
48002 !ISD::isBuildVectorAllOnes(Ones.getNode()))
48003 return SDValue();
48004
48005 // The shift should be smearing the sign bit across each vector element.
48006 auto *ShiftAmt =
48007 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
48008 if (!ShiftAmt ||
48009 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
48010 return SDValue();
48011
48012 // Create a greater-than comparison against -1. We don't use the more obvious
48013 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
48014 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
48015}
48016
48017/// Detect patterns of truncation with unsigned saturation:
48018///
48019/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
48020/// Return the source value x to be truncated or SDValue() if the pattern was
48021/// not matched.
48022///
48023/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
48024/// where C1 >= 0 and C2 is unsigned max of destination type.
48025///
48026/// (truncate (smax (smin (x, C2), C1)) to dest_type)
48027/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
48028///
48029/// These two patterns are equivalent to:
48030/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
48031/// So return the smax(x, C1) value to be truncated or SDValue() if the
48032/// pattern was not matched.
48033static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
48034 const SDLoc &DL) {
48035 EVT InVT = In.getValueType();
48036
48037 // Saturation with truncation. We truncate from InVT to VT.
48038 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
48039 "Unexpected types for truncate operation");
48040
48041 // Match min/max and return limit value as a parameter.
48042 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
48043 if (V.getOpcode() == Opcode &&
48044 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
48045 return V.getOperand(0);
48046 return SDValue();
48047 };
48048
48049 APInt C1, C2;
48050 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
48051 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
48052 // the element size of the destination type.
48053 if (C2.isMask(VT.getScalarSizeInBits()))
48054 return UMin;
48055
48056 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
48057 if (MatchMinMax(SMin, ISD::SMAX, C1))
48058 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
48059 return SMin;
48060
48061 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
48062 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
48063 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
48064 C2.uge(C1)) {
48065 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
48066 }
48067
48068 return SDValue();
48069}
48070
48071/// Detect patterns of truncation with signed saturation:
48072/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
48073/// signed_max_of_dest_type)) to dest_type)
48074/// or:
48075/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
48076/// signed_min_of_dest_type)) to dest_type).
48077/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
48078/// Return the source value to be truncated or SDValue() if the pattern was not
48079/// matched.
48080static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
48081 unsigned NumDstBits = VT.getScalarSizeInBits();
48082 unsigned NumSrcBits = In.getScalarValueSizeInBits();
48083 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
48084
48085 auto MatchMinMax = [](SDValue V, unsigned Opcode,
48086 const APInt &Limit) -> SDValue {
48087 APInt C;
48088 if (V.getOpcode() == Opcode &&
48089 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
48090 return V.getOperand(0);
48091 return SDValue();
48092 };
48093
48094 APInt SignedMax, SignedMin;
48095 if (MatchPackUS) {
48096 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
48097 SignedMin = APInt(NumSrcBits, 0);
48098 } else {
48099 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
48100 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
48101 }
48102
48103 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
48104 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
48105 return SMax;
48106
48107 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
48108 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
48109 return SMin;
48110
48111 return SDValue();
48112}
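For reference, the scalar analogues of the two saturation shapes these detectors match (clamp first, then narrow); the vector forms simply do this per lane. A hedged C++ sketch (illustrative, not part of this file):

  #include <algorithm>
  #include <cstdint>
  // Unsigned saturation: umin against the destination's unsigned max (0xFF).
  uint8_t truncUSat(uint32_t x) {
    return static_cast<uint8_t>(std::min<uint32_t>(x, 0xFFu));
  }
  // Signed saturation: smin/smax sandwich against the destination's range.
  int8_t truncSSat(int32_t x) {
    return static_cast<int8_t>(std::min(std::max(x, -128), 127));
  }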
48113
48114static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
48115 SelectionDAG &DAG,
48116 const X86Subtarget &Subtarget) {
48117 if (!Subtarget.hasSSE2() || !VT.isVector())
48118 return SDValue();
48119
48120 EVT SVT = VT.getVectorElementType();
48121 EVT InVT = In.getValueType();
48122 EVT InSVT = InVT.getVectorElementType();
48123
48124 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
48125 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
48126 // and concatenate at the same time. Then we can use a final vpmovuswb to
48127 // clip to 0-255.
48128 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
48129 InVT == MVT::v16i32 && VT == MVT::v16i8) {
48130 if (auto USatVal = detectSSatPattern(In, VT, true)) {
48131 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
48132 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
48133 DL, DAG, Subtarget);
48134 assert(Mid && "Failed to pack!");
48135 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
48136 }
48137 }
48138
48139 // vXi32 truncate instructions are available with AVX512F.
48140 // vXi16 truncate instructions are only available with AVX512BW.
48141 // For 256-bit or smaller vectors, we require VLX.
48142 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
48143 // If the result type is 256 bits or larger and we have disabled 512-bit
48144 // registers, we should go ahead and use the pack instructions if possible.
48145 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
48146 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
48147 (InVT.getSizeInBits() > 128) &&
48148 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
48149 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
48150
48151 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
48152 VT.getSizeInBits() >= 64 &&
48153 (SVT == MVT::i8 || SVT == MVT::i16) &&
48154 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
48155 if (auto USatVal = detectSSatPattern(In, VT, true)) {
48156 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
48157 // Only do this when the result is at least 64 bits or we'll be leaving
48158 // dangling PACKSSDW nodes.
48159 if (SVT == MVT::i8 && InSVT == MVT::i32) {
48160 EVT MidVT = VT.changeVectorElementType(MVT::i16);
48161 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
48162 DAG, Subtarget);
48163 assert(Mid && "Failed to pack!");
48164 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
48165 Subtarget);
48166 assert(V && "Failed to pack!");
48167 return V;
48168 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
48169 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
48170 Subtarget);
48171 }
48172 if (auto SSatVal = detectSSatPattern(In, VT))
48173 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
48174 Subtarget);
48175 }
48176
48177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48178 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
48179 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
48180 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
48181 unsigned TruncOpc = 0;
48182 SDValue SatVal;
48183 if (auto SSatVal = detectSSatPattern(In, VT)) {
48184 SatVal = SSatVal;
48185 TruncOpc = X86ISD::VTRUNCS;
48186 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
48187 SatVal = USatVal;
48188 TruncOpc = X86ISD::VTRUNCUS;
48189 }
48190 if (SatVal) {
48191 unsigned ResElts = VT.getVectorNumElements();
48192 // If the input type is less than 512 bits and we don't have VLX, we need
48193 // to widen to 512 bits.
48194 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
48195 unsigned NumConcats = 512 / InVT.getSizeInBits();
48196 ResElts *= NumConcats;
48197 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
48198 ConcatOps[0] = SatVal;
48199 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
48200 NumConcats * InVT.getVectorNumElements());
48201 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
48202 }
48203 // Widen the result if it's narrower than 128 bits.
48204 if (ResElts * SVT.getSizeInBits() < 128)
48205 ResElts = 128 / SVT.getSizeInBits();
48206 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
48207 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
48208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
48209 DAG.getIntPtrConstant(0, DL));
48210 }
48211 }
48212
48213 return SDValue();
48214}
48215
48216/// This function detects the AVG pattern between vectors of unsigned i8/i16,
48217/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
48218/// ISD::AVGCEILU (AVG) instruction.
48219static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
48220 const X86Subtarget &Subtarget,
48221 const SDLoc &DL) {
48222 if (!VT.isVector())
48223 return SDValue();
48224 EVT InVT = In.getValueType();
48225 unsigned NumElems = VT.getVectorNumElements();
48226
48227 EVT ScalarVT = VT.getVectorElementType();
48228 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
48229 return SDValue();
48230
48231 // InScalarVT is the intermediate type in AVG pattern and it should be greater
48232 // than the original input type (i8/i16).
48233 EVT InScalarVT = InVT.getVectorElementType();
48234 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
48235 return SDValue();
48236
48237 if (!Subtarget.hasSSE2())
48238 return SDValue();
48239
48240 // Detect the following pattern:
48241 //
48242 // %1 = zext <N x i8> %a to <N x i32>
48243 // %2 = zext <N x i8> %b to <N x i32>
48244 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
48245 // %4 = add nuw nsw <N x i32> %3, %2
48246 // %5 = lshr <N x i32> %4, <i32 1 x N>
48247 // %6 = trunc <N x i32> %5 to <N x i8>
48248 //
48249 // In AVX512, the last instruction can also be a trunc store.
48250 if (In.getOpcode() != ISD::SRL)
48251 return SDValue();
48252
48253 // A lambda checking the given SDValue is a constant vector and each element
48254 // is in the range [Min, Max].
48255 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
48256 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
48257 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
48258 });
48259 };
48260
48261 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
48262 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
48263 return MaxActiveBits <= ScalarVT.getSizeInBits();
48264 };
48265
48266 // Check if each element of the vector is right-shifted by one.
48267 SDValue LHS = In.getOperand(0);
48268 SDValue RHS = In.getOperand(1);
48269 if (!IsConstVectorInRange(RHS, 1, 1))
48270 return SDValue();
48271 if (LHS.getOpcode() != ISD::ADD)
48272 return SDValue();
48273
48274 // Detect a pattern of a + b + 1 where the order doesn't matter.
48275 SDValue Operands[3];
48276 Operands[0] = LHS.getOperand(0);
48277 Operands[1] = LHS.getOperand(1);
48278
48279 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48280 ArrayRef<SDValue> Ops) {
48281 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
48282 };
48283
48284 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
48285 for (SDValue &Op : Ops)
48286 if (Op.getValueType() != VT)
48287 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
48288 // Pad to a power-of-2 vector, split+apply and extract the original vector.
48289 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
48290 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
48291 if (NumElemsPow2 != NumElems) {
48292 for (SDValue &Op : Ops) {
48293 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
48294 for (unsigned i = 0; i != NumElems; ++i) {
48295 SDValue Idx = DAG.getIntPtrConstant(i, DL);
48296 EltsOfOp[i] =
48297 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
48298 }
48299 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
48300 }
48301 }
48302 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
48303 if (NumElemsPow2 == NumElems)
48304 return Res;
48305 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
48306 DAG.getIntPtrConstant(0, DL));
48307 };
48308
48309 // Take care of the case when one of the operands is a constant vector whose
48310 // elements are all in the range [1, 256] (or [1, 65536] for i16).
48311 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
48312 IsZExtLike(Operands[0])) {
48313 // The pattern is detected. Subtract one from the constant vector, then
48314 // demote it and emit the ISD::AVGCEILU instruction.
48315 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
48316 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
48317 return AVGSplitter({Operands[0], Operands[1]});
48318 }
48319
48320 // Matches 'add like' patterns: add(Op0,Op1) and zext(or(Op0,Op1)).
48321 // Match the or case only if it's 'add-like', i.e. can be replaced by an add.
48322 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
48323 if (ISD::ADD == V.getOpcode()) {
48324 Op0 = V.getOperand(0);
48325 Op1 = V.getOperand(1);
48326 return true;
48327 }
48328 if (ISD::ZERO_EXTEND != V.getOpcode())
48329 return false;
48330 V = V.getOperand(0);
48331 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
48332 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
48333 return false;
48334 Op0 = V.getOperand(0);
48335 Op1 = V.getOperand(1);
48336 return true;
48337 };
48338
48339 SDValue Op0, Op1;
48340 if (FindAddLike(Operands[0], Op0, Op1))
48341 std::swap(Operands[0], Operands[1]);
48342 else if (!FindAddLike(Operands[1], Op0, Op1))
48343 return SDValue();
48344 Operands[2] = Op0;
48345 Operands[1] = Op1;
48346
48347 // Now we have three operands of two additions. Check that one of them is a
48348 // constant vector with ones, and the other two can be promoted from i8/i16.
48349 for (int i = 0; i < 3; ++i) {
48350 if (!IsConstVectorInRange(Operands[i], 1, 1))
48351 continue;
48352 std::swap(Operands[i], Operands[2]);
48353
48354 // Check if Operands[0] and Operands[1] are results of type promotion.
48355 for (int j = 0; j < 2; ++j)
48356 if (Operands[j].getValueType() != VT)
48357 if (!IsZExtLike(Operands[j]))
48358 return SDValue();
48359
48360 // The pattern is detected, emit ISD::AVGCEILU instruction(s).
48361 return AVGSplitter({Operands[0], Operands[1]});
48362 }
48363
48364 return SDValue();
48365}
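
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] A minimal scalar reference for the semantics detectAVGPattern matches, using a hypothetical helper name avg_ceil_u8: the widened add, add-one, shift and truncate below is what a single ISD::AVGCEILU (e.g. PAVGB) computes per element.

#include <cstdint>

// Hypothetical reference helper: rounding-up average of two unsigned bytes,
// computed in a wider type so the intermediate sum cannot overflow.
static inline uint8_t avg_ceil_u8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((static_cast<uint16_t>(a) +
                               static_cast<uint16_t>(b) + 1u) >> 1);
}
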
48366
48367static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
48368 TargetLowering::DAGCombinerInfo &DCI,
48369 const X86Subtarget &Subtarget) {
48370 LoadSDNode *Ld = cast<LoadSDNode>(N);
48371 EVT RegVT = Ld->getValueType(0);
48372 EVT MemVT = Ld->getMemoryVT();
48373 SDLoc dl(Ld);
48374 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48375
48376 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
48377 // into two 16-byte operations. Also split non-temporal aligned loads on
48378 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
48379 ISD::LoadExtType Ext = Ld->getExtensionType();
48380 bool Fast;
48381 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
48382 Ext == ISD::NON_EXTLOAD &&
48383 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
48384 Ld->getAlignment() >= 16) ||
48385 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
48386 *Ld->getMemOperand(), &Fast) &&
48387 !Fast))) {
48388 unsigned NumElems = RegVT.getVectorNumElements();
48389 if (NumElems < 2)
48390 return SDValue();
48391
48392 unsigned HalfOffset = 16;
48393 SDValue Ptr1 = Ld->getBasePtr();
48394 SDValue Ptr2 =
48395 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
48396 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
48397 NumElems / 2);
48398 SDValue Load1 =
48399 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
48400 Ld->getOriginalAlign(),
48401 Ld->getMemOperand()->getFlags());
48402 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
48403 Ld->getPointerInfo().getWithOffset(HalfOffset),
48404 Ld->getOriginalAlign(),
48405 Ld->getMemOperand()->getFlags());
48406 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
48407 Load1.getValue(1), Load2.getValue(1));
48408
48409 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
48410 return DCI.CombineTo(N, NewVec, TF, true);
48411 }
48412
48413 // Bool vector load - attempt to cast to an integer, as we have good
48414 // (vXiY *ext(vXi1 bitcast(iX))) handling.
48415 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
48416 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
48417 unsigned NumElts = RegVT.getVectorNumElements();
48418 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
48419 if (TLI.isTypeLegal(IntVT)) {
48420 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
48421 Ld->getPointerInfo(),
48422 Ld->getOriginalAlign(),
48423 Ld->getMemOperand()->getFlags());
48424 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
48425 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
48426 }
48427 }
48428
48429 // If we also broadcast this as a subvector to a wider type, then just extract
48430 // the lowest subvector.
48431 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
48432 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
48433 SDValue Ptr = Ld->getBasePtr();
48434 SDValue Chain = Ld->getChain();
48435 for (SDNode *User : Ptr->uses()) {
48436 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
48437 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
48438 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
48439 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
48440 MemVT.getSizeInBits() &&
48441 !User->hasAnyUseOfValue(1) &&
48442 User->getValueSizeInBits(0).getFixedSize() >
48443 RegVT.getFixedSizeInBits()) {
48444 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
48445 RegVT.getSizeInBits());
48446 Extract = DAG.getBitcast(RegVT, Extract);
48447 return DCI.CombineTo(N, Extract, SDValue(User, 1));
48448 }
48449 }
48450 }
48451
48452 // Cast ptr32 and ptr64 pointers to the default address space before a load.
48453 unsigned AddrSpace = Ld->getAddressSpace();
48454 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
48455 AddrSpace == X86AS::PTR32_UPTR) {
48456 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
48457 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
48458 SDValue Cast =
48459 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
48460 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
48461 Ld->getOriginalAlign(),
48462 Ld->getMemOperand()->getFlags());
48463 }
48464 }
48465
48466 return SDValue();
48467}
48468
48469/// If V is a build vector of boolean constants and exactly one of those
48470/// constants is true, return the operand index of that true element.
48471/// Otherwise, return -1.
48472static int getOneTrueElt(SDValue V) {
48473 // This needs to be a build vector of booleans.
48474 // TODO: Checking for the i1 type matches the IR definition for the mask,
48475 // but the mask check could be loosened to i8 or other types. That might
48476 // also require checking more than 'allOnesValue'; e.g., the x86 HW
48477 // instructions only require that the MSB is set for each mask element.
48478 // The ISD::MSTORE comments/definition do not specify how the mask operand
48479 // is formatted.
48480 auto *BV = dyn_cast<BuildVectorSDNode>(V);
48481 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
48482 return -1;
48483
48484 int TrueIndex = -1;
48485 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
48486 for (unsigned i = 0; i < NumElts; ++i) {
48487 const SDValue &Op = BV->getOperand(i);
48488 if (Op.isUndef())
48489 continue;
48490 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
48491 if (!ConstNode)
48492 return -1;
48493 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
48494 // If we already found a one, this is too many.
48495 if (TrueIndex >= 0)
48496 return -1;
48497 TrueIndex = i;
48498 }
48499 }
48500 return TrueIndex;
48501}
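
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] The same "exactly one true element" scan written over a plain bool array, with a hypothetical name getOneTrueIndex; it returns the index of the single true entry, or -1 for zero or multiple true entries.

// Hypothetical sketch mirroring getOneTrueElt's logic on a plain array.
static int getOneTrueIndex(const bool *Mask, unsigned NumElts) {
  int TrueIndex = -1;
  for (unsigned i = 0; i < NumElts; ++i) {
    if (!Mask[i])
      continue;
    if (TrueIndex >= 0)
      return -1;                      // more than one true element
    TrueIndex = static_cast<int>(i);
  }
  return TrueIndex;                   // -1 if no true element was found
}
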
48502
48503/// Given a masked memory load/store operation, return true if it has one mask
48504/// bit set. If it has one mask bit set, then also return the memory address of
48505/// the scalar element to load/store, the vector index to insert/extract that
48506/// scalar element, and the alignment for the scalar memory access.
48507static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
48508 SelectionDAG &DAG, SDValue &Addr,
48509 SDValue &Index, Align &Alignment,
48510 unsigned &Offset) {
48511 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
48512 if (TrueMaskElt < 0)
48513 return false;
48514
48515 // Get the address of the one scalar element that is specified by the mask
48516 // using the appropriate offset from the base pointer.
48517 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
48518 Offset = 0;
48519 Addr = MaskedOp->getBasePtr();
48520 if (TrueMaskElt != 0) {
48521 Offset = TrueMaskElt * EltVT.getStoreSize();
48522 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
48523 SDLoc(MaskedOp));
48524 }
48525
48526 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
48527 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
48528 EltVT.getStoreSize());
48529 return true;
48530}
48531
48532/// If exactly one element of the mask is set for a non-extending masked load,
48533/// reduce it to a scalar load and a vector insert.
48534/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48535/// mask have already been optimized in IR, so we don't bother with those here.
48536static SDValue
48537reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48538 TargetLowering::DAGCombinerInfo &DCI,
48539 const X86Subtarget &Subtarget) {
48540 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48541 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48542 // However, some target hooks may need to be added to know when the transform
48543 // is profitable. Endianness would also have to be considered.
48544
48545 SDValue Addr, VecIndex;
48546 Align Alignment;
48547 unsigned Offset;
48548 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
48549 return SDValue();
48550
48551 // Load the one scalar element that is specified by the mask using the
48552 // appropriate offset from the base pointer.
48553 SDLoc DL(ML);
48554 EVT VT = ML->getValueType(0);
48555 EVT EltVT = VT.getVectorElementType();
48556
48557 EVT CastVT = VT;
48558 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48559 EltVT = MVT::f64;
48560 CastVT = VT.changeVectorElementType(EltVT);
48561 }
48562
48563 SDValue Load =
48564 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
48565 ML->getPointerInfo().getWithOffset(Offset),
48566 Alignment, ML->getMemOperand()->getFlags());
48567
48568 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
48569
48570 // Insert the loaded element into the appropriate place in the vector.
48571 SDValue Insert =
48572 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
48573 Insert = DAG.getBitcast(VT, Insert);
48574 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
48575}
48576
48577static SDValue
48578combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48579 TargetLowering::DAGCombinerInfo &DCI) {
48580 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48581 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
48582 return SDValue();
48583
48584 SDLoc DL(ML);
48585 EVT VT = ML->getValueType(0);
48586
48587 // If we are loading the first and last elements of a vector, it is safe and
48588 // always faster to load the whole vector. Replace the masked load with a
48589 // vector load and select.
48590 unsigned NumElts = VT.getVectorNumElements();
48591 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
48592 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
48593 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
48594 if (LoadFirstElt && LoadLastElt) {
48595 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
48596 ML->getMemOperand());
48597 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
48598 ML->getPassThru());
48599 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
48600 }
48601
48602 // Convert a masked load with a constant mask into a masked load and a select.
48603 // This allows the select operation to use a faster kind of select instruction
48604 // (for example, vblendvps -> vblendps).
48605
48606 // Don't try this if the pass-through operand is already undefined. That would
48607 // cause an infinite loop because that's what we're about to create.
48608 if (ML->getPassThru().isUndef())
48609 return SDValue();
48610
48611 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
48612 return SDValue();
48613
48614 // The new masked load has an undef pass-through operand. The select uses the
48615 // original pass-through operand.
48616 SDValue NewML = DAG.getMaskedLoad(
48617 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
48618 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
48619 ML->getAddressingMode(), ML->getExtensionType());
48620 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
48621 ML->getPassThru());
48622
48623 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
48624}
48625
48626static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
48627 TargetLowering::DAGCombinerInfo &DCI,
48628 const X86Subtarget &Subtarget) {
48629 auto *Mld = cast<MaskedLoadSDNode>(N);
48630
48631 // TODO: Expanding load with constant mask may be optimized as well.
48632 if (Mld->isExpandingLoad())
48633 return SDValue();
48634
48635 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
48636 if (SDValue ScalarLoad =
48637 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
48638 return ScalarLoad;
48639
48640 // TODO: Do some AVX512 subsets benefit from this transform?
48641 if (!Subtarget.hasAVX512())
48642 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
48643 return Blend;
48644 }
48645
48646 // If the mask value has been legalized to a non-boolean vector, try to
48647 // simplify ops leading up to it. We only demand the MSB of each lane.
48648 SDValue Mask = Mld->getMask();
48649 if (Mask.getScalarValueSizeInBits() != 1) {
48650 EVT VT = Mld->getValueType(0);
48651 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48652 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48653 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48654 if (N->getOpcode() != ISD::DELETED_NODE)
48655 DCI.AddToWorklist(N);
48656 return SDValue(N, 0);
48657 }
48658 if (SDValue NewMask =
48659 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48660 return DAG.getMaskedLoad(
48661 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
48662 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
48663 Mld->getAddressingMode(), Mld->getExtensionType());
48664 }
48665
48666 return SDValue();
48667}
48668
48669/// If exactly one element of the mask is set for a non-truncating masked store,
48670/// reduce it to a vector extract and a scalar store.
48671/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48672/// mask have already been optimized in IR, so we don't bother with those here.
48673static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
48674 SelectionDAG &DAG,
48675 const X86Subtarget &Subtarget) {
48676 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48677 // However, some target hooks may need to be added to know when the transform
48678 // is profitable. Endianness would also have to be considered.
48679
48680 SDValue Addr, VecIndex;
48681 Align Alignment;
48682 unsigned Offset;
48683 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
48684 return SDValue();
48685
48686 // Extract the one scalar element that is actually being stored.
48687 SDLoc DL(MS);
48688 SDValue Value = MS->getValue();
48689 EVT VT = Value.getValueType();
48690 EVT EltVT = VT.getVectorElementType();
48691 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48692 EltVT = MVT::f64;
48693 EVT CastVT = VT.changeVectorElementType(EltVT);
48694 Value = DAG.getBitcast(CastVT, Value);
48695 }
48696 SDValue Extract =
48697 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
48698
48699 // Store that element at the appropriate offset from the base pointer.
48700 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
48701 MS->getPointerInfo().getWithOffset(Offset),
48702 Alignment, MS->getMemOperand()->getFlags());
48703}
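
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] For a one-hot mask such as <0,1,0,0> over <4 x i32>, the combine above is equivalent to the scalar store sketched below (hypothetical helper name), writing only element 1 at offset 1 * sizeof(int32_t) from the base pointer.

#include <cstdint>

// Hypothetical scalar equivalent of a masked store whose mask is <0,1,0,0>.
static void maskedStoreOneElt_ref(int32_t *Base, const int32_t Val[4]) {
  Base[1] = Val[1];  // extract element 1 and store it at Base + 4 bytes
}
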
48704
48705static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
48706 TargetLowering::DAGCombinerInfo &DCI,
48707 const X86Subtarget &Subtarget) {
48708 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
48709 if (Mst->isCompressingStore())
48710 return SDValue();
48711
48712 EVT VT = Mst->getValue().getValueType();
48713 SDLoc dl(Mst);
48714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48715
48716 if (Mst->isTruncatingStore())
48717 return SDValue();
48718
48719 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
48720 return ScalarStore;
48721
48722 // If the mask value has been legalized to a non-boolean vector, try to
48723 // simplify ops leading up to it. We only demand the MSB of each lane.
48724 SDValue Mask = Mst->getMask();
48725 if (Mask.getScalarValueSizeInBits() != 1) {
48726 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48727 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48728 if (N->getOpcode() != ISD::DELETED_NODE)
48729 DCI.AddToWorklist(N);
48730 return SDValue(N, 0);
48731 }
48732 if (SDValue NewMask =
48733 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48734 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
48735 Mst->getBasePtr(), Mst->getOffset(), NewMask,
48736 Mst->getMemoryVT(), Mst->getMemOperand(),
48737 Mst->getAddressingMode());
48738 }
48739
48740 SDValue Value = Mst->getValue();
48741 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
48742 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
48743 Mst->getMemoryVT())) {
48744 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
48745 Mst->getBasePtr(), Mst->getOffset(), Mask,
48746 Mst->getMemoryVT(), Mst->getMemOperand(),
48747 Mst->getAddressingMode(), true);
48748 }
48749
48750 return SDValue();
48751}
48752
48753static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
48754 TargetLowering::DAGCombinerInfo &DCI,
48755 const X86Subtarget &Subtarget) {
48756 StoreSDNode *St = cast<StoreSDNode>(N);
48757 EVT StVT = St->getMemoryVT();
48758 SDLoc dl(St);
48759 SDValue StoredVal = St->getValue();
48760 EVT VT = StoredVal.getValueType();
48761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48762
48763 // Convert a store of vXi1 into a store of iX and a bitcast.
48764 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
48765 VT.getVectorElementType() == MVT::i1) {
48766
48767 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48768 StoredVal = DAG.getBitcast(NewVT, StoredVal);
48769
48770 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48771 St->getPointerInfo(), St->getOriginalAlign(),
48772 St->getMemOperand()->getFlags());
48773 }
48774
48775 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
48776 // This will avoid a copy to k-register.
48777 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
48778 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
48779 StoredVal.getOperand(0).getValueType() == MVT::i8) {
48780 SDValue Val = StoredVal.getOperand(0);
48781 // We must store zeros to the unused bits.
48782 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
48783 return DAG.getStore(St->getChain(), dl, Val,
48784 St->getBasePtr(), St->getPointerInfo(),
48785 St->getOriginalAlign(),
48786 St->getMemOperand()->getFlags());
48787 }
48788
48789 // Widen v2i1/v4i1 stores to v8i1.
48790 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
48791 Subtarget.hasAVX512()) {
48792 unsigned NumConcats = 8 / VT.getVectorNumElements();
48793 // We must store zeros to the unused bits.
48794 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
48795 Ops[0] = StoredVal;
48796 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
48797 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48798 St->getPointerInfo(), St->getOriginalAlign(),
48799 St->getMemOperand()->getFlags());
48800 }
48801
48802 // Turn vXi1 stores of constants into a scalar store.
48803 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
48804 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
48805 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
48806 // If it's a v64i1 store without 64-bit support, we need two stores.
48807 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
48808 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
48809 StoredVal->ops().slice(0, 32));
48810 Lo = combinevXi1ConstantToInteger(Lo, DAG);
48811 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
48812 StoredVal->ops().slice(32, 32));
48813 Hi = combinevXi1ConstantToInteger(Hi, DAG);
48814
48815 SDValue Ptr0 = St->getBasePtr();
48816 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
48817
48818 SDValue Ch0 =
48819 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
48820 St->getOriginalAlign(),
48821 St->getMemOperand()->getFlags());
48822 SDValue Ch1 =
48823 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
48824 St->getPointerInfo().getWithOffset(4),
48825 St->getOriginalAlign(),
48826 St->getMemOperand()->getFlags());
48827 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
48828 }
48829
48830 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
48831 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48832 St->getPointerInfo(), St->getOriginalAlign(),
48833 St->getMemOperand()->getFlags());
48834 }
48835
48836 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
48837 // Sandy Bridge, perform two 16-byte stores.
48838 bool Fast;
48839 if (VT.is256BitVector() && StVT == VT &&
48840 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
48841 *St->getMemOperand(), &Fast) &&
48842 !Fast) {
48843 unsigned NumElems = VT.getVectorNumElements();
48844 if (NumElems < 2)
48845 return SDValue();
48846
48847 return splitVectorStore(St, DAG);
48848 }
48849
48850 // Split under-aligned vector non-temporal stores.
48851 if (St->isNonTemporal() && StVT == VT &&
48852 St->getAlignment() < VT.getStoreSize()) {
48853 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
48854 // vectors or the legalizer can scalarize it to use MOVNTI.
48855 if (VT.is256BitVector() || VT.is512BitVector()) {
48856 unsigned NumElems = VT.getVectorNumElements();
48857 if (NumElems < 2)
48858 return SDValue();
48859 return splitVectorStore(St, DAG);
48860 }
48861
48862 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
48863 // to use MOVNTI.
48864 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
48865 MVT NTVT = Subtarget.hasSSE4A()
48866 ? MVT::v2f64
48867 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
48868 return scalarizeVectorStore(St, NTVT, DAG);
48869 }
48870 }
48871
48872 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
48873 // supported but AVX512F is, by extending to v16i32 and truncating.
48874 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
48875 St->getValue().getOpcode() == ISD::TRUNCATE &&
48876 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
48877 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
48878 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
48879 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
48880 St->getValue().getOperand(0));
48881 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
48882 MVT::v16i8, St->getMemOperand());
48883 }
48884
48885 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
48886 if (!St->isTruncatingStore() &&
48887 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
48888 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
48889 StoredVal.hasOneUse() &&
48890 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
48891 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
48892 return EmitTruncSStore(IsSigned, St->getChain(),
48893 dl, StoredVal.getOperand(0), St->getBasePtr(),
48894 VT, St->getMemOperand(), DAG);
48895 }
48896
48897 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
48898 if (!St->isTruncatingStore()) {
48899 auto IsExtractedElement = [](SDValue V) {
48900 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
48901 V = V.getOperand(0);
48902 unsigned Opc = V.getOpcode();
48903 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
48904 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
48905 V.getOperand(0).hasOneUse())
48906 return V.getOperand(0);
48907 return SDValue();
48908 };
48909 if (SDValue Extract = IsExtractedElement(StoredVal)) {
48910 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
48911 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
48912 SDValue Src = Trunc.getOperand(0);
48913 MVT DstVT = Trunc.getSimpleValueType();
48914 MVT SrcVT = Src.getSimpleValueType();
48915 unsigned NumSrcElts = SrcVT.getVectorNumElements();
48916 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
48917 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
48918 if (NumTruncBits == VT.getSizeInBits() &&
48919 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
48920 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
48921 TruncVT, St->getMemOperand());
48922 }
48923 }
48924 }
48925 }
48926
48927 // Optimize trunc store (of multiple scalars) to shuffle and store.
48928 // First, pack all of the elements in one place. Next, store to memory
48929 // in fewer chunks.
48930 if (St->isTruncatingStore() && VT.isVector()) {
48931 // Check if we can detect an AVG pattern from the truncation. If yes,
48932 // replace the trunc store by a normal store with the result of the
48933 // ISD::AVGCEILU instruction.
48934 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
48935 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
48936 Subtarget, dl))
48937 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
48938 St->getPointerInfo(), St->getOriginalAlign(),
48939 St->getMemOperand()->getFlags());
48940
48941 if (TLI.isTruncStoreLegal(VT, StVT)) {
48942 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
48943 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
48944 dl, Val, St->getBasePtr(),
48945 St->getMemoryVT(), St->getMemOperand(), DAG);
48946 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
48947 DAG, dl))
48948 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
48949 dl, Val, St->getBasePtr(),
48950 St->getMemoryVT(), St->getMemOperand(), DAG);
48951 }
48952
48953 return SDValue();
48954 }
48955
48956 // Cast ptr32 and ptr64 pointers to the default address space before a store.
48957 unsigned AddrSpace = St->getAddressSpace();
48958 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
48959 AddrSpace == X86AS::PTR32_UPTR) {
48960 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
48961 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
48962 SDValue Cast =
48963 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
48964 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
48965 St->getPointerInfo(), St->getOriginalAlign(),
48966 St->getMemOperand()->getFlags(), St->getAAInfo());
48967 }
48968 }
48969
48970 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
48971 // the FP state in cases where an emms may be missing.
48972 // A preferable solution to the general problem is to figure out the right
48973 // places to insert EMMS. This qualifies as a quick hack.
48974
48975 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
48976 if (VT.getSizeInBits() != 64)
48977 return SDValue();
48978
48979 const Function &F = DAG.getMachineFunction().getFunction();
48980 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
48981 bool F64IsLegal =
48982 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
48983 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
48984 isa<LoadSDNode>(St->getValue()) &&
48985 cast<LoadSDNode>(St->getValue())->isSimple() &&
48986 St->getChain().hasOneUse() && St->isSimple()) {
48987 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
48988
48989 if (!ISD::isNormalLoad(Ld))
48990 return SDValue();
48991
48992 // Avoid the transformation if there are multiple uses of the loaded value.
48993 if (!Ld->hasNUsesOfValue(1, 0))
48994 return SDValue();
48995
48996 SDLoc LdDL(Ld);
48997 SDLoc StDL(N);
48998 // Lower to a single movq load/store pair.
48999 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
49000 Ld->getBasePtr(), Ld->getMemOperand());
49001
49002 // Make sure new load is placed in same chain order.
49003 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
49004 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
49005 St->getMemOperand());
49006 }
49007
49008 // This is similar to the above case, but here we handle a scalar 64-bit
49009 // integer store that is extracted from a vector on a 32-bit target.
49010 // If we have SSE2, then we can treat it like a floating-point double
49011 // to get past legalization. The execution dependencies fixup pass will
49012 // choose the optimal machine instruction for the store if this really is
49013 // an integer or v2f32 rather than an f64.
49014 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
49015 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
49016 SDValue OldExtract = St->getOperand(1);
49017 SDValue ExtOp0 = OldExtract.getOperand(0);
49018 unsigned VecSize = ExtOp0.getValueSizeInBits();
49019 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
49020 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
49021 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
49022 BitCast, OldExtract.getOperand(1));
49023 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
49024 St->getPointerInfo(), St->getOriginalAlign(),
49025 St->getMemOperand()->getFlags());
49026 }
49027
49028 return SDValue();
49029}
49030
49031static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
49032 TargetLowering::DAGCombinerInfo &DCI,
49033 const X86Subtarget &Subtarget) {
49034 auto *St = cast<MemIntrinsicSDNode>(N);
49035
49036 SDValue StoredVal = N->getOperand(1);
49037 MVT VT = StoredVal.getSimpleValueType();
49038 EVT MemVT = St->getMemoryVT();
49039
49040 // Figure out which elements we demand.
49041 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
49042 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
49043
49044 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49045 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
49046 if (N->getOpcode() != ISD::DELETED_NODE)
49047 DCI.AddToWorklist(N);
49048 return SDValue(N, 0);
49049 }
49050
49051 return SDValue();
49052}
49053
49054/// Return 'true' if this vector operation is "horizontal"
49055/// and return the operands for the horizontal operation in LHS and RHS. A
49056/// horizontal operation performs the binary operation on successive elements
49057/// of its first operand, then on successive elements of its second operand,
49058/// returning the resulting values in a vector. For example, if
49059/// A = < float a0, float a1, float a2, float a3 >
49060/// and
49061/// B = < float b0, float b1, float b2, float b3 >
49062/// then the result of doing a horizontal operation on A and B is
49063/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
49064/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
49065/// A horizontal-op B, for some already available A and B, and if so then LHS is
49066/// set to A, RHS to B, and the routine returns 'true'.
49067static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
49068 SelectionDAG &DAG, const X86Subtarget &Subtarget,
49069 bool IsCommutative,
49070 SmallVectorImpl<int> &PostShuffleMask) {
49071 // If either operand is undef, bail out. The binop should be simplified.
49072 if (LHS.isUndef() || RHS.isUndef())
49073 return false;
49074
49075 // Look for the following pattern:
49076 // A = < float a0, float a1, float a2, float a3 >
49077 // B = < float b0, float b1, float b2, float b3 >
49078 // and
49079 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
49080 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
49081 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
49082 // which is A horizontal-op B.
49083
49084 MVT VT = LHS.getSimpleValueType();
49085 assert((VT.is128BitVector() || VT.is256BitVector()) &&
49086 "Unsupported vector type for horizontal add/sub");
49087 unsigned NumElts = VT.getVectorNumElements();
49088
49089 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
49090 SmallVectorImpl<int> &ShuffleMask) {
49091 bool UseSubVector = false;
49092 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49093 Op.getOperand(0).getValueType().is256BitVector() &&
49094 llvm::isNullConstant(Op.getOperand(1))) {
49095 Op = Op.getOperand(0);
49096 UseSubVector = true;
49097 }
49098 SmallVector<SDValue, 2> SrcOps;
49099 SmallVector<int, 16> SrcMask, ScaledMask;
49100 SDValue BC = peekThroughBitcasts(Op);
49101 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
49102 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
49103 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
49104 })) {
49105 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
49106 if (!UseSubVector && SrcOps.size() <= 2 &&
49107 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
49108 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
49109 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
49110 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
49111 }
49112 if (UseSubVector && SrcOps.size() == 1 &&
49113 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
49114 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
49115 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
49116 ShuffleMask.assign(Mask.begin(), Mask.end());
49117 }
49118 }
49119 };
49120
49121 // View LHS in the form
49122 // LHS = VECTOR_SHUFFLE A, B, LMask
49123 // If LHS is not a shuffle, then pretend it is the identity shuffle:
49124 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
49125 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
49126 SDValue A, B;
49127 SmallVector<int, 16> LMask;
49128 GetShuffle(LHS, A, B, LMask);
49129
49130 // Likewise, view RHS in the form
49131 // RHS = VECTOR_SHUFFLE C, D, RMask
49132 SDValue C, D;
49133 SmallVector<int, 16> RMask;
49134 GetShuffle(RHS, C, D, RMask);
49135
49136 // At least one of the operands should be a vector shuffle.
49137 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
49138 if (NumShuffles == 0)
49139 return false;
49140
49141 if (LMask.empty()) {
49142 A = LHS;
49143 for (unsigned i = 0; i != NumElts; ++i)
49144 LMask.push_back(i);
49145 }
49146
49147 if (RMask.empty()) {
49148 C = RHS;
49149 for (unsigned i = 0; i != NumElts; ++i)
49150 RMask.push_back(i);
49151 }
49152
49153 // If we have a unary mask, ensure the other op is set to null.
49154 if (isUndefOrInRange(LMask, 0, NumElts))
49155 B = SDValue();
49156 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
49157 A = SDValue();
49158
49159 if (isUndefOrInRange(RMask, 0, NumElts))
49160 D = SDValue();
49161 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
49162 C = SDValue();
49163
49164 // If A and B occur in reverse order in RHS, then canonicalize by commuting
49165 // RHS operands and shuffle mask.
49166 if (A != C) {
49167 std::swap(C, D);
49168 ShuffleVectorSDNode::commuteMask(RMask);
49169 }
49170 // Check that the shuffles are both shuffling the same vectors.
49171 if (!(A == C && B == D))
49172 return false;
49173
49174 PostShuffleMask.clear();
49175 PostShuffleMask.append(NumElts, SM_SentinelUndef);
49176
49177 // LHS and RHS are now:
49178 // LHS = shuffle A, B, LMask
49179 // RHS = shuffle A, B, RMask
49180 // Check that the masks correspond to performing a horizontal operation.
49181 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
49182 // so we just repeat the inner loop if this is a 256-bit op.
49183 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
49184 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
49185 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
49186 assert((NumEltsPer128BitChunk % 2 == 0) &&
49187 "Vector type should have an even number of elements in each lane");
49188 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
49189 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
49190 // Ignore undefined components.
49191 int LIdx = LMask[i + j], RIdx = RMask[i + j];
49192 if (LIdx < 0 || RIdx < 0 ||
49193 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
49194 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
49195 continue;
49196
49197 // Check that successive odd/even elements are being operated on. If not,
49198 // this is not a horizontal operation.
49199 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
49200 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
49201 return false;
49202
49203 // Compute the post-shuffle mask index based on where the element
49204 // is stored in the HOP result, and where it needs to be moved to.
49205 int Base = LIdx & ~1u;
49206 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
49207 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
49208
49209 // The low half of the 128-bit result must choose from A.
49210 // The high half of the 128-bit result must choose from B,
49211 // unless B is undef. In that case, we are always choosing from A.
49212 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
49213 Index += NumEltsPer64BitChunk;
49214 PostShuffleMask[i + j] = Index;
49215 }
49216 }
49217
49218 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
49219 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
49220
49221 bool IsIdentityPostShuffle =
49222 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
49223 if (IsIdentityPostShuffle)
49224 PostShuffleMask.clear();
49225
49226 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
49227 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
49228 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
49229 return false;
49230
49231 // If the source nodes are already used in HorizOps then always accept this.
49232 // Shuffle folding should merge these back together.
49233 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
49234 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
49235 });
49236 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
49237 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
49238 });
49239 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
49240
49241 // Assume a SingleSource HOP if we only shuffle one input and don't need to
49242 // shuffle the result.
49243 if (!ForceHorizOp &&
49244 !shouldUseHorizontalOp(NewLHS == NewRHS &&
49245 (NumShuffles < 2 || !IsIdentityPostShuffle),
49246 DAG, Subtarget))
49247 return false;
49248
49249 LHS = DAG.getBitcast(VT, NewLHS);
49250 RHS = DAG.getBitcast(VT, NewRHS);
49251 return true;
49252}
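
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] Scalar reference for the horizontal add the routine above is trying to form, matching the <a0 op a1, a2 op a3, b0 op b1, b2 op b3> layout described in its doc comment (hypothetical helper name).

// Hypothetical scalar model of a 128-bit HADDPS-style horizontal add.
static void hadd4_ref(const float A[4], const float B[4], float R[4]) {
  R[0] = A[0] + A[1];
  R[1] = A[2] + A[3];
  R[2] = B[0] + B[1];
  R[3] = B[2] + B[3];
}
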
49253
49254// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
49255static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
49256 const X86Subtarget &Subtarget) {
49257 EVT VT = N->getValueType(0);
49258 unsigned Opcode = N->getOpcode();
49259 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
49260 SmallVector<int, 8> PostShuffleMask;
49261
49262 switch (Opcode) {
49263 case ISD::FADD:
49264 case ISD::FSUB:
49265 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
49266 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
49267 SDValue LHS = N->getOperand(0);
49268 SDValue RHS = N->getOperand(1);
49269 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
49270 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
49271 PostShuffleMask)) {
49272 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
49273 if (!PostShuffleMask.empty())
49274 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
49275 DAG.getUNDEF(VT), PostShuffleMask);
49276 return HorizBinOp;
49277 }
49278 }
49279 break;
49280 case ISD::ADD:
49281 case ISD::SUB:
49282 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
49283 VT == MVT::v16i16 || VT == MVT::v8i32)) {
49284 SDValue LHS = N->getOperand(0);
49285 SDValue RHS = N->getOperand(1);
49286 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
49287 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
49288 PostShuffleMask)) {
49289 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
49290 ArrayRef<SDValue> Ops) {
49291 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
49292 };
49293 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
49294 {LHS, RHS}, HOpBuilder);
49295 if (!PostShuffleMask.empty())
49296 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
49297 DAG.getUNDEF(VT), PostShuffleMask);
49298 return HorizBinOp;
49299 }
49300 }
49301 break;
49302 }
49303
49304 return SDValue();
49305}
49306
49307// Try to combine the following nodes
49308// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
49309// <i32 -2147483648[float -0.000000e+00]> 0
49310// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
49311// <(load 4 from constant-pool)> t0, t29
49312// [t30: v16i32 = bitcast t27]
49313// t6: v16i32 = xor t7, t27[t30]
49314// t11: v16f32 = bitcast t6
49315// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
49316// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
49317// t22: v16f32 = bitcast t7
49318// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
49319// t24: v32f16 = bitcast t23
49320static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
49321 const X86Subtarget &Subtarget) {
49322 EVT VT = N->getValueType(0);
49323 SDValue LHS = N->getOperand(0);
49324 SDValue RHS = N->getOperand(1);
49325 int CombineOpcode =
49326 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
49327 auto isConjugationConstant = [](const Constant *c) {
49328 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
49329 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
49330 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
49331 switch (CI->getBitWidth()) {
49332 case 16:
49333 return false;
49334 case 32:
49335 return CI->getValue() == ConjugationInt32;
49336 case 64:
49337 return CI->getValue() == ConjugationInt64;
49338 default:
49339 llvm_unreachable("Unexpected bit width");
49340 }
49341 }
49342 if (const auto *CF = dyn_cast<ConstantFP>(c))
49343 return CF->isNegativeZeroValue();
49344 return false;
49345 };
49346 auto combineConjugation = [&](SDValue &r) {
49347 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
49348 SDValue XOR = LHS.getOperand(0);
49349 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
49350 SDValue XORRHS = XOR.getOperand(1);
49351 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
49352 XORRHS = XORRHS.getOperand(0);
49353 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
49354 XORRHS.getOperand(1).getNumOperands()) {
49355 ConstantPoolSDNode *CP =
49356 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
49357 if (CP && isConjugationConstant(CP->getConstVal())) {
49358 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
49359 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
49360 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
49361 r = DAG.getBitcast(VT, FCMulC);
49362 return true;
49363 }
49364 }
49365 }
49366 }
49367 return false;
49368 };
49369 SDValue Res;
49370 if (combineConjugation(Res))
49371 return Res;
49372 std::swap(LHS, RHS);
49373 if (combineConjugation(Res))
49374 return Res;
49375 return Res;
49376}
49377
49378// Try to combine the following nodes:
49379// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
49380static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
49381 const X86Subtarget &Subtarget) {
49382 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
49383 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
49384 Flags.hasAllowContract();
49385 };
49386
49387 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
49388 return DAG.getTarget().Options.NoSignedZerosFPMath ||
49389 Flags.hasNoSignedZeros();
49390 };
49391 auto IsVectorAllNegativeZero = [](const SDNode *N) {
49392 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
49393 return false;
49394 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
49395 "Unexpected vector type!");
49396 if (ConstantPoolSDNode *CP =
49397 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
49398 APInt AI = APInt(32, 0x80008000, true);
49399 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
49400 return CI->getValue() == AI;
49401 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
49402 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
49403 }
49404 return false;
49405 };
49406
49407 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
49408 !AllowContract(N->getFlags()))
49409 return SDValue();
49410
49411 EVT VT = N->getValueType(0);
49412 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
49413 return SDValue();
49414
49415 SDValue LHS = N->getOperand(0);
49416 SDValue RHS = N->getOperand(1);
49417 bool IsConj;
49418 SDValue FAddOp1, MulOp0, MulOp1;
49419 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
49420 &IsVectorAllNegativeZero,
49421 &HasNoSignedZero](SDValue N) -> bool {
49422 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
49423 return false;
49424 SDValue Op0 = N.getOperand(0);
49425 unsigned Opcode = Op0.getOpcode();
49426 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
49427 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
49428 MulOp0 = Op0.getOperand(0);
49429 MulOp1 = Op0.getOperand(1);
49430 IsConj = Opcode == X86ISD::VFCMULC;
49431 return true;
49432 }
49433 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
49434 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
49435 HasNoSignedZero(Op0->getFlags())) ||
49436 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
49437 MulOp0 = Op0.getOperand(0);
49438 MulOp1 = Op0.getOperand(1);
49439 IsConj = Opcode == X86ISD::VFCMADDC;
49440 return true;
49441 }
49442 }
49443 return false;
49444 };
49445
49446 if (GetCFmulFrom(LHS))
49447 FAddOp1 = RHS;
49448 else if (GetCFmulFrom(RHS))
49449 FAddOp1 = LHS;
49450 else
49451 return SDValue();
49452
49453 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
49454 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
49455 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
49456 // FIXME: How do we handle when fast math flags of FADD are different from
49457 // CFMUL's?
49458 SDValue CFmul =
49459 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
49460 return DAG.getBitcast(VT, CFmul);
49461}
49462
49463/// Do target-specific dag combines on floating-point adds/subs.
49464static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
49465 const X86Subtarget &Subtarget) {
49466 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
49467 return HOp;
49468
49469 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
49470 return COp;
49471
49472 return SDValue();
49473}
49474
49475/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
49476/// the codegen.
49477/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
49478/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
49479/// anything that is guaranteed to be transformed by DAGCombiner.
49480static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
49481 const X86Subtarget &Subtarget,
49482 const SDLoc &DL) {
49483 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
49484 SDValue Src = N->getOperand(0);
49485 unsigned SrcOpcode = Src.getOpcode();
49486 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49487
49488 EVT VT = N->getValueType(0);
49489 EVT SrcVT = Src.getValueType();
49490
49491 auto IsFreeTruncation = [VT](SDValue Op) {
49492 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
49493
49494 // See if this has been extended from a smaller/equal size to
49495 // the truncation size, allowing a truncation to combine with the extend.
49496 unsigned Opcode = Op.getOpcode();
49497 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
49498 Opcode == ISD::ZERO_EXTEND) &&
49499 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
49500 return true;
49501
49502 // See if this is a single use constant which can be constant folded.
49503 // NOTE: We don't peek through bitcasts here because there is currently
49504 // no support for constant folding truncate+bitcast+vector_of_constants. So
49505 // we'd just end up with a truncate on both operands, which would
49506 // get turned back into (truncate (binop)), causing an infinite loop.
49507 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
49508 };
49509
49510 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
49511 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
49512 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
49513 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
49514 };
49515
49516 // Don't combine if the operation has other uses.
49517 if (!Src.hasOneUse())
49518 return SDValue();
49519
49520 // Only support vector truncation for now.
49521 // TODO: i64 scalar math would benefit as well.
49522 if (!VT.isVector())
49523 return SDValue();
49524
49525 // In most cases it's only worth pre-truncating if we're only facing the cost
49526 // of one truncation.
49527 // i.e. if one of the inputs will constant fold or the input is repeated.
49528 switch (SrcOpcode) {
49529 case ISD::MUL:
49530 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
49531 // better to truncate if we have the chance.
49532 if (SrcVT.getScalarType() == MVT::i64 &&
49533 TLI.isOperationLegal(SrcOpcode, VT) &&
49534 !TLI.isOperationLegal(SrcOpcode, SrcVT))
49535 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
49536 LLVM_FALLTHROUGH;
49537 case ISD::AND:
49538 case ISD::XOR:
49539 case ISD::OR:
49540 case ISD::ADD:
49541 case ISD::SUB: {
49542 SDValue Op0 = Src.getOperand(0);
49543 SDValue Op1 = Src.getOperand(1);
49544 if (TLI.isOperationLegal(SrcOpcode, VT) &&
49545 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
49546 return TruncateArithmetic(Op0, Op1);
49547 break;
49548 }
49549 }
49550
49551 return SDValue();
49552}
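// The fold above relies on truncation distributing over wrapping integer
// binops: trunc(binop(x, y)) == binop(trunc(x), trunc(y)) for add/sub/mul and
// the bitwise ops. A minimal standalone sketch of that identity in plain C++
// (the helper below is illustrative only, not part of this file):
#include <cassert>
#include <cstdint>

static void checkTruncDistributesOverMul(uint64_t X, uint64_t Y) {
  // Multiply wide, then truncate the result...
  uint32_t WideThenTrunc = static_cast<uint32_t>(X * Y);
  // ...or truncate the inputs first; modular arithmetic yields the same bits.
  uint32_t TruncThenNarrow = static_cast<uint32_t>(X) * static_cast<uint32_t>(Y);
  assert(WideThenTrunc == TruncThenNarrow);
  (void)WideThenTrunc;
  (void)TruncThenNarrow;
}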
49553
49554/// Truncate using ISD::AND mask and X86ISD::PACKUS.
49555/// e.g. trunc <8 x i32> X to <8 x i16> -->
49556/// MaskX = X & 0xffff (clear high bits to prevent saturation)
49557/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
49558static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
49559 const X86Subtarget &Subtarget,
49560 SelectionDAG &DAG) {
49561 SDValue In = N->getOperand(0);
49562 EVT InVT = In.getValueType();
49563 EVT OutVT = N->getValueType(0);
49564
49565 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
49566 OutVT.getScalarSizeInBits());
49567 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
49568 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
49569}
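// PACKUS saturates each lane to the unsigned range of the narrower type, so
// clearing the bits above that range first guarantees the clamp never fires
// and the pack acts as a plain truncate. A one-lane scalar sketch of that
// reasoning (illustrative helpers, not part of this file):
#include <cstdint>

static uint16_t packusLane(int32_t V) { // unsigned-saturating narrow, one lane
  if (V < 0)
    return 0;
  if (V > 65535)
    return 65535;
  return static_cast<uint16_t>(V);
}

static uint16_t truncViaPackus(uint32_t V) {
  uint32_t Masked = V & 0xFFFFu; // the ISD::AND mask added above
  return packusLane(static_cast<int32_t>(Masked)); // equals (uint16_t)V
}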
49570
49571/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
49572static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
49573 const X86Subtarget &Subtarget,
49574 SelectionDAG &DAG) {
49575 SDValue In = N->getOperand(0);
49576 EVT InVT = In.getValueType();
49577 EVT OutVT = N->getValueType(0);
49578 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
49579 DAG.getValueType(OutVT));
49580 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
49581}
49582
49583/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
49584/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
49585/// legalization the truncation will be translated into a BUILD_VECTOR with each
49586/// element that is extracted from a vector and then truncated, and it is
49587/// difficult to do this optimization based on them.
49588static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
49589 const X86Subtarget &Subtarget) {
49590 EVT OutVT = N->getValueType(0);
49591 if (!OutVT.isVector())
49592 return SDValue();
49593
49594 SDValue In = N->getOperand(0);
49595 if (!In.getValueType().isSimple())
49596 return SDValue();
49597
49598 EVT InVT = In.getValueType();
49599 unsigned NumElems = OutVT.getVectorNumElements();
49600
49601 // AVX512 provides fast truncate ops.
49602 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
49603 return SDValue();
49604
49605 EVT OutSVT = OutVT.getVectorElementType();
49606 EVT InSVT = InVT.getVectorElementType();
49607 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
49608 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
49609 NumElems >= 8))
49610 return SDValue();
49611
49612 // SSSE3's pshufb results in fewer instructions in the cases below.
49613 if (Subtarget.hasSSSE3() && NumElems == 8) {
49614 if (InSVT == MVT::i16)
49615 return SDValue();
49616 if (InSVT == MVT::i32 &&
49617 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
49618 return SDValue();
49619 }
49620
49621 SDLoc DL(N);
49622 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
49623 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
49624 // truncate 2 x v4i32 to v8i16.
49625 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
49626 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
49627 if (InSVT == MVT::i32)
49628 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
49629
49630 return SDValue();
49631}
49632
49633 /// This function transforms vector truncation of 'extended sign-bits' or
49634 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
49635 /// X86ISD::PACKSS/PACKUS operations.
49636static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
49637 SelectionDAG &DAG,
49638 const X86Subtarget &Subtarget) {
49639 // Requires SSE2.
49640 if (!Subtarget.hasSSE2())
49641 return SDValue();
49642
49643 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
49644 return SDValue();
49645
49646 SDValue In = N->getOperand(0);
49647 if (!In.getValueType().isSimple())
49648 return SDValue();
49649
49650 MVT VT = N->getValueType(0).getSimpleVT();
49651 MVT SVT = VT.getScalarType();
49652
49653 MVT InVT = In.getValueType().getSimpleVT();
49654 MVT InSVT = InVT.getScalarType();
49655
49656 // Check we have a truncation suited for PACKSS/PACKUS.
49657 if (!isPowerOf2_32(VT.getVectorNumElements()))
49658 return SDValue();
49659 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
49660 return SDValue();
49661 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
49662 return SDValue();
49663
49664 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
49665 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
49666 return SDValue();
49667
49668 // AVX512 has fast truncate, but if the input is already going to be split,
49669 // there's no harm in trying pack.
49670 if (Subtarget.hasAVX512() &&
49671 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
49672 InVT.is512BitVector())) {
49673 // PACK should still be worth it for 128-bit vectors if the sources were
49674 // originally concatenated from subvectors.
49675 SmallVector<SDValue> ConcatOps;
49676 if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
49677 return SDValue();
49678 }
49679
49680 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
49681 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
49682
49683 // Use PACKUS if the input has zero-bits that extend all the way to the
49684 // packed/truncated value. e.g. masks, zext_in_reg, etc.
49685 KnownBits Known = DAG.computeKnownBits(In);
49686 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
49687 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
49688 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
49689
49690 // Use PACKSS if the input has sign-bits that extend all the way to the
49691 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
49692 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
49693
49694 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
49695 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
49696 // on and combines/simplifications can't then use it.
49697 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
49698 return SDValue();
49699
49700 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
49701 if (NumSignBits > MinSignBits)
49702 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
49703
49704 // If we have a srl that only generates signbits that we will discard in
49705 // the truncation then we can use PACKSS by converting the srl to a sra.
49706 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
49707 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
49708 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
49709 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
49710 if (*ShAmt == MinSignBits) {
49711 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
49712 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
49713 Subtarget);
49714 }
49715 }
49716
49717 return SDValue();
49718}
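// PACKSS/PACKUS only behave as a plain truncate when the bits being dropped
// are pure sign-extension (PACKSS) or known zero (PACKUS), which is exactly
// what the sign-bit / known-zero checks above establish. A one-lane scalar
// sketch of the PACKSS side (illustrative helper, not part of this file):
#include <cstdint>

// When V has at least 17 sign bits it already fits in i16, the clamps below
// never fire, and the saturating pack degenerates to a plain (int16_t)V.
static int16_t packssLane(int32_t V) {
  if (V > INT16_MAX)
    return INT16_MAX;
  if (V < INT16_MIN)
    return INT16_MIN;
  return static_cast<int16_t>(V);
}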
49719
49720// Try to form a MULHU or MULHS node by looking for
49721// (trunc (srl (mul ext, ext), 16))
49722// TODO: This is X86 specific because we want to be able to handle wide types
49723// before type legalization. But we can only do it if the vector will be
49724// legalized via widening/splitting. Type legalization can't handle promotion
49725// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49726// combiner.
49727static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
49728 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
49729 // First instruction should be a right shift of a multiply.
49730 if (Src.getOpcode() != ISD::SRL ||
49731 Src.getOperand(0).getOpcode() != ISD::MUL)
49732 return SDValue();
49733
49734 if (!Subtarget.hasSSE2())
49735 return SDValue();
49736
49737 // Only handle vXi16 types that are at least 128-bits unless they will be
49738 // widened.
49739 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
49740 return SDValue();
49741
49742 // Input type should be at least vXi32.
49743 EVT InVT = Src.getValueType();
49744 if (InVT.getVectorElementType().getSizeInBits() < 32)
49745 return SDValue();
49746
49747 // Need a shift by 16.
49748 APInt ShiftAmt;
49749 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
49750 ShiftAmt != 16)
49751 return SDValue();
49752
49753 SDValue LHS = Src.getOperand(0).getOperand(0);
49754 SDValue RHS = Src.getOperand(0).getOperand(1);
49755
49756 // Count leading sign/zero bits on both inputs - if there are enough then
49757 // truncation back to vXi16 will be cheap - either as a pack/shuffle
49758 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
49759 // truncations may actually be free by peeking through to the ext source.
49760 auto IsSext = [&DAG](SDValue V) {
49761 return DAG.ComputeMaxSignificantBits(V) <= 16;
49762 };
49763 auto IsZext = [&DAG](SDValue V) {
49764 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
49765 };
49766
49767 bool IsSigned = IsSext(LHS) && IsSext(RHS);
49768 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
49769 if (!IsSigned && !IsUnsigned)
49770 return SDValue();
49771
49772 // Check if both inputs are extensions, which will be removed by truncation.
49773 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
49774 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
49775 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
49776 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
49777 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
49778 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
49779
49780 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
49781 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
49782 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
49783 // will have to split anyway.
49784 unsigned InSizeInBits = InVT.getSizeInBits();
49785 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
49786 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
49787 (InSizeInBits % 16) == 0) {
49788 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49789 InVT.getSizeInBits() / 16);
49790 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
49791 DAG.getBitcast(BCVT, RHS));
49792 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
49793 }
49794
49795 // Truncate back to source type.
49796 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
49797 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
49798
49799 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
49800 return DAG.getNode(Opc, DL, VT, LHS, RHS);
49801}
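// The pattern matched here is the scalar "high half of the product" idiom:
// trunc((ext(a) * ext(b)) >> 16) is exactly MULHU/MULHS on the narrow type.
// A standalone sketch of both variants (helper names are illustrative, not
// part of this file):
#include <cstdint>

static uint16_t mulhu16(uint16_t A, uint16_t B) {
  return static_cast<uint16_t>((static_cast<uint32_t>(A) * B) >> 16);
}

static int16_t mulhs16(int16_t A, int16_t B) {
  return static_cast<int16_t>((static_cast<int32_t>(A) * B) >> 16);
}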
49802
49803// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
49804// from one vector with signed bytes from another vector, adds together
49805// adjacent pairs of 16-bit products, and saturates the result before
49806// truncating to 16-bits.
49807//
49808// Which looks something like this:
49809// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
49810// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
49811static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
49812 const X86Subtarget &Subtarget,
49813 const SDLoc &DL) {
49814 if (!VT.isVector() || !Subtarget.hasSSSE3())
49815 return SDValue();
49816
49817 unsigned NumElems = VT.getVectorNumElements();
49818 EVT ScalarVT = VT.getVectorElementType();
49819 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
49820 return SDValue();
49821
49822 SDValue SSatVal = detectSSatPattern(In, VT);
49823 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
49824 return SDValue();
49825
49826 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
49827 // of multiplies from even/odd elements.
49828 SDValue N0 = SSatVal.getOperand(0);
49829 SDValue N1 = SSatVal.getOperand(1);
49830
49831 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49832 return SDValue();
49833
49834 SDValue N00 = N0.getOperand(0);
49835 SDValue N01 = N0.getOperand(1);
49836 SDValue N10 = N1.getOperand(0);
49837 SDValue N11 = N1.getOperand(1);
49838
49839 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
49840 // Canonicalize zero_extend to LHS.
49841 if (N01.getOpcode() == ISD::ZERO_EXTEND)
49842 std::swap(N00, N01);
49843 if (N11.getOpcode() == ISD::ZERO_EXTEND)
49844 std::swap(N10, N11);
49845
49846 // Ensure we have a zero_extend and a sign_extend.
49847 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
49848 N01.getOpcode() != ISD::SIGN_EXTEND ||
49849 N10.getOpcode() != ISD::ZERO_EXTEND ||
49850 N11.getOpcode() != ISD::SIGN_EXTEND)
49851 return SDValue();
49852
49853 // Peek through the extends.
49854 N00 = N00.getOperand(0);
49855 N01 = N01.getOperand(0);
49856 N10 = N10.getOperand(0);
49857 N11 = N11.getOperand(0);
49858
49859 // Ensure the extend is from vXi8.
49860 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
49861 N01.getValueType().getVectorElementType() != MVT::i8 ||
49862 N10.getValueType().getVectorElementType() != MVT::i8 ||
49863 N11.getValueType().getVectorElementType() != MVT::i8)
49864 return SDValue();
49865
49866 // All inputs should be build_vectors.
49867 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49868 N01.getOpcode() != ISD::BUILD_VECTOR ||
49869 N10.getOpcode() != ISD::BUILD_VECTOR ||
49870 N11.getOpcode() != ISD::BUILD_VECTOR)
49871 return SDValue();
49872
49873 // N00/N10 are zero extended. N01/N11 are sign extended.
49874
49875 // For each output element, we need the even element from one vector
49876 // multiplied by the even element from the other vector, added to the
49877 // product of the corresponding odd elements from the same two vectors.
49878 // So we need to make sure that for each element i, this operation
49879 // is being performed:
49880 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49881 SDValue ZExtIn, SExtIn;
49882 for (unsigned i = 0; i != NumElems; ++i) {
49883 SDValue N00Elt = N00.getOperand(i);
49884 SDValue N01Elt = N01.getOperand(i);
49885 SDValue N10Elt = N10.getOperand(i);
49886 SDValue N11Elt = N11.getOperand(i);
49887 // TODO: Be more tolerant to undefs.
49888 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49889 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49890 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49891 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49892 return SDValue();
49893 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49894 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49895 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49896 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49897 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49898 return SDValue();
49899 unsigned IdxN00 = ConstN00Elt->getZExtValue();
49900 unsigned IdxN01 = ConstN01Elt->getZExtValue();
49901 unsigned IdxN10 = ConstN10Elt->getZExtValue();
49902 unsigned IdxN11 = ConstN11Elt->getZExtValue();
49903 // Add is commutative so indices can be reordered.
49904 if (IdxN00 > IdxN10) {
49905 std::swap(IdxN00, IdxN10);
49906 std::swap(IdxN01, IdxN11);
49907 }
49908 // N0 indices must be the even elements. N1 indices must be the next odd elements.
49909 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49910 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49911 return SDValue();
49912 SDValue N00In = N00Elt.getOperand(0);
49913 SDValue N01In = N01Elt.getOperand(0);
49914 SDValue N10In = N10Elt.getOperand(0);
49915 SDValue N11In = N11Elt.getOperand(0);
49916 // First time we find an input capture it.
49917 if (!ZExtIn) {
49918 ZExtIn = N00In;
49919 SExtIn = N01In;
49920 }
49921 if (ZExtIn != N00In || SExtIn != N01In ||
49922 ZExtIn != N10In || SExtIn != N11In)
49923 return SDValue();
49924 }
49925
49926 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49927 ArrayRef<SDValue> Ops) {
49928 // Shrink by adding truncate nodes and let DAGCombine fold with the
49929 // sources.
49930 EVT InVT = Ops[0].getValueType();
49931 assert(InVT.getScalarType() == MVT::i8 &&
49932 "Unexpected scalar element type");
49933 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49934 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49935 InVT.getVectorNumElements() / 2);
49936 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
49937 };
49938 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
49939 PMADDBuilder);
49940}
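// Per 16-bit output lane, PMADDUBSW computes a signed-saturated sum of two
// products: an unsigned byte from A times a signed byte from B, for the even
// and odd byte of each pair. A scalar model of one output lane (illustrative
// helper, not part of this file):
#include <cstdint>

static int16_t pmaddubswLane(uint8_t AEven, uint8_t AOdd, int8_t BEven,
                             int8_t BOdd) {
  int32_t Sum = static_cast<int32_t>(AEven) * BEven +
                static_cast<int32_t>(AOdd) * BOdd;
  if (Sum > INT16_MAX)
    return INT16_MAX; // signed saturation, as in detectSSatPattern above
  if (Sum < INT16_MIN)
    return INT16_MIN;
  return static_cast<int16_t>(Sum);
}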
49941
49942static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
49943 const X86Subtarget &Subtarget) {
49944 EVT VT = N->getValueType(0);
49945 SDValue Src = N->getOperand(0);
49946 SDLoc DL(N);
49947
49948 // Attempt to pre-truncate inputs to arithmetic ops instead.
49949 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
49950 return V;
49951
49952 // Try to detect AVG pattern first.
49953 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
49954 return Avg;
49955
49956 // Try to detect PMADD
49957 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
49958 return PMAdd;
49959
49960 // Try to combine truncation with signed/unsigned saturation.
49961 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
49962 return Val;
49963
49964 // Try to combine PMULHUW/PMULHW for vXi16.
49965 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
49966 return V;
49967
49968 // The bitcast source is a direct mmx result.
49970 // Detect bitcasts from x86mmx to i32.
49970 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
49971 SDValue BCSrc = Src.getOperand(0);
49972 if (BCSrc.getValueType() == MVT::x86mmx)
49973 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
49974 }
49975
49976 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
49977 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
49978 return V;
49979
49980 return combineVectorTruncation(N, DAG, Subtarget);
49981}
49982
49983static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
49984 TargetLowering::DAGCombinerInfo &DCI) {
49985 EVT VT = N->getValueType(0);
49986 SDValue In = N->getOperand(0);
49987 SDLoc DL(N);
49988
49989 if (auto SSatVal = detectSSatPattern(In, VT))
49990 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
49991 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
49992 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
49993
49994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49995 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
49996 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
49997 return SDValue(N, 0);
49998
49999 return SDValue();
50000}
50001
50002/// Returns the negated value if the node \p N flips sign of FP value.
50003///
50004/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
50005/// or FSUB(0, x)
50006/// AVX512F does not have FXOR, so FNEG is lowered as
50007/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
50008 /// In this case we go through all bitcasts.
50009/// This also recognizes splat of a negated value and returns the splat of that
50010/// value.
50011static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
50012 if (N->getOpcode() == ISD::FNEG)
50013 return N->getOperand(0);
50014
50015 // Don't recurse exponentially.
50016 if (Depth > SelectionDAG::MaxRecursionDepth)
50017 return SDValue();
50018
50019 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
50020
50021 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
50022 EVT VT = Op->getValueType(0);
50023
50024 // Make sure the element size doesn't change.
50025 if (VT.getScalarSizeInBits() != ScalarSize)
50026 return SDValue();
50027
50028 unsigned Opc = Op.getOpcode();
50029 switch (Opc) {
50030 case ISD::VECTOR_SHUFFLE: {
50031 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
50032 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
50033 if (!Op.getOperand(1).isUndef())
50034 return SDValue();
50035 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
50036 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
50037 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
50038 cast<ShuffleVectorSDNode>(Op)->getMask());
50039 break;
50040 }
50041 case ISD::INSERT_VECTOR_ELT: {
50042 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
50043 // -V, INDEX).
50044 SDValue InsVector = Op.getOperand(0);
50045 SDValue InsVal = Op.getOperand(1);
50046 if (!InsVector.isUndef())
50047 return SDValue();
50048 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
50049 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
50050 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
50051 NegInsVal, Op.getOperand(2));
50052 break;
50053 }
50054 case ISD::FSUB:
50055 case ISD::XOR:
50056 case X86ISD::FXOR: {
50057 SDValue Op1 = Op.getOperand(1);
50058 SDValue Op0 = Op.getOperand(0);
50059
50060 // For XOR and FXOR, we want to check if constant
50061 // bits of Op1 are sign bit masks. For FSUB, we
50062 // have to check if constant bits of Op0 are sign
50063 // bit masks and hence we swap the operands.
50064 if (Opc == ISD::FSUB)
50065 std::swap(Op0, Op1);
50066
50067 APInt UndefElts;
50068 SmallVector<APInt, 16> EltBits;
50069 // Extract constant bits and see if they are all
50070 // sign bit masks. Ignore the undef elements.
50071 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
50072 /* AllowWholeUndefs */ true,
50073 /* AllowPartialUndefs */ false)) {
50074 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
50075 if (!UndefElts[I] && !EltBits[I].isSignMask())
50076 return SDValue();
50077
50078 return peekThroughBitcasts(Op0);
50079 }
50080 }
50081 }
50082
50083 return SDValue();
50084}
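// The XOR/FXOR form is recognized above because flipping only the sign bit of
// an IEEE-754 value negates it. A standalone bit-level sketch (illustrative
// helper, not part of this file):
#include <cstdint>
#include <cstring>

static float fnegViaXor(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits)); // bitcast f32 -> i32
  Bits ^= 0x80000000u;                  // xor with the sign-bit mask
  float Result;
  std::memcpy(&Result, &Bits, sizeof(Result)); // bitcast back
  return Result;                               // == -X, including for +/-0.0
}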
50085
50086static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
50087 bool NegRes) {
50088 if (NegMul) {
50089 switch (Opcode) {
50090 default: llvm_unreachable("Unexpected opcode");
50091 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
50092 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
50093 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
50094 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
50095 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
50096 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
50097 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
50098 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
50099 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
50100 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
50101 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
50102 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
50103 }
50104 }
50105
50106 if (NegAcc) {
50107 switch (Opcode) {
50108 default: llvm_unreachable("Unexpected opcode");
50109 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
50110 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
50111 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
50112 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
50113 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
50114 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
50115 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
50116 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
50117 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
50118 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
50119 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
50120 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
50121 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
50122 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
50123 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
50124 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
50125 }
50126 }
50127
50128 if (NegRes) {
50129 switch (Opcode) {
50130 // For accuracy reasons, we never combine fneg and fma under strict FP.
50131 default: llvm_unreachable("Unexpected opcode");
50132 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
50133 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
50134 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
50135 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
50136 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
50137 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
50138 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
50139 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
50140 }
50141 }
50142
50143 return Opcode;
50144}
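// The remapping above encodes simple algebra on a*b + c: negating one
// multiplicand swaps FMADD<->FNMADD and FMSUB<->FNMSUB, negating the
// accumulator swaps FMADD<->FMSUB and FNMADD<->FNMSUB, and negating the whole
// result swaps FMADD<->FNMSUB and FMSUB<->FNMADD. A scalar sketch of the four
// flavors (illustrative helpers, not part of this file):
#include <cmath>

static double fmadd(double A, double B, double C) { return std::fma(A, B, C); }
static double fmsub(double A, double B, double C) { return std::fma(A, B, -C); }
static double fnmadd(double A, double B, double C) { return std::fma(-A, B, C); }
static double fnmsub(double A, double B, double C) { return std::fma(-A, B, -C); }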
50145
50146/// Do target-specific dag combines on floating point negations.
50147static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
50148 TargetLowering::DAGCombinerInfo &DCI,
50149 const X86Subtarget &Subtarget) {
50150 EVT OrigVT = N->getValueType(0);
50151 SDValue Arg = isFNEG(DAG, N);
50152 if (!Arg)
50153 return SDValue();
50154
50155 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50156 EVT VT = Arg.getValueType();
50157 EVT SVT = VT.getScalarType();
50158 SDLoc DL(N);
50159
50160 // Let legalize expand this if it isn't a legal type yet.
50161 if (!TLI.isTypeLegal(VT))
50162 return SDValue();
50163
50164 // If we're negating a FMUL node on a target with FMA, then we can avoid the
50165 // use of a constant by performing (-0 - A*B) instead.
50166 // FIXME: Check rounding control flags as well once it becomes available.
50167 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
50168 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
50169 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
50170 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
50171 Arg.getOperand(1), Zero);
50172 return DAG.getBitcast(OrigVT, NewNode);
50173 }
50174
50175 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
50176 bool LegalOperations = !DCI.isBeforeLegalizeOps();
50177 if (SDValue NegArg =
50178 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
50179 return DAG.getBitcast(OrigVT, NegArg);
50180
50181 return SDValue();
50182}
50183
50184SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
50185 bool LegalOperations,
50186 bool ForCodeSize,
50187 NegatibleCost &Cost,
50188 unsigned Depth) const {
50189 // fneg patterns are removable even if they have multiple uses.
50190 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
50191 Cost = NegatibleCost::Cheaper;
50192 return DAG.getBitcast(Op.getValueType(), Arg);
50193 }
50194
50195 EVT VT = Op.getValueType();
50196 EVT SVT = VT.getScalarType();
50197 unsigned Opc = Op.getOpcode();
50198 SDNodeFlags Flags = Op.getNode()->getFlags();
50199 switch (Opc) {
50200 case ISD::FMA:
50201 case X86ISD::FMSUB:
50202 case X86ISD::FNMADD:
50203 case X86ISD::FNMSUB:
50204 case X86ISD::FMADD_RND:
50205 case X86ISD::FMSUB_RND:
50206 case X86ISD::FNMADD_RND:
50207 case X86ISD::FNMSUB_RND: {
50208 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
50209 !(SVT == MVT::f32 || SVT == MVT::f64) ||
50210 !isOperationLegal(ISD::FMA, VT))
50211 break;
50212
50213 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
50214 // if it may have signed zeros.
50215 if (!Flags.hasNoSignedZeros())
50216 break;
50217
50218 // This is always negatible for free but we might be able to remove some
50219 // extra operand negations as well.
50220 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
50221 for (int i = 0; i != 3; ++i)
50222 NewOps[i] = getCheaperNegatedExpression(
50223 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
50224
50225 bool NegA = !!NewOps[0];
50226 bool NegB = !!NewOps[1];
50227 bool NegC = !!NewOps[2];
50228 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
50229
50230 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
50231 : NegatibleCost::Neutral;
50232
50233 // Fill in the non-negated ops with the original values.
50234 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
50235 if (!NewOps[i])
50236 NewOps[i] = Op.getOperand(i);
50237 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
50238 }
50239 case X86ISD::FRCP:
50240 if (SDValue NegOp0 =
50241 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
50242 ForCodeSize, Cost, Depth + 1))
50243 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
50244 break;
50245 }
50246
50247 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
50248 ForCodeSize, Cost, Depth);
50249}
50250
50251static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
50252 const X86Subtarget &Subtarget) {
50253 MVT VT = N->getSimpleValueType(0);
50254 // If we have integer vector types available, use the integer opcodes.
50255 if (!VT.isVector() || !Subtarget.hasSSE2())
50256 return SDValue();
50257
50258 SDLoc dl(N);
50259
50260 unsigned IntBits = VT.getScalarSizeInBits();
50261 MVT IntSVT = MVT::getIntegerVT(IntBits);
50262 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
50263
50264 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
50265 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
50266 unsigned IntOpcode;
50267 switch (N->getOpcode()) {
50268 default: llvm_unreachable("Unexpected FP logic op");
50269 case X86ISD::FOR: IntOpcode = ISD::OR; break;
50270 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
50271 case X86ISD::FAND: IntOpcode = ISD::AND; break;
50272 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
50273 }
50274 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
50275 return DAG.getBitcast(VT, IntOp);
50276}
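// FP logic ops only manipulate bit patterns, so bitcasting to integers, doing
// the integer op, and bitcasting back is value-preserving. A scalar sketch of
// the same trick for FAND (illustrative helper, not part of this file):
#include <cstdint>
#include <cstring>

static float fandViaIntegers(float A, float B) {
  uint32_t IA, IB;
  std::memcpy(&IA, &A, sizeof(IA)); // bitcast f32 -> i32
  std::memcpy(&IB, &B, sizeof(IB));
  uint32_t IR = IA & IB;            // the integer opcode (ISD::AND for FAND)
  float R;
  std::memcpy(&R, &IR, sizeof(R));  // bitcast back to f32
  return R;
}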
50277
50278
50279/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
50280static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
50281 if (N->getOpcode() != ISD::XOR)
50282 return SDValue();
50283
50284 SDValue LHS = N->getOperand(0);
50285 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
50286 return SDValue();
50287
50288 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
50289 X86::CondCode(LHS->getConstantOperandVal(0)));
50290 SDLoc DL(N);
50291 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
50292}
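// XOR-ing a 0/1 condition flag with 1 is the same as evaluating the inverted
// condition, which is the identity the fold above exploits. A trivial scalar
// sketch (illustrative only, not part of this file):
#include <cassert>

static void checkXor1InvertsSetcc(int A, int B) {
  unsigned IsEq = (A == B) ? 1u : 0u; // setcc with condition EQ
  unsigned IsNe = (A != B) ? 1u : 0u; // setcc with the opposite condition
  assert((IsEq ^ 1u) == IsNe);
  (void)IsEq;
  (void)IsNe;
}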
50293
50294static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
50295 TargetLowering::DAGCombinerInfo &DCI,
50296 const X86Subtarget &Subtarget) {
50297 SDValue N0 = N->getOperand(0);
50298 SDValue N1 = N->getOperand(1);
50299 EVT VT = N->getValueType(0);
50300
50301 // If this is SSE1 only convert to FXOR to avoid scalarization.
50302 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50303 return DAG.getBitcast(MVT::v4i32,
50304 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
50305 DAG.getBitcast(MVT::v4f32, N0),
50306 DAG.getBitcast(MVT::v4f32, N1)));
50307 }
50308
50309 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
50310 return Cmp;
50311
50312 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50313 return R;
50314
50315 if (SDValue R = combineBitOpWithShift(N, DAG))
50316 return R;
50317
50318 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50319 return FPLogic;
50320
50321 if (DCI.isBeforeLegalizeOps())
50322 return SDValue();
50323
50324 if (SDValue SetCC = foldXor1SetCC(N, DAG))
50325 return SetCC;
50326
50327 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
50328 return RV;
50329
50330 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
50331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50332 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
50333 N0.getOperand(0).getValueType().isVector() &&
50334 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50335 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
50336 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
50337 N0.getOperand(0).getValueType()));
50338 }
50339
50340 // Handle AVX512 mask widening.
50341 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
50342 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
50343 VT.getVectorElementType() == MVT::i1 &&
50344 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
50345 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
50346 return DAG.getNode(
50347 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
50348 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
50349 N0.getOperand(2));
50350 }
50351
50352 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
50353 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
50354 // TODO: Under what circumstances could this be performed in DAGCombine?
50355 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
50356 N0.getOperand(0).getOpcode() == N->getOpcode()) {
50357 SDValue TruncExtSrc = N0.getOperand(0);
50358 auto *N1C = dyn_cast<ConstantSDNode>(N1);
50359 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
50360 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
50361 SDLoc DL(N);
50362 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
50363 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
50364 return DAG.getNode(ISD::XOR, DL, VT, LHS,
50365 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
50366 }
50367 }
50368
50369 return combineFneg(N, DAG, DCI, Subtarget);
50370}
50371
50372static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
50373 TargetLowering::DAGCombinerInfo &DCI,
50374 const X86Subtarget &Subtarget) {
50375 EVT VT = N->getValueType(0);
50376 unsigned NumBits = VT.getSizeInBits();
50377
50378 // TODO - Constant Folding.
50379
50380 // Simplify the inputs.
50381 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50382 APInt DemandedMask(APInt::getAllOnes(NumBits));
50383 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
50384 return SDValue(N, 0);
50385
50386 return SDValue();
50387}
50388
50389static bool isNullFPScalarOrVectorConst(SDValue V) {
50390 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
50391}
50392
50393/// If a value is a scalar FP zero or a vector FP zero (potentially including
50394/// undefined elements), return a zero constant that may be used to fold away
50395/// that value. In the case of a vector, the returned constant will not contain
50396/// undefined elements even if the input parameter does. This makes it suitable
50397/// to be used as a replacement operand with operations (eg, bitwise-and) where
50398/// an undef should not propagate.
50399static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
50400 const X86Subtarget &Subtarget) {
50401 if (!isNullFPScalarOrVectorConst(V))
50402 return SDValue();
50403
50404 if (V.getValueType().isVector())
50405 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
50406
50407 return V;
50408}
50409
50410static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
50411 const X86Subtarget &Subtarget) {
50412 SDValue N0 = N->getOperand(0);
50413 SDValue N1 = N->getOperand(1);
50414 EVT VT = N->getValueType(0);
50415 SDLoc DL(N);
50416
50417 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
50418 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
50419 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
50420 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
50421 return SDValue();
50422
50423 auto isAllOnesConstantFP = [](SDValue V) {
50424 if (V.getSimpleValueType().isVector())
50425 return ISD::isBuildVectorAllOnes(V.getNode());
50426 auto *C = dyn_cast<ConstantFPSDNode>(V);
50427 return C && C->getConstantFPValue()->isAllOnesValue();
50428 };
50429
50430 // fand (fxor X, -1), Y --> fandn X, Y
50431 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
50432 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
50433
50434 // fand X, (fxor Y, -1) --> fandn Y, X
50435 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
50436 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
50437
50438 return SDValue();
50439}
50440
50441/// Do target-specific dag combines on X86ISD::FAND nodes.
50442static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
50443 const X86Subtarget &Subtarget) {
50444 // FAND(0.0, x) -> 0.0
50445 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
50446 return V;
50447
50448 // FAND(x, 0.0) -> 0.0
50449 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
50450 return V;
50451
50452 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
50453 return V;
50454
50455 return lowerX86FPLogicOp(N, DAG, Subtarget);
50456}
50457
50458/// Do target-specific dag combines on X86ISD::FANDN nodes.
50459static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
50460 const X86Subtarget &Subtarget) {
50461 // FANDN(0.0, x) -> x
50462 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
50463 return N->getOperand(1);
50464
50465 // FANDN(x, 0.0) -> 0.0
50466 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
50467 return V;
50468
50469 return lowerX86FPLogicOp(N, DAG, Subtarget);
50470}
50471
50472/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
50473static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
50474 TargetLowering::DAGCombinerInfo &DCI,
50475 const X86Subtarget &Subtarget) {
50476 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
50477
50478 // F[X]OR(0.0, x) -> x
50479 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
50480 return N->getOperand(1);
50481
50482 // F[X]OR(x, 0.0) -> x
50483 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
50484 return N->getOperand(0);
50485
50486 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
50487 return NewVal;
50488
50489 return lowerX86FPLogicOp(N, DAG, Subtarget);
50490}
50491
50492/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
50493static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
50494 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
50495
50496 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
50497 if (!DAG.getTarget().Options.NoNaNsFPMath ||
50498 !DAG.getTarget().Options.NoSignedZerosFPMath)
50499 return SDValue();
50500
50501 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
50502 // into FMAXC and FMINC, which are commutative operations.
50503 unsigned NewOp = 0;
50504 switch (N->getOpcode()) {
50505 default: llvm_unreachable("unknown opcode");
50506 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
50507 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
50508 }
50509
50510 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
50511 N->getOperand(0), N->getOperand(1));
50512}
50513
50514static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
50515 const X86Subtarget &Subtarget) {
50516 if (Subtarget.useSoftFloat())
50517 return SDValue();
50518
50519 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50520
50521 EVT VT = N->getValueType(0);
50522 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
50523 (Subtarget.hasSSE2() && VT == MVT::f64) ||
50524 (Subtarget.hasFP16() && VT == MVT::f16) ||
50525 (VT.isVector() && TLI.isTypeLegal(VT))))
50526 return SDValue();
50527
50528 SDValue Op0 = N->getOperand(0);
50529 SDValue Op1 = N->getOperand(1);
50530 SDLoc DL(N);
50531 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
50532
50533 // If we don't have to respect NaN inputs, this is a direct translation to x86
50534 // min/max instructions.
50535 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
50536 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50537
50538 // If one of the operands is known non-NaN use the native min/max instructions
50539 // with the non-NaN input as second operand.
50540 if (DAG.isKnownNeverNaN(Op1))
50541 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50542 if (DAG.isKnownNeverNaN(Op0))
50543 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
50544
50545 // If we have to respect NaN inputs, this takes at least 3 instructions.
50546 // Favor a library call when operating on a scalar and minimizing code size.
50547 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
50548 return SDValue();
50549
50550 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
50551 VT);
50552
50553 // There are 4 possibilities involving NaN inputs, and these are the required
50554 // outputs:
50555 // Op1
50556 // Num NaN
50557 // ----------------
50558 // Num | Max | Op0 |
50559 // Op0 ----------------
50560 // NaN | Op1 | NaN |
50561 // ----------------
50562 //
50563 // The SSE FP max/min instructions were not designed for this case, but rather
50564 // to implement:
50565 // Min = Op1 < Op0 ? Op1 : Op0
50566 // Max = Op1 > Op0 ? Op1 : Op0
50567 //
50568 // So they always return Op0 if either input is a NaN. However, we can still
50569 // use those instructions for fmaxnum by selecting away a NaN input.
50570
50571 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
50572 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
50573 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
50574
50575 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
50576 // are NaN, the NaN value of Op1 is the result.
50577 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
50578}
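// The lowering above commutes the operands so the SSE node passes Op0 through
// whenever either input is NaN, then selects Op1 when Op0 itself is NaN,
// reproducing the table in the comment. A scalar model of that recipe for
// fmaxnum (illustrative helpers, not part of this file):
#include <cmath>

static double sseMax(double Src1, double Src2) {
  // maxsd semantics: an ordered compare, so any NaN falls through to Src2.
  return Src1 > Src2 ? Src1 : Src2;
}

static double fmaxnumViaSSE(double Op0, double Op1) {
  double MinOrMax = sseMax(Op1, Op0); // NaN in either input -> Op0
  bool IsOp0Nan = std::isnan(Op0);    // the SETUO compare of Op0 with itself
  return IsOp0Nan ? Op1 : MinOrMax;
}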
50579
50580static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
50581 TargetLowering::DAGCombinerInfo &DCI) {
50582 EVT VT = N->getValueType(0);
50583 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50584
50585 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50586 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50587 return SDValue(N, 0);
50588
50589 // Convert a full vector load into vzload when not all bits are needed.
50590 SDValue In = N->getOperand(0);
50591 MVT InVT = In.getSimpleValueType();
50592 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50593 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50594 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50595 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
50596 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50597 MVT MemVT = MVT::getIntegerVT(NumBits);
50598 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50599 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50600 SDLoc dl(N);
50601 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
50602 DAG.getBitcast(InVT, VZLoad));
50603 DCI.CombineTo(N, Convert);
50604 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50605 DCI.recursivelyDeleteUnusedNodes(LN);
50606 return SDValue(N, 0);
50607 }
50608 }
50609
50610 return SDValue();
50611}
50612
50613static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
50614 TargetLowering::DAGCombinerInfo &DCI) {
50615 bool IsStrict = N->isTargetStrictFPOpcode();
50616 EVT VT = N->getValueType(0);
50617
50618 // Convert a full vector load into vzload when not all bits are needed.
50619 SDValue In = N->getOperand(IsStrict ? 1 : 0);
50620 MVT InVT = In.getSimpleValueType();
50621 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50622 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50623 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50624 LoadSDNode *LN = cast<LoadSDNode>(In);
50625 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50626 MVT MemVT = MVT::getFloatingPointVT(NumBits);
50627 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50628 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50629 SDLoc dl(N);
50630 if (IsStrict) {
50631 SDValue Convert =
50632 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
50633 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
50634 DCI.CombineTo(N, Convert, Convert.getValue(1));
50635 } else {
50636 SDValue Convert =
50637 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
50638 DCI.CombineTo(N, Convert);
50639 }
50640 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50641 DCI.recursivelyDeleteUnusedNodes(LN);
50642 return SDValue(N, 0);
50643 }
50644 }
50645
50646 return SDValue();
50647}
50648
50649/// Do target-specific dag combines on X86ISD::ANDNP nodes.
50650static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
50651 TargetLowering::DAGCombinerInfo &DCI,
50652 const X86Subtarget &Subtarget) {
50653 SDValue N0 = N->getOperand(0);
50654 SDValue N1 = N->getOperand(1);
50655 MVT VT = N->getSimpleValueType(0);
50656
50657 // ANDNP(0, x) -> x
50658 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50659 return N1;
50660
50661 // ANDNP(x, 0) -> 0
50662 if (ISD::isBuildVectorAllZeros(N1.getNode()))
50663 return DAG.getConstant(0, SDLoc(N), VT);
50664
50665 // Turn ANDNP back to AND if input is inverted.
50666 if (SDValue Not = IsNOT(N0, DAG))
50667 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
50668
50669 // Attempt to recursively combine a bitmask ANDNP with shuffles.
50670 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50671 SDValue Op(N, 0);
50672 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50673 return Res;
50674
50675 // If either operand is a constant mask, then only the elements that aren't
50676 // zero are actually demanded by the other operand.
50677 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
50678 APInt UndefElts;
50679 SmallVector<APInt> EltBits;
50680 int NumElts = VT.getVectorNumElements();
50681 int EltSizeInBits = VT.getScalarSizeInBits();
50682 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50683 APInt DemandedElts = APInt::getAllOnes(NumElts);
50684 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50685 EltBits)) {
50686 DemandedBits.clearAllBits();
50687 DemandedElts.clearAllBits();
50688 for (int I = 0; I != NumElts; ++I)
50689 if ((Invert && !EltBits[I].isAllOnes()) ||
50690 (!Invert && !EltBits[I].isZero())) {
50691 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
50692 DemandedElts.setBit(I);
50693 }
50694 }
50695 return std::make_pair(DemandedBits, DemandedElts);
50696 };
50697 std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
50698 std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0, true);
50699
50700 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50701 if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
50702 TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
50703 TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
50704 TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
50705 if (N->getOpcode() != ISD::DELETED_NODE)
50706 DCI.AddToWorklist(N);
50707 return SDValue(N, 0);
50708 }
50709 }
50710
50711 return SDValue();
50712}
50713
50714static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
50715 TargetLowering::DAGCombinerInfo &DCI) {
50716 SDValue N1 = N->getOperand(1);
50717
50718 // BT ignores high bits in the bit index operand.
50719 unsigned BitWidth = N1.getValueSizeInBits();
50720 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
50721 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
50722 if (N->getOpcode() != ISD::DELETED_NODE)
50723 DCI.AddToWorklist(N);
50724 return SDValue(N, 0);
50725 }
50726
50727 return SDValue();
50728}
50729
50730static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
50731 TargetLowering::DAGCombinerInfo &DCI) {
50732 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
50733 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50734
50735 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
50736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50737 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
50738 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
50739 if (N->getOpcode() != ISD::DELETED_NODE)
50740 DCI.AddToWorklist(N);
50741 return SDValue(N, 0);
50742 }
50743
50744 // Convert a full vector load into vzload when not all bits are needed.
50745 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
50746 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
50747 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
50748 SDLoc dl(N);
50749 if (IsStrict) {
50750 SDValue Convert = DAG.getNode(
50751 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
50752 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
50753 DCI.CombineTo(N, Convert, Convert.getValue(1));
50754 } else {
50755 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
50756 DAG.getBitcast(MVT::v8i16, VZLoad));
50757 DCI.CombineTo(N, Convert);
50758 }
50759
50760 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50761 DCI.recursivelyDeleteUnusedNodes(LN);
50762 return SDValue(N, 0);
50763 }
50764 }
50765 }
50766
50767 return SDValue();
50768}
50769
50770// Try to combine sext_in_reg of a cmov of constants by extending the constants.
50771static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
50772 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50773
50774 EVT DstVT = N->getValueType(0);
50775
50776 SDValue N0 = N->getOperand(0);
50777 SDValue N1 = N->getOperand(1);
50778 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50779
50780 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
50781 return SDValue();
50782
50783 // Look through single use any_extends / truncs.
50784 SDValue IntermediateBitwidthOp;
50785 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
50786 N0.hasOneUse()) {
50787 IntermediateBitwidthOp = N0;
50788 N0 = N0.getOperand(0);
50789 }
50790
50791 // See if we have a single use cmov.
50792 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
50793 return SDValue();
50794
50795 SDValue CMovOp0 = N0.getOperand(0);
50796 SDValue CMovOp1 = N0.getOperand(1);
50797
50798 // Make sure both operands are constants.
50799 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50800 !isa<ConstantSDNode>(CMovOp1.getNode()))
50801 return SDValue();
50802
50803 SDLoc DL(N);
50804
50805 // If we looked through an any_extend/trunc above, apply the same op to the constants.
50806 if (IntermediateBitwidthOp) {
50807 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
50808 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
50809 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
50810 }
50811
50812 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
50813 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
50814
50815 EVT CMovVT = DstVT;
50816 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
50817 if (DstVT == MVT::i16) {
50818 CMovVT = MVT::i32;
50819 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
50820 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
50821 }
50822
50823 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
50824 N0.getOperand(2), N0.getOperand(3));
50825
50826 if (CMovVT != DstVT)
50827 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
50828
50829 return CMov;
50830}
50831
50832static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
50833 const X86Subtarget &Subtarget) {
50834 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50835
50836 if (SDValue V = combineSextInRegCmov(N, DAG))
50837 return V;
50838
50839 EVT VT = N->getValueType(0);
50840 SDValue N0 = N->getOperand(0);
50841 SDValue N1 = N->getOperand(1);
50842 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50843 SDLoc dl(N);
50844
50845 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
50846 // SSE and AVX2 since there is no sign-extended shift right
50847 // operation on a vector with 64-bit elements.
50848 // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
50849 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
50850 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
50851 N0.getOpcode() == ISD::SIGN_EXTEND)) {
50852 SDValue N00 = N0.getOperand(0);
50853
50854 // EXTLOAD has a better solution on AVX2,
50855 // it may be replaced with X86ISD::VSEXT node.
50856 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
50857 if (!ISD::isNormalLoad(N00.getNode()))
50858 return SDValue();
50859
50860 // Attempt to promote any comparison mask ops before moving the
50861 // SIGN_EXTEND_INREG in the way.
50862 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
50863 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
50864
50865 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
50866 SDValue Tmp =
50867 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
50868 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
50869 }
50870 }
50871 return SDValue();
50872}
50873
50874/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
50875/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
50876/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
50877/// opportunities to combine math ops, use an LEA, or use a complex addressing
50878/// mode. This can eliminate extend, add, and shift instructions.
50879static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
50880 const X86Subtarget &Subtarget) {
50881 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
50882 Ext->getOpcode() != ISD::ZERO_EXTEND)
50883 return SDValue();
50884
50885 // TODO: This should be valid for other integer types.
50886 EVT VT = Ext->getValueType(0);
50887 if (VT != MVT::i64)
50888 return SDValue();
50889
50890 SDValue Add = Ext->getOperand(0);
50891 if (Add.getOpcode() != ISD::ADD)
50892 return SDValue();
50893
50894 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
50895 bool NSW = Add->getFlags().hasNoSignedWrap();
50896 bool NUW = Add->getFlags().hasNoUnsignedWrap();
50897
50898 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
50899 // into the 'zext'
50900 if ((Sext && !NSW) || (!Sext && !NUW))
50901 return SDValue();
50902
50903 // Having a constant operand to the 'add' ensures that we are not increasing
50904 // the instruction count because the constant is extended for free below.
50905 // A constant operand can also become the displacement field of an LEA.
50906 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
50907 if (!AddOp1)
50908 return SDValue();
50909
50910 // Don't make the 'add' bigger if there's no hope of combining it with some
50911 // other 'add' or 'shl' instruction.
50912 // TODO: It may be profitable to generate simpler LEA instructions in place
50913 // of single 'add' instructions, but the cost model for selecting an LEA
50914 // currently has a high threshold.
50915 bool HasLEAPotential = false;
50916 for (auto *User : Ext->uses()) {
50917 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
50918 HasLEAPotential = true;
50919 break;
50920 }
50921 }
50922 if (!HasLEAPotential)
50923 return SDValue();
50924
50925 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
50926 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
50927 SDValue AddOp0 = Add.getOperand(0);
50928 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
50929 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
50930
50931 // The wider add is guaranteed to not wrap because both operands are
50932 // sign-extended.
50933 SDNodeFlags Flags;
50934 Flags.setNoSignedWrap(NSW);
50935 Flags.setNoUnsignedWrap(NUW);
50936 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
50937}
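
promoteExtBeforeAdd above relies on sext(add_nsw(x, C)) == add(sext(x), C_sext) and the nuw/zext analogue. A minimal standalone check of that identity, assuming i32 -> i64 widening; the values are illustrative only and are chosen so the narrow add satisfies the nsw/nuw precondition.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t C = 40;
  for (int32_t X : {-1000, -1, 0, 7, 123456}) {
    int64_t WideSum = (int64_t)X + (int64_t)C;      // add(sext(x), C_sext)
    // 'nsw' precondition: the narrow add does not overflow i32.
    assert(WideSum >= INT32_MIN && WideSum <= INT32_MAX);
    int32_t NarrowSum = X + C;                      // add_nsw(x, C)
    assert((int64_t)NarrowSum == WideSum);          // sext(add_nsw(x, C))

    // The zext/nuw variant, guarded so the unsigned add does not wrap in i32.
    uint32_t UX = (uint32_t)X, UC = 40;
    if ((uint64_t)UX + UC <= UINT32_MAX)
      assert((uint64_t)(UX + UC) == (uint64_t)UX + (uint64_t)UC);
  }
  return 0;
}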
50938
50939// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
50940// operands and the result of CMOV is not used anywhere else - promote CMOV
50941// itself instead of promoting its result. This could be beneficial, because:
50942// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
50943// (or more) pseudo-CMOVs only when they go one-after-another and
50944// getting rid of result extension code after CMOV will help that.
50945// 2) Promotion of constant CMOV arguments is free, hence the
50946// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
50947 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
50948 // promotion is also good in terms of code size.
50949 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
50950 // promotion).
50951static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
50952 SDValue CMovN = Extend->getOperand(0);
50953 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
50954 return SDValue();
50955
50956 EVT TargetVT = Extend->getValueType(0);
50957 unsigned ExtendOpcode = Extend->getOpcode();
50958 SDLoc DL(Extend);
50959
50960 EVT VT = CMovN.getValueType();
50961 SDValue CMovOp0 = CMovN.getOperand(0);
50962 SDValue CMovOp1 = CMovN.getOperand(1);
50963
50964 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50965 !isa<ConstantSDNode>(CMovOp1.getNode()))
50966 return SDValue();
50967
50968 // Only extend to i32 or i64.
50969 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
50970 return SDValue();
50971
50972 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
50973 // are free.
50974 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
50975 return SDValue();
50976
50977 // If this is a zero extend to i64, we should only extend to i32 and use a free
50978 // zero extend to finish.
50979 EVT ExtendVT = TargetVT;
50980 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
50981 ExtendVT = MVT::i32;
50982
50983 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
50984 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
50985
50986 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
50987 CMovN.getOperand(2), CMovN.getOperand(3));
50988
50989 // Finish extending if needed.
50990 if (ExtendVT != TargetVT)
50991 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
50992
50993 return Res;
50994}
50995
50996// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
50997// result type.
50998static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
50999 const X86Subtarget &Subtarget) {
51000 SDValue N0 = N->getOperand(0);
51001 EVT VT = N->getValueType(0);
51002 SDLoc dl(N);
51003
51004 // Only do this combine with AVX512 for vector extends.
51005 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
51006 return SDValue();
51007
51008 // Only combine legal element types.
51009 EVT SVT = VT.getVectorElementType();
51010 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
51011 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
51012 return SDValue();
51013
51014 // We don't have a CMPP instruction for vXf16.
51015 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
51016 return SDValue();
51017 // We can only do this if the vector size is 256 bits or less.
51018 unsigned Size = VT.getSizeInBits();
51019 if (Size > 256 && Subtarget.useAVX512Regs())
51020 return SDValue();
51021
51022 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
51023 // those are the only integer compares we have.
51024 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51025 if (ISD::isUnsignedIntSetCC(CC))
51026 return SDValue();
51027
51028 // Only do this combine if the extension will be fully consumed by the setcc.
51029 EVT N00VT = N0.getOperand(0).getValueType();
51030 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
51031 if (Size != MatchingVecType.getSizeInBits())
51032 return SDValue();
51033
51034 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
51035
51036 if (N->getOpcode() == ISD::ZERO_EXTEND)
51037 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
51038
51039 return Res;
51040}
51041
51042static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
51043 TargetLowering::DAGCombinerInfo &DCI,
51044 const X86Subtarget &Subtarget) {
51045 SDValue N0 = N->getOperand(0);
51046 EVT VT = N->getValueType(0);
51047 SDLoc DL(N);
51048
51049 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
51050 if (!DCI.isBeforeLegalizeOps() &&
51051 N0.getOpcode() == X86ISD::SETCC_CARRY) {
51052 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
51053 N0->getOperand(1));
51054 bool ReplaceOtherUses = !N0.hasOneUse();
51055 DCI.CombineTo(N, Setcc);
51056 // Replace other uses with a truncate of the widened setcc_carry.
51057 if (ReplaceOtherUses) {
51058 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
51059 N0.getValueType(), Setcc);
51060 DCI.CombineTo(N0.getNode(), Trunc);
51061 }
51062
51063 return SDValue(N, 0);
51064 }
51065
51066 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
51067 return NewCMov;
51068
51069 if (!DCI.isBeforeLegalizeOps())
51070 return SDValue();
51071
51072 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
51073 return V;
51074
51075 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
51076 DAG, DCI, Subtarget))
51077 return V;
51078
51079 if (VT.isVector()) {
51080 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
51081 return R;
51082
51083 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
51084 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
51085 }
51086
51087 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
51088 return NewAdd;
51089
51090 return SDValue();
51091}
51092
51093static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
51094 TargetLowering::DAGCombinerInfo &DCI,
51095 const X86Subtarget &Subtarget) {
51096 SDLoc dl(N);
51097 EVT VT = N->getValueType(0);
51098 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
51099
51100 // Let legalize expand this if it isn't a legal type yet.
51101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51102 if (!TLI.isTypeLegal(VT))
51103 return SDValue();
51104
51105 SDValue A = N->getOperand(IsStrict ? 1 : 0);
51106 SDValue B = N->getOperand(IsStrict ? 2 : 1);
51107 SDValue C = N->getOperand(IsStrict ? 3 : 2);
51108
51109 // If the operation allows fast-math and the target does not support FMA,
51110 // split this into mul+add to avoid libcall(s).
51111 SDNodeFlags Flags = N->getFlags();
51112 if (!IsStrict && Flags.hasAllowReassociation() &&
51113 TLI.isOperationExpand(ISD::FMA, VT)) {
51114 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
51115 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
51116 }
51117
51118 EVT ScalarVT = VT.getScalarType();
51119 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
51120 !Subtarget.hasAnyFMA()) &&
51121 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
51122 return SDValue();
51123
51124 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
51125 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51126 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51127 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
51128 CodeSize)) {
51129 V = NegV;
51130 return true;
51131 }
51132 // Look through extract_vector_elts. If it comes from an FNEG, create a
51133 // new extract from the FNEG input.
51134 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51135 isNullConstant(V.getOperand(1))) {
51136 SDValue Vec = V.getOperand(0);
51137 if (SDValue NegV = TLI.getCheaperNegatedExpression(
51138 Vec, DAG, LegalOperations, CodeSize)) {
51139 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
51140 NegV, V.getOperand(1));
51141 return true;
51142 }
51143 }
51144
51145 return false;
51146 };
51147
51148 // Do not convert the passthru input of scalar intrinsics.
51149 // FIXME: We could allow negations of the lower element only.
51150 bool NegA = invertIfNegative(A);
51151 bool NegB = invertIfNegative(B);
51152 bool NegC = invertIfNegative(C);
51153
51154 if (!NegA && !NegB && !NegC)
51155 return SDValue();
51156
51157 unsigned NewOpcode =
51158 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
51159
51160 // Propagate fast-math-flags to new FMA node.
51161 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
51162 if (IsStrict) {
51163 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
51164 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
51165 {N->getOperand(0), A, B, C});
51166 } else {
51167 if (N->getNumOperands() == 4)
51168 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
51169 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
51170 }
51171}
51172
51173// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
51174// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
51175static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
51176 TargetLowering::DAGCombinerInfo &DCI) {
51177 SDLoc dl(N);
51178 EVT VT = N->getValueType(0);
51179 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51180 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51181 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51182
51183 SDValue N2 = N->getOperand(2);
51184
51185 SDValue NegN2 =
51186 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
51187 if (!NegN2)
51188 return SDValue();
51189 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
51190
51191 if (N->getNumOperands() == 4)
51192 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
51193 NegN2, N->getOperand(3));
51194 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
51195 NegN2);
51196}
51197
51198static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
51199 TargetLowering::DAGCombinerInfo &DCI,
51200 const X86Subtarget &Subtarget) {
51201 SDLoc dl(N);
51202 SDValue N0 = N->getOperand(0);
51203 EVT VT = N->getValueType(0);
51204
51205 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
51206 // FIXME: Is this needed? We don't seem to have any tests for it.
51207 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
51208 N0.getOpcode() == X86ISD::SETCC_CARRY) {
51209 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
51210 N0->getOperand(1));
51211 bool ReplaceOtherUses = !N0.hasOneUse();
51212 DCI.CombineTo(N, Setcc);
51213 // Replace other uses with a truncate of the widened setcc_carry.
51214 if (ReplaceOtherUses) {
51215 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
51216 N0.getValueType(), Setcc);
51217 DCI.CombineTo(N0.getNode(), Trunc);
51218 }
51219
51220 return SDValue(N, 0);
51221 }
51222
51223 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
51224 return NewCMov;
51225
51226 if (DCI.isBeforeLegalizeOps())
51227 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
51228 return V;
51229
51230 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
51231 DAG, DCI, Subtarget))
51232 return V;
51233
51234 if (VT.isVector())
51235 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
51236 return R;
51237
51238 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
51239 return NewAdd;
51240
51241 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
51242 return R;
51243
51244 // TODO: Combine with any target/faux shuffle.
51245 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
51246 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
51247 SDValue N00 = N0.getOperand(0);
51248 SDValue N01 = N0.getOperand(1);
51249 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
51250 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
51251 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
51252 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
51253 return concatSubVectors(N00, N01, DAG, dl);
51254 }
51255 }
51256
51257 return SDValue();
51258}
51259
51260/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
51261/// recognizable memcmp expansion.
51262static bool isOrXorXorTree(SDValue X, bool Root = true) {
51263 if (X.getOpcode() == ISD::OR)
51264 return isOrXorXorTree(X.getOperand(0), false) &&
51265 isOrXorXorTree(X.getOperand(1), false);
51266 if (Root)
51267 return false;
51268 return X.getOpcode() == ISD::XOR;
51269}
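
isOrXorXorTree above recognizes the memcmp expansion shape because an OR of XORs is zero exactly when every pair of chunks is equal. A small standalone check of that identity; the values are arbitrary and only illustrative.

#include <cassert>
#include <cstdint>

int main() {
  // or(xor(A,B), xor(C,D)) == 0  <=>  A == B && C == D
  struct Case { uint64_t A, B, C, D; } Cases[] = {
      {1, 1, 2, 2},             // all pairs equal   -> tree is 0
      {1, 1, 2, 3},             // one pair differs  -> tree is nonzero
      {0xdead, 0xbeef, 5, 5},
  };
  for (const Case &T : Cases) {
    uint64_t Tree = (T.A ^ T.B) | (T.C ^ T.D);
    bool AllEqual = (T.A == T.B) && (T.C == T.D);
    assert((Tree == 0) == AllEqual);
  }
  return 0;
}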
51270
51271/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
51272/// expansion.
51273template<typename F>
51274static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
51275 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
51276 SDValue Op0 = X.getOperand(0);
51277 SDValue Op1 = X.getOperand(1);
51278 if (X.getOpcode() == ISD::OR) {
51279 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
51280 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
51281 if (VecVT != CmpVT)
51282 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
51283 if (HasPT)
51284 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
51285 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
51286 } else if (X.getOpcode() == ISD::XOR) {
51287 SDValue A = SToV(Op0);
51288 SDValue B = SToV(Op1);
51289 if (VecVT != CmpVT)
51290 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
51291 if (HasPT)
51292 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
51293 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
51294 }
51295 llvm_unreachable("Impossible");
51296}
51297
51298/// Try to map a 128-bit or larger integer comparison to vector instructions
51299/// before type legalization splits it up into chunks.
51300static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
51301 const X86Subtarget &Subtarget) {
51302 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
51303 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
51304
51305 // We're looking for an oversized integer equality comparison.
51306 SDValue X = SetCC->getOperand(0);
51307 SDValue Y = SetCC->getOperand(1);
51308 EVT OpVT = X.getValueType();
51309 unsigned OpSize = OpVT.getSizeInBits();
51310 if (!OpVT.isScalarInteger() || OpSize < 128)
51311 return SDValue();
51312
51313 // Ignore a comparison with zero because that gets special treatment in
51314 // EmitTest(). But make an exception for the special case of a pair of
51315 // logically-combined vector-sized operands compared to zero. This pattern may
51316 // be generated by the memcmp expansion pass with oversized integer compares
51317 // (see PR33325).
51318 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
51319 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
51320 return SDValue();
51321
51322 // Don't perform this combine if constructing the vector will be expensive.
51323 auto IsVectorBitCastCheap = [](SDValue X) {
51324 X = peekThroughBitcasts(X);
51325 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
51326 X.getOpcode() == ISD::LOAD;
51327 };
51328 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
51329 !IsOrXorXorTreeCCZero)
51330 return SDValue();
51331
51332 EVT VT = SetCC->getValueType(0);
51333 SDLoc DL(SetCC);
51334
51335 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
51336 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
51337 // Otherwise use PCMPEQ (plus AND) and mask testing.
51338 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
51339 (OpSize == 256 && Subtarget.hasAVX()) ||
51340 (OpSize == 512 && Subtarget.useAVX512Regs())) {
51341 bool HasPT = Subtarget.hasSSE41();
51342
51343 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
51344 // vector registers are essentially free. (Technically, widening registers
51345 // prevents load folding, but the tradeoff is worth it.)
51346 bool PreferKOT = Subtarget.preferMaskRegisters();
51347 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
51348
51349 EVT VecVT = MVT::v16i8;
51350 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
51351 if (OpSize == 256) {
51352 VecVT = MVT::v32i8;
51353 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
51354 }
51355 EVT CastVT = VecVT;
51356 bool NeedsAVX512FCast = false;
51357 if (OpSize == 512 || NeedZExt) {
51358 if (Subtarget.hasBWI()) {
51359 VecVT = MVT::v64i8;
51360 CmpVT = MVT::v64i1;
51361 if (OpSize == 512)
51362 CastVT = VecVT;
51363 } else {
51364 VecVT = MVT::v16i32;
51365 CmpVT = MVT::v16i1;
51366 CastVT = OpSize == 512 ? VecVT :
51367 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
51368 NeedsAVX512FCast = true;
51369 }
51370 }
51371
51372 auto ScalarToVector = [&](SDValue X) -> SDValue {
51373 bool TmpZext = false;
51374 EVT TmpCastVT = CastVT;
51375 if (X.getOpcode() == ISD::ZERO_EXTEND) {
51376 SDValue OrigX = X.getOperand(0);
51377 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
51378 if (OrigSize < OpSize) {
51379 if (OrigSize == 128) {
51380 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
51381 X = OrigX;
51382 TmpZext = true;
51383 } else if (OrigSize == 256) {
51384 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
51385 X = OrigX;
51386 TmpZext = true;
51387 }
51388 }
51389 }
51390 X = DAG.getBitcast(TmpCastVT, X);
51391 if (!NeedZExt && !TmpZext)
51392 return X;
51393 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
51394 DAG.getConstant(0, DL, VecVT), X,
51395 DAG.getVectorIdxConstant(0, DL));
51396 };
51397
51398 SDValue Cmp;
51399 if (IsOrXorXorTreeCCZero) {
51400 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
51401 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
51402 // Use 2 vector equality compares and 'and' the results before doing a
51403 // MOVMSK.
51404 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
51405 } else {
51406 SDValue VecX = ScalarToVector(X);
51407 SDValue VecY = ScalarToVector(Y);
51408 if (VecVT != CmpVT) {
51409 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
51410 } else if (HasPT) {
51411 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
51412 } else {
51413 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
51414 }
51415 }
51416 // AVX512 should emit a setcc that will lower to kortest.
51417 if (VecVT != CmpVT) {
51418 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
51419 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
51420 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
51421 DAG.getConstant(0, DL, KRegVT), CC);
51422 }
51423 if (HasPT) {
51424 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
51425 Cmp);
51426 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
51427 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
51428 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
51429 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
51430 }
51431 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
51432 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
51433 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
51434 assert(Cmp.getValueType() == MVT::v16i8 &&
51435        "Non 128-bit vector on pre-SSE41 target");
51436 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
51437 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
51438 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
51439 }
51440
51441 return SDValue();
51442}
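
The pre-SSE4.1 tail of combineVectorSizedSetCCEquality lowers a 128-bit equality to PCMPEQB followed by PMOVMSKB against 0xFFFF. A standalone intrinsics sketch of that lowering for comparison against memcmp; it assumes an SSE2-capable x86 host and the function name equal16 is invented for this example.

#include <cassert>
#include <cstring>
#include <emmintrin.h> // SSE2

// Compare two 16-byte blocks the way the pre-SSE4.1 lowering does:
// PCMPEQB yields 0xFF per matching byte, PMOVMSKB gathers the byte sign
// bits, and every byte matches exactly when the mask is 0xFFFF.
static bool equal16(const void *X, const void *Y) {
  __m128i VX = _mm_loadu_si128((const __m128i *)X);
  __m128i VY = _mm_loadu_si128((const __m128i *)Y);
  __m128i Eq = _mm_cmpeq_epi8(VX, VY);
  return _mm_movemask_epi8(Eq) == 0xFFFF;
}

int main() {
  char A[16], B[16];
  memset(A, 0x5a, sizeof(A));
  memcpy(B, A, sizeof(B));
  assert(equal16(A, B) && memcmp(A, B, 16) == 0);
  B[9] ^= 1; // flip one bit
  assert(!equal16(A, B) && memcmp(A, B, 16) != 0);
  return 0;
}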
51443
51444static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
51445 TargetLowering::DAGCombinerInfo &DCI,
51446 const X86Subtarget &Subtarget) {
51447 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
51448 const SDValue LHS = N->getOperand(0);
51449 const SDValue RHS = N->getOperand(1);
51450 EVT VT = N->getValueType(0);
51451 EVT OpVT = LHS.getValueType();
51452 SDLoc DL(N);
51453
51454 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
51455 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
51456 return V;
51457
51458 if (VT == MVT::i1 && isNullConstant(RHS)) {
51459 SDValue X86CC;
51460 if (SDValue V =
51461 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
51462 return DAG.getNode(ISD::TRUNCATE, DL, VT,
51463 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
51464 }
51465
51466 if (OpVT.isScalarInteger()) {
51467 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
51468 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
51469 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
51470 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
51471 if (N0.getOperand(0) == N1)
51472 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51473 N0.getOperand(1));
51474 if (N0.getOperand(1) == N1)
51475 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51476 N0.getOperand(0));
51477 }
51478 return SDValue();
51479 };
51480 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
51481 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51482 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
51483 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51484
51485 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
51486 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
51487 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
51488 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
51489 if (N0.getOperand(0) == N1)
51490 return DAG.getNode(ISD::AND, DL, OpVT, N1,
51491 DAG.getNOT(DL, N0.getOperand(1), OpVT));
51492 if (N0.getOperand(1) == N1)
51493 return DAG.getNode(ISD::AND, DL, OpVT, N1,
51494 DAG.getNOT(DL, N0.getOperand(0), OpVT));
51495 }
51496 return SDValue();
51497 };
51498 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
51499 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51500 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
51501 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51502
51503 // cmpeq(trunc(x),0) --> cmpeq(x,0)
51504 // cmpne(trunc(x),0) --> cmpne(x,0)
51505 // iff x upper bits are zero.
51506 // TODO: Add support for RHS to be truncate as well?
51507 if (LHS.getOpcode() == ISD::TRUNCATE &&
51508 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
51509 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
51510 EVT SrcVT = LHS.getOperand(0).getValueType();
51511 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
51512 OpVT.getScalarSizeInBits());
51513 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51514 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
51515 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
51516 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
51517 DAG.getConstant(0, DL, SrcVT), CC);
51518 }
51519 }
51520 }
51521
51522 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
51523 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
51524 // Using temporaries to avoid messing up operand ordering for later
51525 // transformations if this doesn't work.
51526 SDValue Op0 = LHS;
51527 SDValue Op1 = RHS;
51528 ISD::CondCode TmpCC = CC;
51529 // Put build_vector on the right.
51530 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
51531 std::swap(Op0, Op1);
51532 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
51533 }
51534
51535 bool IsSEXT0 =
51536 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
51537 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
51538 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
51539
51540 if (IsSEXT0 && IsVZero1) {
51541      assert(VT == Op0.getOperand(0).getValueType() &&
51542             "Unexpected operand type");
51543 if (TmpCC == ISD::SETGT)
51544 return DAG.getConstant(0, DL, VT);
51545 if (TmpCC == ISD::SETLE)
51546 return DAG.getConstant(1, DL, VT);
51547 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
51548 return DAG.getNOT(DL, Op0.getOperand(0), VT);
51549
51550      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
51551             "Unexpected condition code!");
51552 return Op0.getOperand(0);
51553 }
51554 }
51555
51556 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
51557 // pre-promote its result type since vXi1 vectors don't get promoted
51558 // during type legalization.
51559 // NOTE: The element count check is to ignore operand types that need to
51560 // go through type promotion to a 128-bit vector.
51561 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
51562 VT.getVectorElementType() == MVT::i1 &&
51563 (OpVT.getVectorElementType() == MVT::i8 ||
51564 OpVT.getVectorElementType() == MVT::i16)) {
51565 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
51566 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
51567 }
51568
51569 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
51570 // to avoid scalarization via legalization because v4i32 is not a legal type.
51571 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
51572 LHS.getValueType() == MVT::v4f32)
51573 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
51574
51575 // X pred 0.0 --> X pred -X
51576 // If the negation of X already exists, use it in the comparison. This removes
51577 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
51578 // instructions in patterns with a 'select' node.
51579 if (isNullFPScalarOrVectorConst(RHS)) {
51580 SDVTList FNegVT = DAG.getVTList(OpVT);
51581 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
51582 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
51583 }
51584
51585 return SDValue();
51586}
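
The scalar-integer rewrites inside combineSetCC (cmpeq(or(X,Y),X) and cmpeq(and(X,Y),Y) both becoming cmpeq(and(~X,Y),0)) follow from the fact that either comparison asks whether Y's set bits are contained in X. A short standalone check of both identities over a handful of illustrative values.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Vals[] = {0, 1, 0xff, 0xf0f0, 0x8000000000000000ull, ~0ull};
  for (uint64_t X : Vals)
    for (uint64_t Y : Vals) {
      // (X | Y) == X  <=>  Y has no bits outside X  <=>  (~X & Y) == 0
      assert(((X | Y) == X) == ((~X & Y) == 0));
      // (X & Y) == Y reduces to the same containment test.
      assert(((X & Y) == Y) == ((~X & Y) == 0));
    }
  return 0;
}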
51587
51588static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
51589 TargetLowering::DAGCombinerInfo &DCI,
51590 const X86Subtarget &Subtarget) {
51591 SDValue Src = N->getOperand(0);
51592 MVT SrcVT = Src.getSimpleValueType();
51593 MVT VT = N->getSimpleValueType(0);
51594 unsigned NumBits = VT.getScalarSizeInBits();
51595 unsigned NumElts = SrcVT.getVectorNumElements();
51596
51597 // Perform constant folding.
51598 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
51599 assert(VT == MVT::i32 && "Unexpected result type");
51600 APInt Imm(32, 0);
51601 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
51602 if (!Src.getOperand(Idx).isUndef() &&
51603 Src.getConstantOperandAPInt(Idx).isNegative())
51604 Imm.setBit(Idx);
51605 }
51606 return DAG.getConstant(Imm, SDLoc(N), VT);
51607 }
51608
51609 // Look through int->fp bitcasts that don't change the element width.
51610 unsigned EltWidth = SrcVT.getScalarSizeInBits();
51611 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
51612 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
51613 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
51614
51615 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
51616 // with scalar comparisons.
51617 if (SDValue NotSrc = IsNOT(Src, DAG)) {
51618 SDLoc DL(N);
51619 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51620 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
51621 return DAG.getNode(ISD::XOR, DL, VT,
51622 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
51623 DAG.getConstant(NotMask, DL, VT));
51624 }
51625
51626 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
51627 // results with scalar comparisons.
51628 if (Src.getOpcode() == X86ISD::PCMPGT &&
51629 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
51630 SDLoc DL(N);
51631 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51632 return DAG.getNode(ISD::XOR, DL, VT,
51633 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
51634 DAG.getConstant(NotMask, DL, VT));
51635 }
51636
51637 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
51638 // iff pow2splat(c1).
51639 if (Src.getOpcode() == X86ISD::PCMPEQ &&
51640 Src.getOperand(0).getOpcode() == ISD::AND &&
51641 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
51642 SDValue LHS = Src.getOperand(0).getOperand(0);
51643 SDValue RHS = Src.getOperand(0).getOperand(1);
51644 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
51645 if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
51646 SDLoc DL(N);
51647 MVT ShiftVT = SrcVT;
51648 if (ShiftVT.getScalarType() == MVT::i8) {
51649 // vXi8 shifts - we only care about the signbit so can use PSLLW.
51650 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
51651 LHS = DAG.getBitcast(ShiftVT, LHS);
51652 }
51653 unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros();
51654 LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
51655 ShiftAmt, DAG);
51656 LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
51657 return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
51658 }
51659 }
51660
51661 // Simplify the inputs.
51662 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51663 APInt DemandedMask(APInt::getAllOnes(NumBits));
51664 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51665 return SDValue(N, 0);
51666
51667 return SDValue();
51668}
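
The last fold in combineMOVMSK works because MOVMSK only reads the per-element sign bit: for a power-of-two mask 1<<K, the test (x & (1<<K)) == 0 is answered by the sign bit of ~(x << (W-1-K)), and W-1-K is exactly the countLeadingZeros shift amount used above. A standalone exhaustive check at an 8-bit element width; the variable names are illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned W = 8; // element width used for this sketch
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned K = 0; K < W; ++K) {
      bool CmpEqZero = (X & (1u << K)) == 0;               // pcmpeq(and(x,c),0) lane
      uint8_t Shifted = (uint8_t)(X << (W - 1 - K));       // shl by countLeadingZeros(c)
      bool SignOfNot = ((uint8_t)~Shifted >> (W - 1)) & 1; // sign bit of not(shl(x,c2))
      assert(CmpEqZero == SignOfNot);
    }
  return 0;
}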
51669
51670static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
51671 TargetLowering::DAGCombinerInfo &DCI,
51672 const X86Subtarget &Subtarget) {
51673 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
51674 SDValue BasePtr = MemOp->getBasePtr();
51675 SDValue Index = MemOp->getIndex();
51676 SDValue Scale = MemOp->getScale();
51677 SDValue Mask = MemOp->getMask();
51678
51679 // Attempt to fold an index scale into the scale value directly.
51680 // For smaller indices, implicit sext is performed BEFORE scale, preventing
51681 // this fold under most circumstances.
51682 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
51683 if ((Index.getOpcode() == X86ISD::VSHLI ||
51684 (Index.getOpcode() == ISD::ADD &&
51685 Index.getOperand(0) == Index.getOperand(1))) &&
51686 isa<ConstantSDNode>(Scale) &&
51687 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
51688 unsigned ShiftAmt =
51689 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
51690 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51691 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
51692 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
51693 SDValue NewIndex = Index.getOperand(0);
51694 SDValue NewScale =
51695 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
51696 if (N->getOpcode() == X86ISD::MGATHER)
51697 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
51698 MemOp->getOperand(1), Mask,
51699 MemOp->getBasePtr(), NewIndex, NewScale,
51700 MemOp->getChain(), Subtarget);
51701 if (N->getOpcode() == X86ISD::MSCATTER)
51702 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
51703 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
51704 NewIndex, NewScale, MemOp->getChain(), Subtarget);
51705 }
51706 }
51707
51708 // With vector masks we only demand the upper bit of the mask.
51709 if (Mask.getScalarValueSizeInBits() != 1) {
51710 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51711 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51712 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51713 if (N->getOpcode() != ISD::DELETED_NODE)
51714 DCI.AddToWorklist(N);
51715 return SDValue(N, 0);
51716 }
51717 }
51718
51719 return SDValue();
51720}
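
The scale fold in combineX86GatherScatter is ordinary address arithmetic: once the index is pointer-width (so no implicit sign extension sits between the shift and the scale), base + (index << S) * Scale addresses the same byte as base + index * (Scale << S). A minimal standalone check with illustrative values, keeping the folded scale within the 1/2/4/8 range x86 addressing allows.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Base = 0x1000;
  for (uint64_t Index : {0ull, 1ull, 7ull, 123ull})
    for (unsigned S = 0; S <= 2; ++S)
      for (uint64_t Scale : {1ull, 2ull}) {
        uint64_t NewScale = Scale << S;
        if (NewScale > 8) continue; // x86 only encodes scale 1, 2, 4, 8
        assert(Base + (Index << S) * Scale == Base + Index * NewScale);
      }
  return 0;
}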
51721
51722static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
51723 SDValue Index, SDValue Base, SDValue Scale,
51724 SelectionDAG &DAG) {
51725 SDLoc DL(GorS);
51726
51727 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
51728 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
51729 Gather->getMask(), Base, Index, Scale } ;
51730 return DAG.getMaskedGather(Gather->getVTList(),
51731 Gather->getMemoryVT(), DL, Ops,
51732 Gather->getMemOperand(),
51733 Gather->getIndexType(),
51734 Gather->getExtensionType());
51735 }
51736 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
51737 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
51738 Scatter->getMask(), Base, Index, Scale };
51739 return DAG.getMaskedScatter(Scatter->getVTList(),
51740 Scatter->getMemoryVT(), DL,
51741 Ops, Scatter->getMemOperand(),
51742 Scatter->getIndexType(),
51743 Scatter->isTruncatingStore());
51744}
51745
51746static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
51747 TargetLowering::DAGCombinerInfo &DCI) {
51748 SDLoc DL(N);
51749 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
51750 SDValue Index = GorS->getIndex();
51751 SDValue Base = GorS->getBasePtr();
51752 SDValue Scale = GorS->getScale();
51753
51754 if (DCI.isBeforeLegalize()) {
51755 unsigned IndexWidth = Index.getScalarValueSizeInBits();
51756
51757 // Shrink constant indices if they are larger than 32-bits.
51758 // Only do this before legalize types since v2i64 could become v2i32.
51759 // FIXME: We could check that the type is legal if we're after legalize
51760 // types, but then we would need to construct test cases where that happens.
51761 // FIXME: We could support more than just constant vectors, but we need to be
51762 // careful with costing. A truncate that can be optimized out would be fine.
51763 // Otherwise we might only want to create a truncate if it avoids a split.
51764 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
51765 if (BV->isConstant() && IndexWidth > 32 &&
51766 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51767 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51768 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51769 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51770 }
51771 }
51772
51773 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
51774 // there are sufficient sign bits. Only do this before legalize types to
51775 // avoid creating illegal types in truncate.
51776 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
51777 Index.getOpcode() == ISD::ZERO_EXTEND) &&
51778 IndexWidth > 32 &&
51779 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
51780 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51781 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51782 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51783 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51784 }
51785 }
51786
51787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51788 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51789 // Try to move splat constant adders from the index operand to the base
51790 // pointer operand, taking care to multiply by the scale. We can only do
51791 // this when the index element type is the same as the pointer type.
51792 // Otherwise we need to be sure the math doesn't wrap before the scale.
51793 if (Index.getOpcode() == ISD::ADD &&
51794 Index.getValueType().getVectorElementType() == PtrVT &&
51795 isa<ConstantSDNode>(Scale)) {
51796 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51797 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
51798 BitVector UndefElts;
51799 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
51800 // FIXME: Allow non-constant?
51801 if (UndefElts.none()) {
51802 // Apply the scale.
51803 APInt Adder = C->getAPIntValue() * ScaleAmt;
51804 // Add it to the existing base.
51805 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
51806 DAG.getConstant(Adder, DL, PtrVT));
51807 Index = Index.getOperand(0);
51808 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51809 }
51810 }
51811
51812 // It's also possible base is just a constant. In that case, just
51813 // replace it with 0 and move the displacement into the index.
51814 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
51815 isOneConstant(Scale)) {
51816 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
51817 // Combine the constant build_vector and the constant base.
51818 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51819 Index.getOperand(1), Splat);
51820 // Add to the LHS of the original Index add.
51821 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51822 Index.getOperand(0), Splat);
51823 Base = DAG.getConstant(0, DL, Base.getValueType());
51824 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51825 }
51826 }
51827 }
51828
51829 if (DCI.isBeforeLegalizeOps()) {
51830 unsigned IndexWidth = Index.getScalarValueSizeInBits();
51831
51832 // Make sure the index is either i32 or i64
51833 if (IndexWidth != 32 && IndexWidth != 64) {
51834 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
51835 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
51836 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
51837 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51838 }
51839 }
51840
51841 // With vector masks we only demand the upper bit of the mask.
51842 SDValue Mask = GorS->getMask();
51843 if (Mask.getScalarValueSizeInBits() != 1) {
51844 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51845 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51846 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51847 if (N->getOpcode() != ISD::DELETED_NODE)
51848 DCI.AddToWorklist(N);
51849 return SDValue(N, 0);
51850 }
51851 }
51852
51853 return SDValue();
51854}
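
The splat-adder rewrite in combineGatherScatter is again just the distributive law over pointer-width arithmetic: base + (index + C) * scale equals (base + C * scale) + index * scale for every lane. A short standalone check with made-up values.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Base = 0x4000, C = 16; // C is the splat constant adder
  for (uint64_t Scale : {1ull, 2ull, 4ull, 8ull})
    for (uint64_t Index : {0ull, 3ull, 100ull}) {
      uint64_t Original = Base + (Index + C) * Scale;
      uint64_t Rewritten = (Base + C * Scale) + Index * Scale;
      assert(Original == Rewritten);
    }
  return 0;
}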
51855
51856// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
51857static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
51858 const X86Subtarget &Subtarget) {
51859 SDLoc DL(N);
51860 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
51861 SDValue EFLAGS = N->getOperand(1);
51862
51863 // Try to simplify the EFLAGS and condition code operands.
51864 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
51865 return getSETCC(CC, Flags, DL, DAG);
51866
51867 return SDValue();
51868}
51869
51870/// Optimize branch condition evaluation.
51871static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
51872 const X86Subtarget &Subtarget) {
51873 SDLoc DL(N);
51874 SDValue EFLAGS = N->getOperand(3);
51875 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
51876
51877 // Try to simplify the EFLAGS and condition code operands.
51878 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
51879 // RAUW them under us.
51880 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
51881 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
51882 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
51883 N->getOperand(1), Cond, Flags);
51884 }
51885
51886 return SDValue();
51887}
51888
51889// TODO: Could we move this to DAGCombine?
51890static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
51891 SelectionDAG &DAG) {
51892 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
51893 // to optimize away operation when it's from a constant.
51894 //
51895 // The general transformation is:
51896 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
51897 // AND(VECTOR_CMP(x,y), constant2)
51898 // constant2 = UNARYOP(constant)
51899
51900 // Early exit if this isn't a vector operation, the operand of the
51901 // unary operation isn't a bitwise AND, or if the sizes of the operations
51902 // aren't the same.
51903 EVT VT = N->getValueType(0);
51904 bool IsStrict = N->isStrictFPOpcode();
51905 unsigned NumEltBits = VT.getScalarSizeInBits();
51906 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51907 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
51908 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
51909 VT.getSizeInBits() != Op0.getValueSizeInBits())
51910 return SDValue();
51911
51912 // Now check that the other operand of the AND is a constant. We could
51913 // make the transformation for non-constant splats as well, but it's unclear
51914 // that would be a benefit as it would not eliminate any operations, just
51915 // perform one more step in scalar code before moving to the vector unit.
51916 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
51917 // Bail out if the vector isn't a constant.
51918 if (!BV->isConstant())
51919 return SDValue();
51920
51921 // Everything checks out. Build up the new and improved node.
51922 SDLoc DL(N);
51923 EVT IntVT = BV->getValueType(0);
51924 // Create a new constant of the appropriate type for the transformed
51925 // DAG.
51926 SDValue SourceConst;
51927 if (IsStrict)
51928 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
51929 {N->getOperand(0), SDValue(BV, 0)});
51930 else
51931 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
51932 // The AND node needs bitcasts to/from an integer vector type around it.
51933 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
51934 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
51935 MaskConst);
51936 SDValue Res = DAG.getBitcast(VT, NewAnd);
51937 if (IsStrict)
51938 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
51939 return Res;
51940 }
51941
51942 return SDValue();
51943}
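
combineVectorCompareAndMaskUnaryOp above hinges on each compare lane being all-zeros or all-ones: the unary op (here int-to-fp) applied to and(mask, C) produces the same bit pattern as and(mask, bitcast(unaryop(C))), because the all-zero pattern is +0.0. A single-lane standalone sketch; bitsOfDouble is an invented helper and the constant is arbitrary.

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret a double's bit pattern as a 64-bit integer.
static uint64_t bitsOfDouble(double D) {
  uint64_t Bits;
  memcpy(&Bits, &D, sizeof(Bits));
  return Bits;
}

int main() {
  const int64_t C = 12345;
  for (uint64_t Mask : {0ull, ~0ull}) { // the only values a compare lane takes
    // sint_to_fp(and(mask, C)) ...
    uint64_t Lhs = bitsOfDouble((double)(int64_t)(Mask & (uint64_t)C));
    // ... has the same bits as and(mask, bitcast(sint_to_fp(C))).
    uint64_t Rhs = Mask & bitsOfDouble((double)C);
    assert(Lhs == Rhs);
  }
  return 0;
}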
51944
51945/// If we are converting a value to floating-point, try to replace scalar
51946/// truncate of an extracted vector element with a bitcast. This tries to keep
51947/// the sequence on XMM registers rather than moving between vector and GPRs.
51948static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
51949 // TODO: This is currently only used by combineSIntToFP, but it is generalized
51950 // to allow being called by any similar cast opcode.
51951 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
51952 SDValue Trunc = N->getOperand(0);
51953 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
51954 return SDValue();
51955
51956 SDValue ExtElt = Trunc.getOperand(0);
51957 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51958 !isNullConstant(ExtElt.getOperand(1)))
51959 return SDValue();
51960
51961 EVT TruncVT = Trunc.getValueType();
51962 EVT SrcVT = ExtElt.getValueType();
51963 unsigned DestWidth = TruncVT.getSizeInBits();
51964 unsigned SrcWidth = SrcVT.getSizeInBits();
51965 if (SrcWidth % DestWidth != 0)
51966 return SDValue();
51967
51968 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
51969 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
51970 unsigned VecWidth = SrcVecVT.getSizeInBits();
51971 unsigned NumElts = VecWidth / DestWidth;
51972 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
51973 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
51974 SDLoc DL(N);
51975 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
51976 BitcastVec, ExtElt.getOperand(1));
51977 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
51978}
51979
51980static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
51981 const X86Subtarget &Subtarget) {
51982 bool IsStrict = N->isStrictFPOpcode();
51983 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51984 EVT VT = N->getValueType(0);
51985 EVT InVT = Op0.getValueType();
51986
51987 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
51988 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
51989 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
51990 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
51991 unsigned ScalarSize = InVT.getScalarSizeInBits();
51992 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
51993 return SDValue();
51994 SDLoc dl(N);
51995 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
51996 ScalarSize < 16 ? MVT::i16
51997 : ScalarSize < 32 ? MVT::i32
51998 : MVT::i64,
51999 InVT.getVectorNumElements());
52000 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
52001 if (IsStrict)
52002 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
52003 {N->getOperand(0), P});
52004 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
52005 }
52006
52007 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
52008 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
52009 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
52010 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
52011 VT.getScalarType() != MVT::f16) {
52012 SDLoc dl(N);
52013 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
52014 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
52015
52016 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
52017 if (IsStrict)
52018 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52019 {N->getOperand(0), P});
52020 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52021 }
52022
52023 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
52024 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
52025 // the optimization here.
52026 if (DAG.SignBitIsZero(Op0)) {
52027 if (IsStrict)
52028 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
52029 {N->getOperand(0), Op0});
52030 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
52031 }
52032
52033 return SDValue();
52034}
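
The final rewrite in combineUIntToFP uses the fact that when the sign bit of the input is known zero, the unsigned and signed conversions agree, so the cheaper SINT_TO_FP can be used. A minimal standalone check over a few sign-bit-clear values chosen for illustration.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 12345u, 0x7fffffffu}) { // sign bit clear
    double FromUnsigned = (double)X;
    double FromSigned = (double)(int32_t)X;
    assert(FromUnsigned == FromSigned);
  }
  return 0;
}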
52035
52036static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
52037 TargetLowering::DAGCombinerInfo &DCI,
52038 const X86Subtarget &Subtarget) {
52039 // First try to optimize away the conversion entirely when it's
52040 // conditionally from a constant. Vectors only.
52041 bool IsStrict = N->isStrictFPOpcode();
52042 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
52043 return Res;
52044
52045 // Now move on to more general possibilities.
52046 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
52047 EVT VT = N->getValueType(0);
52048 EVT InVT = Op0.getValueType();
52049
52050 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
52051 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
52052 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
52053 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
52054 unsigned ScalarSize = InVT.getScalarSizeInBits();
52055 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
52056 return SDValue();
52057 SDLoc dl(N);
52058 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
52059 ScalarSize < 16 ? MVT::i16
52060 : ScalarSize < 32 ? MVT::i32
52061 : MVT::i64,
52062 InVT.getVectorNumElements());
52063 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
52064 if (IsStrict)
52065 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52066 {N->getOperand(0), P});
52067 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52068 }
52069
52070 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
52071 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
52072 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
52073 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
52074 VT.getScalarType() != MVT::f16) {
52075 SDLoc dl(N);
52076 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
52077 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
52078 if (IsStrict)
52079 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52080 {N->getOperand(0), P});
52081 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52082 }
52083
52084 // Without AVX512DQ we only support i64 to float scalar conversion. For both
52085 // vectors and scalars, see if we know that the upper bits are all the sign
52086 // bit, in which case we can truncate the input to i32 and convert from that.
52087 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
52088 unsigned BitWidth = InVT.getScalarSizeInBits();
52089 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
52090 if (NumSignBits >= (BitWidth - 31)) {
52091 EVT TruncVT = MVT::i32;
52092 if (InVT.isVector())
52093 TruncVT = InVT.changeVectorElementType(TruncVT);
52094 SDLoc dl(N);
52095 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
52096 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
52097 if (IsStrict)
52098 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52099 {N->getOperand(0), Trunc});
52100 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
52101 }
52102 // If we're after legalize and the type is v2i32 we need to shuffle and
52103 // use CVTSI2P.
52104      assert(InVT == MVT::v2i64 && "Unexpected VT!");
52105 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
52106 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
52107 { 0, 2, -1, -1 });
52108 if (IsStrict)
52109 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
52110 {N->getOperand(0), Shuf});
52111 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
52112 }
52113 }
52114
52115 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
52116 // a 32-bit target where SSE doesn't support i64->FP operations.
52117 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
52118 Op0.getOpcode() == ISD::LOAD) {
52119 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
52120
52121 // This transformation is not supported if the result type is f16 or f128.
52122 if (VT == MVT::f16 || VT == MVT::f128)
52123 return SDValue();
52124
52125 // If we have AVX512DQ we can use packed conversion instructions unless
52126 // the VT is f80.
52127 if (Subtarget.hasDQI() && VT != MVT::f80)
52128 return SDValue();
52129
52130 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
52131 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
52132 std::pair<SDValue, SDValue> Tmp =
52133 Subtarget.getTargetLowering()->BuildFILD(
52134 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
52135 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
52136 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
52137 return Tmp.first;
52138 }
52139 }
52140
52141 if (IsStrict)
52142 return SDValue();
52143
52144 if (SDValue V = combineToFPTruncExtElt(N, DAG))
52145 return V;
52146
52147 return SDValue();
52148}
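
The i64-truncation path in combineSIntToFP is valid because an i64 whose upper 33 bits are copies of the sign bit holds a value that fits in i32, and converting the truncated i32 yields the same floating-point result. A short standalone check over illustrative in-range values.

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t X : {(int64_t)0, (int64_t)-1, (int64_t)INT32_MIN,
                    (int64_t)INT32_MAX, (int64_t)-123456}) {
    assert(X >= INT32_MIN && X <= INT32_MAX); // i.e. at least 33 sign bits
    assert((double)X == (double)(int32_t)X);  // truncate-then-convert matches
  }
  return 0;
}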
52149
52150static bool needCarryOrOverflowFlag(SDValue Flags) {
52151 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
52152
52153 for (const SDNode *User : Flags->uses()) {
52154 X86::CondCode CC;
52155 switch (User->getOpcode()) {
52156 default:
52157 // Be conservative.
52158 return true;
52159 case X86ISD::SETCC:
52160 case X86ISD::SETCC_CARRY:
52161 CC = (X86::CondCode)User->getConstantOperandVal(0);
52162 break;
52163 case X86ISD::BRCOND:
52164 CC = (X86::CondCode)User->getConstantOperandVal(2);
52165 break;
52166 case X86ISD::CMOV:
52167 CC = (X86::CondCode)User->getConstantOperandVal(2);
52168 break;
52169 }
52170
52171 switch (CC) {
52172 default: break;
52173 case X86::COND_A: case X86::COND_AE:
52174 case X86::COND_B: case X86::COND_BE:
52175 case X86::COND_O: case X86::COND_NO:
52176 case X86::COND_G: case X86::COND_GE:
52177 case X86::COND_L: case X86::COND_LE:
52178 return true;
52179 }
52180 }
52181
52182 return false;
52183}
52184
52185static bool onlyZeroFlagUsed(SDValue Flags) {
52186 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
52187
52188 for (const SDNode *User : Flags->uses()) {
52189 unsigned CCOpNo;
52190 switch (User->getOpcode()) {
52191 default:
52192 // Be conservative.
52193 return false;
52194 case X86ISD::SETCC: CCOpNo = 0; break;
52195 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
52196 case X86ISD::BRCOND: CCOpNo = 2; break;
52197 case X86ISD::CMOV: CCOpNo = 2; break;
52198 }
52199
52200 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
52201 if (CC != X86::COND_E && CC != X86::COND_NE)
52202 return false;
52203 }
52204
52205 return true;
52206}
52207
52208static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
52209 // Only handle test patterns.
52210 if (!isNullConstant(N->getOperand(1)))
52211 return SDValue();
52212
52213 // If we have a CMP of a truncated binop, see if we can make a smaller binop
52214 // and use its flags directly.
52215 // TODO: Maybe we should try promoting compares that only use the zero flag
52216 // first if we can prove the upper bits with computeKnownBits?
52217 SDLoc dl(N);
52218 SDValue Op = N->getOperand(0);
52219 EVT VT = Op.getValueType();
52220
52221 // If we have a constant logical shift that's only used in a comparison
52222 // against zero, turn it into an equivalent AND. This allows turning it into
52223 // a TEST instruction later.
52224 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
52225 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
52226 onlyZeroFlagUsed(SDValue(N, 0))) {
52227 unsigned BitWidth = VT.getSizeInBits();
52228 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
52229 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
52230 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
52231 APInt Mask = Op.getOpcode() == ISD::SRL
52232 ? APInt::getHighBitsSet(BitWidth, MaskBits)
52233 : APInt::getLowBitsSet(BitWidth, MaskBits);
52234 if (Mask.isSignedIntN(32)) {
52235 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
52236 DAG.getConstant(Mask, dl, VT));
52237 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52238 DAG.getConstant(0, dl, VT));
52239 }
52240 }
52241 }
52242
52243 // Peek through any zero-extend if we're only testing for a zero result.
52244 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
52245 SDValue Src = Op.getOperand(0);
52246 EVT SrcVT = Src.getValueType();
52247 if (SrcVT.getScalarSizeInBits() >= 8 &&
52248 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
52249 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
52250 DAG.getConstant(0, dl, SrcVT));
52251 }
52252
52253 // Look for a truncate.
52254 if (Op.getOpcode() != ISD::TRUNCATE)
52255 return SDValue();
52256
52257 SDValue Trunc = Op;
52258 Op = Op.getOperand(0);
52259
52260 // See if we can compare with zero against the truncation source,
52261 // which should help using the Z flag from many ops. Only do this for
52262 // an i32 truncated op to prevent partial-reg compares of promoted ops.
52263 EVT OpVT = Op.getValueType();
52264 APInt UpperBits =
52265 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
52266 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
52267 onlyZeroFlagUsed(SDValue(N, 0))) {
52268 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52269 DAG.getConstant(0, dl, OpVT));
52270 }
52271
52272 // After this the truncate and arithmetic op must have a single use.
52273 if (!Trunc.hasOneUse() || !Op.hasOneUse())
52274 return SDValue();
52275
52276 unsigned NewOpc;
52277 switch (Op.getOpcode()) {
52278 default: return SDValue();
52279 case ISD::AND:
52280 // Skip AND with a constant. We have special handling for AND with an
52281 // immediate during isel to generate TEST instructions.
52282 if (isa<ConstantSDNode>(Op.getOperand(1)))
52283 return SDValue();
52284 NewOpc = X86ISD::AND;
52285 break;
52286 case ISD::OR: NewOpc = X86ISD::OR; break;
52287 case ISD::XOR: NewOpc = X86ISD::XOR; break;
52288 case ISD::ADD:
52289 // If the carry or overflow flag is used, we can't truncate.
52290 if (needCarryOrOverflowFlag(SDValue(N, 0)))
52291 return SDValue();
52292 NewOpc = X86ISD::ADD;
52293 break;
52294 case ISD::SUB:
52295 // If the carry or overflow flag is used, we can't truncate.
52296 if (needCarryOrOverflowFlag(SDValue(N, 0)))
52297 return SDValue();
52298 NewOpc = X86ISD::SUB;
52299 break;
52300 }
52301
52302 // We found an op we can narrow. Truncate its inputs.
52303 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
52304 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
52305
52306 // Use an X86-specific opcode to avoid DAG combine messing with it.
52307 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52308 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
52309
52310 // For AND, keep a CMP so that we can match the test pattern.
52311 if (NewOpc == X86ISD::AND)
52312 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52313 DAG.getConstant(0, dl, VT));
52314
52315 // Return the flags.
52316 return Op.getValue(1);
52317}
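
The shift handling at the top of combineCMP depends on the identity that comparing a constant logical shift against zero is the same as testing the bits that survive the shift with a mask, which is what lets it become a TEST later. Below is a minimal standalone C++ sketch, not part of X86ISelLowering.cpp, that checks this identity with plain uint32_t arithmetic in place of APInt; everything in it is illustrative only.

// Identity used by combineCMP's shift handling, checked with uint32_t:
//   (X >> C) == 0  <=>  (X & getHighBitsSet(BitWidth, BitWidth - C)) == 0
//   (X << C) == 0  <=>  (X & getLowBitsSet(BitWidth, BitWidth - C))  == 0
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const unsigned BitWidth = 32;
  for (unsigned C = 1; C < BitWidth; ++C) {
    unsigned MaskBits = BitWidth - C;
    uint32_t HighMask = ~0u << C;            // high MaskBits bits: the bits that survive SRL by C
    uint32_t LowMask = (1u << MaskBits) - 1; // low MaskBits bits: the bits that survive SHL by C
    for (uint32_t X : {0u, 1u, 0x80000000u, 0xdeadbeefu, ~0u}) {
      assert(((X >> C) == 0) == ((X & HighMask) == 0));
      assert(((X << C) == 0) == ((X & LowMask) == 0));
    }
  }
  return 0;
}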
52318
52319static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
52320 TargetLowering::DAGCombinerInfo &DCI) {
52321 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
52322 "Expected X86ISD::ADD or X86ISD::SUB");
52323
52324 SDLoc DL(N);
52325 SDValue LHS = N->getOperand(0);
52326 SDValue RHS = N->getOperand(1);
52327 MVT VT = LHS.getSimpleValueType();
52328 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
52329
52330 // If we don't use the flag result, simplify back to a generic ADD/SUB.
52331 if (!N->hasAnyUseOfValue(1)) {
52332 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
52333 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
52334 }
52335
52336 // Fold any similar generic ADD/SUB opcodes to reuse this node.
52337 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
52338 SDValue Ops[] = {N0, N1};
52339 SDVTList VTs = DAG.getVTList(N->getValueType(0));
52340 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
52341 SDValue Op(N, 0);
52342 if (Negate)
52343 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
52344 DCI.CombineTo(GenericAddSub, Op);
52345 }
52346 };
52347 MatchGeneric(LHS, RHS, false);
52348 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
52349
52350 return SDValue();
52351}
52352
52353static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
52354 SDValue LHS = N->getOperand(0);
52355 SDValue RHS = N->getOperand(1);
52356 SDValue BorrowIn = N->getOperand(2);
52357
52358 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
52359 MVT VT = N->getSimpleValueType(0);
52360 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52361 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
52362 }
52363
52364 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
52365 // iff the flag result is dead.
52366 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
52367 !N->hasAnyUseOfValue(1))
52368 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
52369 LHS.getOperand(1), BorrowIn);
52370
52371 return SDValue();
52372}
52373
52374// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
52375static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
52376 TargetLowering::DAGCombinerInfo &DCI) {
52377 SDValue LHS = N->getOperand(0);
52378 SDValue RHS = N->getOperand(1);
52379 SDValue CarryIn = N->getOperand(2);
52380 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
52381 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
52382
52383 // Canonicalize constant to RHS.
52384 if (LHSC && !RHSC)
52385 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
52386 CarryIn);
52387
52388 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
52389 // the result is either zero or one (depending on the input carry bit).
52390 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
52391 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
52392 // We don't have a good way to replace an EFLAGS use, so only do this when
52393 // dead right now.
52394 SDValue(N, 1).use_empty()) {
52395 SDLoc DL(N);
52396 EVT VT = N->getValueType(0);
52397 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
52398 SDValue Res1 = DAG.getNode(
52399 ISD::AND, DL, VT,
52400 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52401 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
52402 DAG.getConstant(1, DL, VT));
52403 return DCI.CombineTo(N, Res1, CarryOut);
52404 }
52405
52406 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
52407 // iff the flag result is dead.
52408 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
52409 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
52410 SDLoc DL(N);
52411 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
52412 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
52413 DAG.getConstant(0, DL, LHS.getValueType()),
52414 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
52415 }
52416
52417 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
52418 MVT VT = N->getSimpleValueType(0);
52419 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52420 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
52421 }
52422
52423 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
52424 // iff the flag result is dead.
52425 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
52426 !N->hasAnyUseOfValue(1))
52427 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
52428 LHS.getOperand(1), CarryIn);
52429
52430 return SDValue();
52431}
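
A note on the ADC folds above: the value result of X86ISD::ADC is LHS + RHS + CF, so folding ADC(C1,C2,Carry) into ADC(0,C1+C2,Carry) is value-preserving whenever the flag result is dead, which is exactly the condition the combine checks. The following standalone sketch (illustrative only, not LLVM code) models that semantics and spot-checks the identity.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Value result of x86 ADC: LHS + RHS + CF, wrapping modulo 2^64.
static uint64_t adcValue(uint64_t LHS, uint64_t RHS, bool CarryIn) {
  return LHS + RHS + (CarryIn ? 1 : 0);
}

int main() {
  const uint64_t Cases[][2] = {
      {1, 2}, {0xffffffffffffffffULL, 1}, {123, 0}, {~0ULL, ~0ULL}};
  for (const auto &C : Cases)
    for (bool CF : {false, true})
      // ADC(C1, C2, CF) and ADC(0, C1 + C2, CF) agree on the value result,
      // which is all that matters once the flag result is known to be dead.
      assert(adcValue(C[0], C[1], CF) == adcValue(0, C[0] + C[1], CF));
  return 0;
}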
52432
52433/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52434/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52435/// with CMP+{ADC, SBB}.
52436/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52437static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52438 SDValue X, SDValue Y,
52439 SelectionDAG &DAG) {
52440 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52441 return SDValue();
52442
52443 // Look through a one-use zext.
52444 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52445 Y = Y.getOperand(0);
52446
52447 X86::CondCode CC;
52448 SDValue EFLAGS;
52449 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52450 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52451 EFLAGS = Y.getOperand(1);
52452 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52453 Y.hasOneUse()) {
52454 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52455 }
52456
52457 if (!EFLAGS)
52458 return SDValue();
52459
52460 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52461 // the general case below.
52462 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52463 if (ConstantX) {
52464 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52465 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52466 // This is a complicated way to get -1 or 0 from the carry flag:
52467 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52468 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52469 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52470 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52471 EFLAGS);
52472 }
52473
52474 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52475 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52476 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52477 EFLAGS.getValueType().isInteger() &&
52478 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52479 // Swap the operands of a SUB, and we have the same pattern as above.
52480 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52481 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52482 SDValue NewSub = DAG.getNode(
52483 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52484 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52485 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52486 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52487 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52488 NewEFLAGS);
52489 }
52490 }
52491 }
52492
52493 if (CC == X86::COND_B) {
52494 // X + SETB Z --> adc X, 0
52495 // X - SETB Z --> sbb X, 0
52496 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52497 DAG.getVTList(VT, MVT::i32), X,
52498 DAG.getConstant(0, DL, VT), EFLAGS);
52499 }
52500
52501 if (CC == X86::COND_A) {
52502 // Try to convert COND_A into COND_B in an attempt to facilitate
52503 // materializing "setb reg".
52504 //
52505 // Do not flip "e > c", where "c" is a constant, because the CMP instruction
52506 // cannot take an immediate as its first operand.
52507 //
52508 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52509 EFLAGS.getValueType().isInteger() &&
52510 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52511 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
52512 EFLAGS.getNode()->getVTList(),
52513 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52514 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52515 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52516 DAG.getVTList(VT, MVT::i32), X,
52517 DAG.getConstant(0, DL, VT), NewEFLAGS);
52518 }
52519 }
52520
52521 if (CC == X86::COND_AE) {
52522 // X + SETAE --> sbb X, -1
52523 // X - SETAE --> adc X, -1
52524 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52525 DAG.getVTList(VT, MVT::i32), X,
52526 DAG.getConstant(-1, DL, VT), EFLAGS);
52527 }
52528
52529 if (CC == X86::COND_BE) {
52530 // X + SETBE --> sbb X, -1
52531 // X - SETBE --> adc X, -1
52532 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52533 // materializing "setae reg".
52534 //
52535 // Do not flip "e <= c", where "c" is a constant, because the CMP instruction
52536 // cannot take an immediate as its first operand.
52537 //
52538 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52539 EFLAGS.getValueType().isInteger() &&
52540 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52541 SDValue NewSub = DAG.getNode(
52542 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52543 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52544 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52545 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52546 DAG.getVTList(VT, MVT::i32), X,
52547 DAG.getConstant(-1, DL, VT), NewEFLAGS);
52548 }
52549 }
52550
52551 if (CC != X86::COND_E && CC != X86::COND_NE)
52552 return SDValue();
52553
52554 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52555 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52556 !EFLAGS.getOperand(0).getValueType().isInteger())
52557 return SDValue();
52558
52559 SDValue Z = EFLAGS.getOperand(0);
52560 EVT ZVT = Z.getValueType();
52561
52562 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52563 // the general case below.
52564 if (ConstantX) {
52565 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52566 // fake operands:
52567 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52568 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52569 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52570 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52571 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52572 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52573 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52574 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52575 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52576 SDValue(Neg.getNode(), 1));
52577 }
52578
52579 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52580 // with fake operands:
52581 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52582 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52583 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52584 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52585 SDValue One = DAG.getConstant(1, DL, ZVT);
52586 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52587 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52588 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52589 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52590 Cmp1.getValue(1));
52591 }
52592 }
52593
52594 // (cmp Z, 1) sets the carry flag if Z is 0.
52595 SDValue One = DAG.getConstant(1, DL, ZVT);
52596 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52597 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52598
52599 // Add the flags type for ADC/SBB nodes.
52600 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52601
52602 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52603 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52604 if (CC == X86::COND_NE)
52605 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52606 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
52607
52608 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52609 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52610 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52611 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52612}
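
The tail of this helper leans on the fact that cmp Z, 1 (emitted here as X86ISD::SUB Z, 1) sets the carry flag exactly when Z == 0, so adding or subtracting a setcc of Z against zero can be folded into adc/sbb with a 0 or -1 immediate. The sketch below (illustrative only, not LLVM code) models that carry flag and spot-checks the four identities quoted in the comments.

// Model: for unsigned Z, "cmp Z, 1" sets CF iff Z < 1, i.e. iff Z == 0.
// Values wrap modulo 2^64, matching the DAG nodes.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint64_t adc(uint64_t A, uint64_t B, bool CF) { return A + B + (CF ? 1 : 0); }
static uint64_t sbb(uint64_t A, uint64_t B, bool CF) { return A - B - (CF ? 1 : 0); }

int main() {
  for (uint64_t Z : {0ULL, 1ULL, 42ULL, ~0ULL}) {
    bool CF = Z < 1; // carry out of "cmp Z, 1"
    for (uint64_t X : {0ULL, 7ULL, ~0ULL}) {
      assert(X + (Z != 0 ? 1 : 0) == sbb(X, (uint64_t)-1, CF)); // X + (Z != 0) --> sbb X, -1
      assert(X + (Z == 0 ? 1 : 0) == adc(X, 0, CF));            // X + (Z == 0) --> adc X, 0
      assert(X - (Z != 0 ? 1 : 0) == adc(X, (uint64_t)-1, CF)); // X - (Z != 0) --> adc X, -1
      assert(X - (Z == 0 ? 1 : 0) == sbb(X, 0, CF));            // X - (Z == 0) --> sbb X, 0
    }
  }
  return 0;
}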
52613
52614/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52615/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52616/// with CMP+{ADC, SBB}.
52617static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
52618 bool IsSub = N->getOpcode() == ISD::SUB;
52619 SDValue X = N->getOperand(0);
52620 SDValue Y = N->getOperand(1);
52621 EVT VT = N->getValueType(0);
52622 SDLoc DL(N);
52623
52624 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52625 return ADCOrSBB;
52626
52627 // Commute and try again (negate the result for subtracts).
52628 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52629 if (IsSub)
52630 ADCOrSBB =
52631 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
52632 return ADCOrSBB;
52633 }
52634
52635 return SDValue();
52636}
52637
52638static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
52639 const SDLoc &DL, EVT VT,
52640 const X86Subtarget &Subtarget) {
52641 // Example of pattern we try to detect:
52642 // t := (v8i32 mul (sext (v8i16 x0)), (sext (v8i16 x1)))
52643 // (add (build_vector (extract_elt t, 0),
52644 // (extract_elt t, 2),
52645 // (extract_elt t, 4),
52646 // (extract_elt t, 6)),
52647 // (build_vector (extract_elt t, 1),
52648 // (extract_elt t, 3),
52649 // (extract_elt t, 5),
52650 // (extract_elt t, 7)))
52651
52652 if (!Subtarget.hasSSE2())
52653 return SDValue();
52654
52655 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
52656 Op1.getOpcode() != ISD::BUILD_VECTOR)
52657 return SDValue();
52658
52659 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52660 VT.getVectorNumElements() < 4 ||
52661 !isPowerOf2_32(VT.getVectorNumElements()))
52662 return SDValue();
52663
52664 // Check if one of Op0,Op1 is of the form:
52665 // (build_vector (extract_elt Mul, 0),
52666 // (extract_elt Mul, 2),
52667 // (extract_elt Mul, 4),
52668 // ...
52669 // the other is of the form:
52670 // (build_vector (extract_elt Mul, 1),
52671 // (extract_elt Mul, 3),
52672 // (extract_elt Mul, 5),
52673 // ...
52674 // and identify Mul.
52675 SDValue Mul;
52676 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
52677 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
52678 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
52679 // TODO: Be more tolerant to undefs.
52680 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52681 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52682 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52683 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52684 return SDValue();
52685 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
52686 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
52687 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
52688 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
52689 if (!Const0L || !Const1L || !Const0H || !Const1H)
52690 return SDValue();
52691 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
52692 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
52693 // Commutativity of mul allows factors of a product to reorder.
52694 if (Idx0L > Idx1L)
52695 std::swap(Idx0L, Idx1L);
52696 if (Idx0H > Idx1H)
52697 std::swap(Idx0H, Idx1H);
52698 // Commutativity of add allows pairs of factors to reorder.
52699 if (Idx0L > Idx0H) {
52700 std::swap(Idx0L, Idx0H);
52701 std::swap(Idx1L, Idx1H);
52702 }
52703 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
52704 Idx1H != 2 * i + 3)
52705 return SDValue();
52706 if (!Mul) {
52707 // First time an extract_elt's source vector is visited. It must be a MUL
52708 // with 2X the number of vector elements of the BUILD_VECTOR.
52709 // Both extracts must be from the same MUL.
52710 Mul = Op0L->getOperand(0);
52711 if (Mul->getOpcode() != ISD::MUL ||
52712 Mul.getValueType().getVectorNumElements() != 2 * e)
52713 return SDValue();
52714 }
52715 // Check that the extract is from the same MUL previously seen.
52716 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
52717 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
52718 return SDValue();
52719 }
52720
52721 // Check if the Mul source can be safely shrunk.
52722 ShrinkMode Mode;
52723 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
52724 Mode == ShrinkMode::MULU16)
52725 return SDValue();
52726
52727 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52728 VT.getVectorNumElements() * 2);
52729 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
52730 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
52731
52732 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52733 ArrayRef<SDValue> Ops) {
52734 EVT InVT = Ops[0].getValueType();
52735 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52736 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52737 InVT.getVectorNumElements() / 2);
52738 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52739 };
52740 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
52741}
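
For context, the instruction matchPMADDWD is building towards, X86ISD::VPMADDWD (pmaddwd), multiplies adjacent signed 16-bit pairs and sums each pair into a 32-bit lane; the even/odd extract-and-add pattern above reconstructs exactly that. A scalar reference model follows (illustrative only; the pmaddwd helper below is a stand-in, not an intrinsic or LLVM API).

// Scalar reference model of pmaddwd: each 32-bit result lane is
//   (int32)A[2i] * B[2i] + (int32)A[2i+1] * B[2i+1]
// with A and B holding signed 16-bit elements.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int32_t> pmaddwd(const std::vector<int16_t> &A,
                                    const std::vector<int16_t> &B) {
  assert(A.size() == B.size() && A.size() % 2 == 0);
  std::vector<int32_t> R(A.size() / 2);
  for (unsigned i = 0; i < R.size(); ++i)
    R[i] = int32_t(A[2 * i]) * B[2 * i] + int32_t(A[2 * i + 1]) * B[2 * i + 1];
  return R;
}

int main() {
  std::vector<int16_t> A = {1, -2, 3, 4, -32768, 2, 100, -100};
  std::vector<int16_t> B = {5, 6, -7, 8, 2, 2, -1, 1};
  std::vector<int32_t> R = pmaddwd(A, B);
  assert(R[0] == 1 * 5 + (-2) * 6);      // -7
  assert(R[1] == 3 * -7 + 4 * 8);        // 11
  assert(R[2] == -32768 * 2 + 2 * 2);    // -65532
  assert(R[3] == 100 * -1 + (-100) * 1); // -200
  return 0;
}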
52742
52743// Attempt to turn this pattern into PMADDWD.
52744// (add (mul (sext (build_vector)), (sext (build_vector))),
52745// (mul (sext (build_vector)), (sext (build_vector)))
52746static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
52747 const SDLoc &DL, EVT VT,
52748 const X86Subtarget &Subtarget) {
52749 if (!Subtarget.hasSSE2())
52750 return SDValue();
52751
52752 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52753 return SDValue();
52754
52755 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52756 VT.getVectorNumElements() < 4 ||
52757 !isPowerOf2_32(VT.getVectorNumElements()))
52758 return SDValue();
52759
52760 SDValue N00 = N0.getOperand(0);
52761 SDValue N01 = N0.getOperand(1);
52762 SDValue N10 = N1.getOperand(0);
52763 SDValue N11 = N1.getOperand(1);
52764
52765 // All inputs need to be sign extends.
52766 // TODO: Support ZERO_EXTEND from known positive?
52767 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
52768 N01.getOpcode() != ISD::SIGN_EXTEND ||
52769 N10.getOpcode() != ISD::SIGN_EXTEND ||
52770 N11.getOpcode() != ISD::SIGN_EXTEND)
52771 return SDValue();
52772
52773 // Peek through the extends.
52774 N00 = N00.getOperand(0);
52775 N01 = N01.getOperand(0);
52776 N10 = N10.getOperand(0);
52777 N11 = N11.getOperand(0);
52778
52779 // Must be extending from vXi16.
52780 EVT InVT = N00.getValueType();
52781 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
52782 N10.getValueType() != InVT || N11.getValueType() != InVT)
52783 return SDValue();
52784
52785 // All inputs should be build_vectors.
52786 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52787 N01.getOpcode() != ISD::BUILD_VECTOR ||
52788 N10.getOpcode() != ISD::BUILD_VECTOR ||
52789 N11.getOpcode() != ISD::BUILD_VECTOR)
52790 return SDValue();
52791
52792 // For each element, we need to ensure we have an odd element from one vector
52793 // multiplied by the odd element of the other vector, and the even element
52794 // from one of the same vectors multiplied by the even element from the
52795 // other vector. So we need to make sure that for each element i, this
52796 // operation is being performed:
52797 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52798 SDValue In0, In1;
52799 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
52800 SDValue N00Elt = N00.getOperand(i);
52801 SDValue N01Elt = N01.getOperand(i);
52802 SDValue N10Elt = N10.getOperand(i);
52803 SDValue N11Elt = N11.getOperand(i);
52804 // TODO: Be more tolerant to undefs.
52805 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52806 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52807 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52808 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52809 return SDValue();
52810 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52811 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52812 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52813 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52814 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52815 return SDValue();
52816 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52817 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52818 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52819 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52820 // Add is commutative so indices can be reordered.
52821 if (IdxN00 > IdxN10) {
52822 std::swap(IdxN00, IdxN10);
52823 std::swap(IdxN01, IdxN11);
52824 }
52825 // N0 indices must be the even element. N1 indices must be the next odd element.
52826 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52827 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52828 return SDValue();
52829 SDValue N00In = N00Elt.getOperand(0);
52830 SDValue N01In = N01Elt.getOperand(0);
52831 SDValue N10In = N10Elt.getOperand(0);
52832 SDValue N11In = N11Elt.getOperand(0);
52833
52834 // The first time we find an input, capture it.
52835 if (!In0) {
52836 In0 = N00In;
52837 In1 = N01In;
52838
52839 // The input vectors must be at least as wide as the output.
52840 // If they are larger than the output, we extract a subvector below.
52841 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
52842 In1.getValueSizeInBits() < VT.getSizeInBits())
52843 return SDValue();
52844 }
52845 // Mul is commutative so the input vectors can be in any order.
52846 // Canonicalize to make the compares easier.
52847 if (In0 != N00In)
52848 std::swap(N00In, N01In);
52849 if (In0 != N10In)
52850 std::swap(N10In, N11In);
52851 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
52852 return SDValue();
52853 }
52854
52855 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52856 ArrayRef<SDValue> Ops) {
52857 EVT OpVT = Ops[0].getValueType();
52858 assert(OpVT.getScalarType() == MVT::i16 &&
52859 "Unexpected scalar element type");
52860 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
52861 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52862 OpVT.getVectorNumElements() / 2);
52863 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52864 };
52865
52866 // If the output is narrower than an input, extract the low part of the input
52867 // vector.
52868 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52869 VT.getVectorNumElements() * 2);
52870 if (OutVT16.bitsLT(In0.getValueType())) {
52871 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
52872 DAG.getIntPtrConstant(0, DL));
52873 }
52874 if (OutVT16.bitsLT(In1.getValueType())) {
52875 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
52876 DAG.getIntPtrConstant(0, DL));
52877 }
52878 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
52879 PMADDBuilder);
52880}
52881
52882// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
52884 // If the upper element in each pair of both VPMADDWD operands is zero, we can merge
52884// the operand elements and use the implicit add of VPMADDWD.
52885// TODO: Add support for VPMADDUBSW (which isn't commutable).
52886static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
52887 const SDLoc &DL, EVT VT) {
52888 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
52889 return SDValue();
52890
52891 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
52892 if (VT.getSizeInBits() > 128)
52893 return SDValue();
52894
52895 unsigned NumElts = VT.getVectorNumElements();
52896 MVT OpVT = N0.getOperand(0).getSimpleValueType();
52897 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
52898 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
52899
52900 bool Op0HiZero =
52901 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
52902 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
52903 bool Op1HiZero =
52904 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
52905 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
52906
52907 // TODO: Check for zero lower elements once we have actual codegen that
52908 // creates them.
52909 if (!Op0HiZero || !Op1HiZero)
52910 return SDValue();
52911
52912 // Create a shuffle mask packing the lower elements from each VPMADDWD.
52913 SmallVector<int> Mask;
52914 for (int i = 0; i != (int)NumElts; ++i) {
52915 Mask.push_back(2 * i);
52916 Mask.push_back(2 * (i + NumElts));
52917 }
52918
52919 SDValue LHS =
52920 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
52921 SDValue RHS =
52922 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
52923 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
52924}
52925
52926/// CMOV of constants requires materializing constant operands in registers.
52927/// Try to fold those constants into an 'add' instruction to reduce instruction
52928 /// count. We do this with CMOV rather than the generic 'select' because there are
52929/// earlier folds that may be used to turn select-of-constants into logic hacks.
52930static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
52931 const X86Subtarget &Subtarget) {
52932 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
52933 // better because we eliminate 1-2 instructions. This transform is still
52934 // an improvement without zero operands because we trade 2 move constants and
52935 // 1 add for 2 adds (LEA) as long as the constants can be represented as
52936 // immediate asm operands (fit in 32-bits).
52937 auto isSuitableCmov = [](SDValue V) {
52938 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
52939 return false;
52940 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
52941 !isa<ConstantSDNode>(V.getOperand(1)))
52942 return false;
52943 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
52944 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
52945 V.getConstantOperandAPInt(1).isSignedIntN(32));
52946 };
52947
52948 // Match an appropriate CMOV as the first operand of the add.
52949 SDValue Cmov = N->getOperand(0);
52950 SDValue OtherOp = N->getOperand(1);
52951 if (!isSuitableCmov(Cmov))
52952 std::swap(Cmov, OtherOp);
52953 if (!isSuitableCmov(Cmov))
52954 return SDValue();
52955
52956 // Don't remove a load folding opportunity for the add. That would neutralize
52957 // any improvements from removing constant materializations.
52958 if (X86::mayFoldLoad(OtherOp, Subtarget))
52959 return SDValue();
52960
52961 EVT VT = N->getValueType(0);
52962 SDLoc DL(N);
52963 SDValue FalseOp = Cmov.getOperand(0);
52964 SDValue TrueOp = Cmov.getOperand(1);
52965
52966 // We will push the add through the select, but we can potentially do better
52967 // if we know there is another add in the sequence and this is pointer math.
52968 // In that case, we can absorb an add into the trailing memory op and avoid
52969 // a 3-operand LEA which is likely slower than a 2-operand LEA.
52970 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
52971 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
52972 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
52973 all_of(N->uses(), [&](SDNode *Use) {
52974 auto *MemNode = dyn_cast<MemSDNode>(Use);
52975 return MemNode && MemNode->getBasePtr().getNode() == N;
52976 })) {
52977 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
52978 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
52979 // it is possible that choosing op1 might be better.
52980 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
52981 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
52982 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
52983 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
52984 Cmov.getOperand(2), Cmov.getOperand(3));
52985 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
52986 }
52987
52988 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
52989 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
52990 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
52991 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
52992 Cmov.getOperand(3));
52993}
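
The scalar identity behind pushAddIntoCmovOfConsts is that addition distributes over a select: add (cmov C1, C2), X equals cmov (add X, C1), (add X, C2) for either condition value. A minimal sketch (illustrative only; the cmov helper is a stand-in for X86ISD::CMOV) spot-checks this.

#include <cassert>
#include <cstdint>

// X86ISD::CMOV operand order is (FalseOp, TrueOp, CC, EFLAGS); model the
// selected value with a ternary.
static uint64_t cmov(bool Cond, uint64_t FalseV, uint64_t TrueV) {
  return Cond ? TrueV : FalseV;
}

int main() {
  const uint64_t C1 = 0, C2 = 42, X = 0xfffffffffffffff0ULL;
  // add (cmov C1, C2), X  ==  cmov (add X, C1), (add X, C2), for either
  // condition value; both sides wrap identically modulo 2^64.
  assert(cmov(false, C1, C2) + X == cmov(false, C1 + X, C2 + X));
  assert(cmov(true, C1, C2) + X == cmov(true, C1 + X, C2 + X));
  return 0;
}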
52994
52995static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
52996 TargetLowering::DAGCombinerInfo &DCI,
52997 const X86Subtarget &Subtarget) {
52998 EVT VT = N->getValueType(0);
52999 SDValue Op0 = N->getOperand(0);
53000 SDValue Op1 = N->getOperand(1);
53001 SDLoc DL(N);
53002
53003 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
53004 return Select;
53005
53006 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
53007 return MAdd;
53008 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
53009 return MAdd;
53010 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
53011 return MAdd;
53012
53013 // Try to synthesize horizontal adds from adds of shuffles.
53014 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
53015 return V;
53016
53017 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
53018 // (sub Y, (sext (vXi1 X))).
53019 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
53020 // generic DAG combine without a legal type check, but adding this there
53021 // caused regressions.
53022 if (VT.isVector()) {
53023 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53024 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
53025 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53026 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
53027 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
53028 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
53029 }
53030
53031 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
53032 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53033 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
53034 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
53035 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
53036 }
53037 }
53038
53039 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
53040 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
53041 X86::isZeroNode(Op0.getOperand(1))) {
53042 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
53043 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
53044 Op0.getOperand(0), Op0.getOperand(2));
53045 }
53046
53047 return combineAddOrSubToADCOrSBB(N, DAG);
53048}
53049
53050// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
53051// condition comes from the subtract node that produced -X. This matches the
53052// cmov expansion for absolute value. By swapping the operands we convert abs
53053// to nabs.
53054static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
53055 SDValue N0 = N->getOperand(0);
53056 SDValue N1 = N->getOperand(1);
53057
53058 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
53059 return SDValue();
53060
53061 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
53062 if (CC != X86::COND_S && CC != X86::COND_NS)
53063 return SDValue();
53064
53065 // Condition should come from a negate operation.
53066 SDValue Cond = N1.getOperand(3);
53067 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
53068 return SDValue();
53069 assert(Cond.getResNo() == 1 && "Unexpected result number");
53070
53071 // Get the X and -X from the negate.
53072 SDValue NegX = Cond.getValue(0);
53073 SDValue X = Cond.getOperand(1);
53074
53075 SDValue FalseOp = N1.getOperand(0);
53076 SDValue TrueOp = N1.getOperand(1);
53077
53078 // Cmov operands should be X and NegX. Order doesn't matter.
53079 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
53080 return SDValue();
53081
53082 // Build a new CMOV with the operands swapped.
53083 SDLoc DL(N);
53084 MVT VT = N->getSimpleValueType(0);
53085 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
53086 N1.getOperand(2), Cond);
53087 // Convert sub to add.
53088 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
53089}
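
combineSubABS relies on the value identity Y - |X| == Y + (-|X|): swapping the cmov operands turns the abs result into its negation (nabs), letting the sub become an add. A tiny standalone sketch (illustrative only, not LLVM code) checks the identity on a few values.

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (int64_t X : {-5LL, 0LL, 7LL}) { // avoid INT64_MIN, whose negation overflows
    int64_t Abs = X < 0 ? -X : X;      // value selected by the original cmov
    int64_t Nabs = -Abs;               // value selected once the cmov operands are swapped
    for (int64_t Y : {-3LL, 0LL, 100LL})
      assert(Y - Abs == Y + Nabs);     // sub of abs == add of nabs
  }
  return 0;
}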
53090
53091static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
53092 TargetLowering::DAGCombinerInfo &DCI,
53093 const X86Subtarget &Subtarget) {
53094 SDValue Op0 = N->getOperand(0);
53095 SDValue Op1 = N->getOperand(1);
53096
53097 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
53098 auto IsNonOpaqueConstant = [&](SDValue Op) {
53099 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
53100 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
53101 return !Cst->isOpaque();
53102 return true;
53103 }
53104 return false;
53105 };
53106
53107 // X86 can't encode an immediate LHS of a sub. See if we can push the
53108 // negation into a preceding instruction. If the RHS of the sub is an XOR with
53109 // one use and a constant, invert the immediate, saving one register.
53110 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
53111 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
53112 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
53113 SDLoc DL(N);
53114 EVT VT = Op0.getValueType();
53115 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
53116 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
53117 SDValue NewAdd =
53118 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
53119 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
53120 }
53121
53122 if (SDValue V = combineSubABS(N, DAG))
53123 return V;
53124
53125 // Try to synthesize horizontal subs from subs of shuffles.
53126 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
53127 return V;
53128
53129 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
53130 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
53131 X86::isZeroNode(Op1.getOperand(1))) {
53132 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
53133 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
53134 Op1.getOperand(0), Op1.getOperand(2));
53135 }
53136
53137 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
53138 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
53139 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
53140 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
53141 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
53142 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
53143 Op1.getOperand(1), Op1.getOperand(2));
53144 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
53145 Op1.getOperand(0));
53146 }
53147
53148 return combineAddOrSubToADCOrSBB(N, DAG);
53149}
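
The first rewrite in combineSub uses the two's-complement identity sub(C1, xor(X, C2)) == add(xor(X, ~C2), C1 + 1), which follows from -v == ~v + 1 and ~(X ^ C2) == X ^ ~C2. A minimal sketch (illustrative only, not LLVM code) verifies it over a few constants.

// C1 - (X ^ C2)  ==  (X ^ ~C2) + (C1 + 1), with arithmetic wrapping modulo 2^64.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint64_t C1 : {0ULL, 1ULL, 0x1234ULL, ~0ULL})
    for (uint64_t C2 : {0ULL, 0xff00ff00ff00ff00ULL, ~0ULL})
      for (uint64_t X : {0ULL, 42ULL, 0xdeadbeefULL, ~0ULL})
        assert(C1 - (X ^ C2) == ((X ^ ~C2) + (C1 + 1)));
  return 0;
}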
53150
53151static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
53152 const X86Subtarget &Subtarget) {
53153 MVT VT = N->getSimpleValueType(0);
53154 SDLoc DL(N);
53155
53156 if (N->getOperand(0) == N->getOperand(1)) {
53157 if (N->getOpcode() == X86ISD::PCMPEQ)
53158 return DAG.getConstant(-1, DL, VT);
53159 if (N->getOpcode() == X86ISD::PCMPGT)
53160 return DAG.getConstant(0, DL, VT);
53161 }
53162
53163 return SDValue();
53164}
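
combineVectorCompare folds self-compares to constants because PCMPEQ(X, X) is all-ones and PCMPGT(X, X) is all-zeros in every lane. A per-lane scalar model (illustrative only; the lane helpers are stand-ins, not intrinsics) shows the two cases.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Per-lane models of the vector compares: -1 (all bits set) on true, 0 on false.
static int32_t pcmpeqLane(int32_t A, int32_t B) { return A == B ? -1 : 0; }
static int32_t pcmpgtLane(int32_t A, int32_t B) { return A > B ? -1 : 0; }

int main() {
  for (int32_t X : {0, -7, 123456, INT32_MIN}) {
    assert(pcmpeqLane(X, X) == -1); // PCMPEQ(X, X) -> all-ones
    assert(pcmpgtLane(X, X) == 0);  // PCMPGT(X, X) -> zero
  }
  return 0;
}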
53165
53166/// Helper that combines an array of subvector ops as if they were the operands
53167/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
53168/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
53169static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
53170 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
53171 TargetLowering::DAGCombinerInfo &DCI,
53172 const X86Subtarget &Subtarget) {
53173 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
53174 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53175
53176 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
53177 return DAG.getUNDEF(VT);
53178
53179 if (llvm::all_of(Ops, [](SDValue Op) {
53180 return ISD::isBuildVectorAllZeros(Op.getNode());
53181 }))
53182 return getZeroVector(VT, Subtarget, DAG, DL);
53183
53184 SDValue Op0 = Ops[0];
53185 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
53186
53187 // Repeated subvectors.
53188 if (IsSplat &&
53189 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
53190 // If this broadcast is inserted into both halves, use a larger broadcast.
53191 if (Op0.getOpcode() == X86ISD::VBROADCAST)
53192 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
53193
53194 // If this simple subvector or scalar/subvector broadcast_load is inserted
53195 // into both halves, use a larger broadcast_load. Update other uses to use
53196 // an extracted subvector.
53197 if (ISD::isNormalLoad(Op0.getNode()) ||
53198 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
53199 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
53200 auto *Mem = cast<MemSDNode>(Op0);
53201 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
53202 ? X86ISD::VBROADCAST_LOAD
53203 : X86ISD::SUBV_BROADCAST_LOAD;
53204 if (SDValue BcastLd =
53205 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
53206 SDValue BcastSrc =
53207 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
53208 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
53209 return BcastLd;
53210 }
53211 }
53212
53213 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
53214 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
53215 (Subtarget.hasAVX2() ||
53216 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
53217 VT.getScalarType(), Subtarget)))
53218 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
53219 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
53220 Op0.getOperand(0),
53221 DAG.getIntPtrConstant(0, DL)));
53222
53223 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
53224 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53225 (Subtarget.hasAVX2() ||
53226 (EltSizeInBits >= 32 &&
53227 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
53228 Op0.getOperand(0).getValueType() == VT.getScalarType())
53229 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
53230
53231 // concat_vectors(extract_subvector(broadcast(x)),
53232 // extract_subvector(broadcast(x))) -> broadcast(x)
53233 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53234 Op0.getOperand(0).getValueType() == VT) {
53235 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
53236 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
53237 return Op0.getOperand(0);
53238 }
53239 }
53240
53241 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
53242 // Only concat of subvector high halves which vperm2x128 is best at.
53243 // TODO: This should go in combineX86ShufflesRecursively eventually.
53244 if (VT.is256BitVector() && Ops.size() == 2) {
53245 SDValue Src0 = peekThroughBitcasts(Ops[0]);
53246 SDValue Src1 = peekThroughBitcasts(Ops[1]);
53247 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53248 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
53249 EVT SrcVT0 = Src0.getOperand(0).getValueType();
53250 EVT SrcVT1 = Src1.getOperand(0).getValueType();
53251 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
53252 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
53253 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
53254 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
53255 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
53256 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
53257 DAG.getBitcast(VT, Src0.getOperand(0)),
53258 DAG.getBitcast(VT, Src1.getOperand(0)),
53259 DAG.getTargetConstant(0x31, DL, MVT::i8));
53260 }
53261 }
53262 }
53263
53264 // Repeated opcode.
53265 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
53266 // but it currently struggles with different vector widths.
53267 if (llvm::all_of(Ops, [Op0](SDValue Op) {
53268 return Op.getOpcode() == Op0.getOpcode();
53269 })) {
53270 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
53271 SmallVector<SDValue> Subs;
53272 for (SDValue SubOp : SubOps)
53273 Subs.push_back(SubOp.getOperand(I));
53274 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
53275 };
53276
53277 unsigned NumOps = Ops.size();
53278 switch (Op0.getOpcode()) {
53279 case X86ISD::VBROADCAST: {
53280 if (!IsSplat && VT == MVT::v4f64 && llvm::all_of(Ops, [](SDValue Op) {
53281 return Op.getOperand(0).getValueType().is128BitVector();
53282 }))
53283 return DAG.getNode(X86ISD::MOVDDUP, DL, VT,
53284 ConcatSubOperand(VT, Ops, 0));
53285 break;
53286 }
53287 case X86ISD::MOVDDUP:
53288 case X86ISD::MOVSHDUP:
53289 case X86ISD::MOVSLDUP: {
53290 if (!IsSplat)
53291 return DAG.getNode(Op0.getOpcode(), DL, VT,
53292 ConcatSubOperand(VT, Ops, 0));
53293 break;
53294 }
53295 case X86ISD::SHUFP: {
53296 // Add SHUFPD support if/when necessary.
53297 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
53298 llvm::all_of(Ops, [Op0](SDValue Op) {
53299 return Op.getOperand(2) == Op0.getOperand(2);
53300 })) {
53301 return DAG.getNode(Op0.getOpcode(), DL, VT,
53302 ConcatSubOperand(VT, Ops, 0),
53303 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
53304 }
53305 break;
53306 }
53307 case X86ISD::PSHUFHW:
53308 case X86ISD::PSHUFLW:
53309 case X86ISD::PSHUFD:
53310 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
53311 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
53312 return DAG.getNode(Op0.getOpcode(), DL, VT,
53313 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53314 }
53315 LLVM_FALLTHROUGH;
53316 case X86ISD::VPERMILPI:
53317 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
53318 Op0.getOperand(1) == Ops[1].getOperand(1)) {
53319 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
53320 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
53321 Op0.getOperand(1));
53322 return DAG.getBitcast(VT, Res);
53323 }
53324 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
53325 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
53326 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
53327 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
53328 return DAG.getNode(Op0.getOpcode(), DL, VT,
53329 ConcatSubOperand(VT, Ops, 0),
53330 DAG.getTargetConstant(Idx, DL, MVT::i8));
53331 }
53332 break;
53333 case X86ISD::PSHUFB:
53334 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
53335 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
53336 return DAG.getNode(Op0.getOpcode(), DL, VT,
53337 ConcatSubOperand(VT, Ops, 0),
53338 ConcatSubOperand(VT, Ops, 1));
53339 }
53340 break;
53341 case X86ISD::VPERMV3:
53342 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
53343 MVT OpVT = Op0.getSimpleValueType();
53344 int NumSrcElts = OpVT.getVectorNumElements();
53345 SmallVector<int, 64> ConcatMask;
53346 for (unsigned i = 0; i != NumOps; ++i) {
53347 SmallVector<int, 64> SubMask;
53348 SmallVector<SDValue, 2> SubOps;
53349 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
53350 SubMask))
53351 break;
53352 for (int M : SubMask) {
53353 if (0 <= M) {
53354 M += M < NumSrcElts ? 0 : NumSrcElts;
53355 M += i * NumSrcElts;
53356 }
53357 ConcatMask.push_back(M);
53358 }
53359 }
53360 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
53361 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
53362 Ops[1].getOperand(0), DAG, DL);
53363 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
53364 Ops[1].getOperand(2), DAG, DL);
53365 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
53366 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
53367 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
53368 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
53369 }
53370 }
53371 break;
53372 case X86ISD::VSHLI:
53373 case X86ISD::VSRLI:
53374 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
53375 // TODO: Move this to LowerShiftByScalarImmediate?
53376 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
53377 llvm::all_of(Ops, [](SDValue Op) {
53378 return Op.getConstantOperandAPInt(1) == 32;
53379 })) {
53380 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
53381 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
53382 if (Op0.getOpcode() == X86ISD::VSHLI) {
53383 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
53384 {8, 0, 8, 2, 8, 4, 8, 6});
53385 } else {
53386 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
53387 {1, 8, 3, 8, 5, 8, 7, 8});
53388 }
53389 return DAG.getBitcast(VT, Res);
53390 }
53391 LLVM_FALLTHROUGH;
53392 case X86ISD::VSRAI:
53393 case X86ISD::VSHL:
53394 case X86ISD::VSRL:
53395 case X86ISD::VSRA:
53396 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
53397 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
53398 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
53399 llvm::all_of(Ops, [Op0](SDValue Op) {
53400 return Op0.getOperand(1) == Op.getOperand(1);
53401 })) {
53402 return DAG.getNode(Op0.getOpcode(), DL, VT,
53403 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53404 }
53405 break;
53406 case X86ISD::VPERMI:
53407 case X86ISD::VROTLI:
53408 case X86ISD::VROTRI:
53409 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
53410 llvm::all_of(Ops, [Op0](SDValue Op) {
53411 return Op0.getOperand(1) == Op.getOperand(1);
53412 })) {
53413 return DAG.getNode(Op0.getOpcode(), DL, VT,
53414 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53415 }
53416 break;
53417 case ISD::AND:
53418 case ISD::OR:
53419 case ISD::XOR:
53420 case X86ISD::ANDNP:
53421 // TODO: Add 256-bit support.
53422 if (!IsSplat && VT.is512BitVector()) {
53423 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
53424 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
53425 NumOps * SrcVT.getVectorNumElements());
53426 return DAG.getNode(Op0.getOpcode(), DL, VT,
53427 ConcatSubOperand(SrcVT, Ops, 0),
53428 ConcatSubOperand(SrcVT, Ops, 1));
53429 }
53430 break;
53431 case X86ISD::HADD:
53432 case X86ISD::HSUB:
53433 case X86ISD::FHADD:
53434 case X86ISD::FHSUB:
53435 case X86ISD::PACKSS:
53436 case X86ISD::PACKUS:
53437 if (!IsSplat && VT.is256BitVector() &&
53438 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
53439 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
53440 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
53441 NumOps * SrcVT.getVectorNumElements());
53442 return DAG.getNode(Op0.getOpcode(), DL, VT,
53443 ConcatSubOperand(SrcVT, Ops, 0),
53444 ConcatSubOperand(SrcVT, Ops, 1));
53445 }
53446 break;
53447 case X86ISD::PALIGNR:
53448 if (!IsSplat &&
53449 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
53450 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
53451 llvm::all_of(Ops, [Op0](SDValue Op) {
53452 return Op0.getOperand(2) == Op.getOperand(2);
53453 })) {
53454 return DAG.getNode(Op0.getOpcode(), DL, VT,
53455 ConcatSubOperand(VT, Ops, 0),
53456 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
53457 }
53458 break;
53459 }
53460 }
53461
53462 // Fold subvector loads into one.
53463 // If needed, look through bitcasts to get to the load.
53464 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
53465 bool Fast;
53466 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
53467 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53468 *FirstLd->getMemOperand(), &Fast) &&
53469 Fast) {
53470 if (SDValue Ld =
53471 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
53472 return Ld;
53473 }
53474 }
53475
53476 // Attempt to fold target constant loads.
53477 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
53478 SmallVector<APInt> EltBits;
53479 APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
53480 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
53481 APInt OpUndefElts;
53482 SmallVector<APInt> OpEltBits;
53483 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
53484 OpEltBits, true, false))
53485 break;
53486 EltBits.append(OpEltBits);
53487 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
53488 }
53489 if (EltBits.size() == VT.getVectorNumElements())
53490 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
53491 }
53492
53493 return SDValue();
53494}
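
// A minimal standalone sketch (plain C++, not the LLVM APInt API and not part
// of X86ISelLowering.cpp) of the constant-fold step above: each concatenated
// operand contributes its per-element constant bits, and its undef mask is
// spliced in at the matching element offset before the combined constant
// vector is materialized.
#include <cstdint>
#include <vector>

struct ConstVec {
  std::vector<uint64_t> EltBits; // one constant value per element
  std::vector<bool> UndefElts;   // true where the element is undef
};

static ConstVec concatConstantOps(const std::vector<ConstVec> &Ops) {
  ConstVec Out;
  for (const ConstVec &Op : Ops) {
    Out.EltBits.insert(Out.EltBits.end(), Op.EltBits.begin(), Op.EltBits.end());
    Out.UndefElts.insert(Out.UndefElts.end(), Op.UndefElts.begin(),
                         Op.UndefElts.end());
  }
  return Out; // only used when every operand folded to constants
}
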
53495
53496static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
53497 TargetLowering::DAGCombinerInfo &DCI,
53498 const X86Subtarget &Subtarget) {
53499 EVT VT = N->getValueType(0);
53500 EVT SrcVT = N->getOperand(0).getValueType();
53501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53502
53503 // Don't do anything for i1 vectors.
53504 if (VT.getVectorElementType() == MVT::i1)
53505 return SDValue();
53506
53507 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
53508 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
53509 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
53510 DCI, Subtarget))
53511 return R;
53512 }
53513
53514 return SDValue();
53515}
53516
53517static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
53518 TargetLowering::DAGCombinerInfo &DCI,
53519 const X86Subtarget &Subtarget) {
53520 if (DCI.isBeforeLegalizeOps())
53521 return SDValue();
53522
53523 MVT OpVT = N->getSimpleValueType(0);
53524
53525 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
53526
53527 SDLoc dl(N);
53528 SDValue Vec = N->getOperand(0);
53529 SDValue SubVec = N->getOperand(1);
53530
53531 uint64_t IdxVal = N->getConstantOperandVal(2);
53532 MVT SubVecVT = SubVec.getSimpleValueType();
53533
53534 if (Vec.isUndef() && SubVec.isUndef())
53535 return DAG.getUNDEF(OpVT);
53536
53537 // Inserting undefs/zeros into zeros/undefs is a zero vector.
53538 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
53539 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
53540 return getZeroVector(OpVT, Subtarget, DAG, dl);
53541
53542 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
53543 // If we're inserting into a zero vector and then into a larger zero vector,
53544 // just insert into the larger zero vector directly.
53545 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
53546 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
53547 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
53548 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53549 getZeroVector(OpVT, Subtarget, DAG, dl),
53550 SubVec.getOperand(1),
53551 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
53552 }
53553
53554 // If we're inserting into a zero vector and our input was extracted from an
53555 // insert into a zero vector of the same type and the extraction was at
53556 // least as large as the original insertion, just insert the original
53557 // subvector into a zero vector.
53558 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
53559 isNullConstant(SubVec.getOperand(1)) &&
53560 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
53561 SDValue Ins = SubVec.getOperand(0);
53562 if (isNullConstant(Ins.getOperand(2)) &&
53563 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
53564 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
53565 SubVecVT.getFixedSizeInBits())
53566 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53567 getZeroVector(OpVT, Subtarget, DAG, dl),
53568 Ins.getOperand(1), N->getOperand(2));
53569 }
53570 }
53571
53572 // Stop here if this is an i1 vector.
53573 if (IsI1Vector)
53574 return SDValue();
53575
53576 // If this is an insert of an extract, combine to a shuffle. Don't do this
53577 // if the insert or extract can be represented with a subregister operation.
53578 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53579 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
53580 (IdxVal != 0 ||
53581 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
53582 int ExtIdxVal = SubVec.getConstantOperandVal(1);
53583 if (ExtIdxVal != 0) {
53584 int VecNumElts = OpVT.getVectorNumElements();
53585 int SubVecNumElts = SubVecVT.getVectorNumElements();
53586 SmallVector<int, 64> Mask(VecNumElts);
53587 // First create an identity shuffle mask.
53588 for (int i = 0; i != VecNumElts; ++i)
53589 Mask[i] = i;
53590 // Now insert the extracted portion.
53591 for (int i = 0; i != SubVecNumElts; ++i)
53592 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
53593
53594 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
53595 }
53596 }
53597
53598 // Match concat_vector style patterns.
53599 SmallVector<SDValue, 2> SubVectorOps;
53600 if (collectConcatOps(N, SubVectorOps)) {
53601 if (SDValue Fold =
53602 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
53603 return Fold;
53604
53605 // If we're inserting all zeros into the upper half, change this to
53606 // a concat with zero. We will match this to a move
53607 // with implicit upper bit zeroing during isel.
53608 // We do this here because we don't want combineConcatVectorOps to
53609 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
53610 if (SubVectorOps.size() == 2 &&
53611 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
53612 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53613 getZeroVector(OpVT, Subtarget, DAG, dl),
53614 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
53615 }
53616
53617 // If this is a broadcast insert into an upper undef, use a larger broadcast.
53618 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
53619 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
53620
53621 // If this is a broadcast load inserted into an upper undef, use a larger
53622 // broadcast load.
53623 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
53624 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
53625 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
53626 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
53627 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
53628 SDValue BcastLd =
53629 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
53630 MemIntr->getMemoryVT(),
53631 MemIntr->getMemOperand());
53632 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
53633 return BcastLd;
53634 }
53635
53636 // If we're splatting the lower half subvector of a full vector load into the
53637 // upper half, attempt to create a subvector broadcast.
53638 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
53639 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
53640 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
53641 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
53642 if (VecLd && SubLd &&
53643 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
53644 SubVec.getValueSizeInBits() / 8, 0))
53645 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
53646 SubLd, 0, DAG);
53647 }
53648
53649 return SDValue();
53650}
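
// A standalone sketch (plain C++, not SelectionDAG, not part of the file) of
// the shuffle-mask construction used for the insert-of-extract case above:
// start from an identity mask over the destination, then overlay the extracted
// lanes, which come from the second shuffle input and are therefore offset by
// VecNumElts.
#include <vector>

static std::vector<int> buildInsertExtractMask(int VecNumElts, int SubVecNumElts,
                                               int IdxVal, int ExtIdxVal) {
  std::vector<int> Mask(VecNumElts);
  for (int i = 0; i != VecNumElts; ++i)
    Mask[i] = i;                                   // keep Vec's own lanes
  for (int i = 0; i != SubVecNumElts; ++i)
    Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; // pull lanes from SubVec's source
  return Mask;
}
// E.g. VecNumElts=8, SubVecNumElts=4, IdxVal=4, ExtIdxVal=4 gives
// {0,1,2,3,12,13,14,15}: the low half stays, the high half is taken from the
// upper 4 lanes of the second input.
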
53651
53652/// If we are extracting a subvector of a vector select and the select condition
53653/// is composed of concatenated vectors, try to narrow the select width. This
53654/// is a common pattern for AVX1 integer code because 256-bit selects may be
53655/// legal, but there is almost no integer math/logic available for 256-bit.
53656/// This function should only be called with legal types (otherwise, the calls
53657/// to get simple value types will assert).
53658static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
53659 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
53660 SmallVector<SDValue, 4> CatOps;
53661 if (Sel.getOpcode() != ISD::VSELECT ||
53662 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
53663 return SDValue();
53664
53665 // Note: We assume simple value types because this should only be called with
53666 // legal operations/types.
53667 // TODO: This can be extended to handle extraction to 256-bits.
53668 MVT VT = Ext->getSimpleValueType(0);
53669 if (!VT.is128BitVector())
53670 return SDValue();
53671
53672 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
53673 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
53674 return SDValue();
53675
53676 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
53677 MVT SelVT = Sel.getSimpleValueType();
53678 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
53679        "Unexpected vector type with legal operations");
53680
53681 unsigned SelElts = SelVT.getVectorNumElements();
53682 unsigned CastedElts = WideVT.getVectorNumElements();
53683 unsigned ExtIdx = Ext->getConstantOperandVal(1);
53684 if (SelElts % CastedElts == 0) {
53685 // The select has the same or more (narrower) elements than the extract
53686 // operand. The extraction index gets scaled by that factor.
53687 ExtIdx *= (SelElts / CastedElts);
53688 } else if (CastedElts % SelElts == 0) {
53689 // The select has fewer (wider) elements than the extract operand. Make sure
53690 // that the extraction index can be divided evenly.
53691 unsigned IndexDivisor = CastedElts / SelElts;
53692 if (ExtIdx % IndexDivisor != 0)
53693 return SDValue();
53694 ExtIdx /= IndexDivisor;
53695 } else {
53696 llvm_unreachable("Element count of simple vector types are not divisible?");
53697 }
53698
53699 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
53700 unsigned NarrowElts = SelElts / NarrowingFactor;
53701 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
53702 SDLoc DL(Ext);
53703 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
53704 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
53705 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
53706 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
53707 return DAG.getBitcast(VT, NarrowSel);
53708}
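
// A sketch (not part of the file) of the extract-index rescaling performed
// above, assuming -- as the llvm_unreachable asserts -- that one element count
// always divides the other for legal simple types. Returns -1 when the index
// does not divide evenly.
static int rescaleExtractIndex(unsigned SelElts, unsigned CastedElts,
                               unsigned ExtIdx) {
  if (SelElts % CastedElts == 0)                // select elts narrower or equal
    return static_cast<int>(ExtIdx * (SelElts / CastedElts));
  unsigned IndexDivisor = CastedElts / SelElts; // select elts are wider
  if (ExtIdx % IndexDivisor != 0)
    return -1;                                  // index lands mid-element
  return static_cast<int>(ExtIdx / IndexDivisor);
}
// E.g. extracting the v2i64 subvector at index 2 from a v8i64 bitcast of a
// v16i32 select: SelElts=16, CastedElts=8, so ExtIdx scales from 2 to 4 (the
// same 128-bit offset, measured in i32 elements).
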
53709
53710static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
53711 TargetLowering::DAGCombinerInfo &DCI,
53712 const X86Subtarget &Subtarget) {
53713 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
53714 // eventually get combined/lowered into ANDNP) with a concatenated operand,
53715 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
53716 // We let generic combining take over from there to simplify the
53717 // insert/extract and 'not'.
53718 // This pattern emerges during AVX1 legalization. We handle it before lowering
53719 // to avoid complications like splitting constant vector loads.
53720
53721 // Capture the original wide type in the likely case that we need to bitcast
53722 // back to this type.
53723 if (!N->getValueType(0).isSimple())
53724 return SDValue();
53725
53726 MVT VT = N->getSimpleValueType(0);
53727 SDValue InVec = N->getOperand(0);
53728 unsigned IdxVal = N->getConstantOperandVal(1);
53729 SDValue InVecBC = peekThroughBitcasts(InVec);
53730 EVT InVecVT = InVec.getValueType();
53731 unsigned SizeInBits = VT.getSizeInBits();
53732 unsigned InSizeInBits = InVecVT.getSizeInBits();
53733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53734
53735 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
53736 TLI.isTypeLegal(InVecVT) &&
53737 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
53738 auto isConcatenatedNot = [](SDValue V) {
53739 V = peekThroughBitcasts(V);
53740 if (!isBitwiseNot(V))
53741 return false;
53742 SDValue NotOp = V->getOperand(0);
53743 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
53744 };
53745 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
53746 isConcatenatedNot(InVecBC.getOperand(1))) {
53747 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
53748 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
53749 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
53750 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
53751 }
53752 }
53753
53754 if (DCI.isBeforeLegalizeOps())
53755 return SDValue();
53756
53757 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
53758 return V;
53759
53760 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
53761 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53762
53763 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
53764 if (VT.getScalarType() == MVT::i1)
53765 return DAG.getConstant(1, SDLoc(N), VT);
53766 return getOnesVector(VT, DAG, SDLoc(N));
53767 }
53768
53769 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
53770 return DAG.getBuildVector(
53771 VT, SDLoc(N),
53772 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
53773
53774 // If we are extracting from an insert into a larger vector, replace with a
53775 // smaller insert if we don't access less than the original subvector. Don't
53776 // do this for i1 vectors.
53777 // TODO: Relax the matching indices requirement?
53778 if (VT.getVectorElementType() != MVT::i1 &&
53779 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
53780 IdxVal == InVec.getConstantOperandVal(2) &&
53781 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
53782 SDLoc DL(N);
53783 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
53784 InVec.getOperand(0), N->getOperand(1));
53785 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
53786 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
53787 InVec.getOperand(1),
53788 DAG.getVectorIdxConstant(NewIdxVal, DL));
53789 }
53790
53791 // If we're extracting an upper subvector from a broadcast we should just
53792 // extract the lowest subvector instead which should allow
53793 // SimplifyDemandedVectorElts to do more simplifications.
53794 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
53795 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
53796 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
53797 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53798
53799 // If we're extracting a broadcasted subvector, just use the lowest subvector.
53800 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53801 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
53802 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53803
53804 // Attempt to extract from the source of a shuffle vector.
53805 if ((InSizeInBits % SizeInBits) == 0 &&
53806 (IdxVal % VT.getVectorNumElements()) == 0) {
53807 SmallVector<int, 32> ShuffleMask;
53808 SmallVector<int, 32> ScaledMask;
53809 SmallVector<SDValue, 2> ShuffleInputs;
53810 unsigned NumSubVecs = InSizeInBits / SizeInBits;
53811 // Decode the shuffle mask and scale it so it's shuffling subvectors.
53812 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
53813 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
53814 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
53815 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
53816 return DAG.getUNDEF(VT);
53817 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
53818 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53819 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
53820 if (Src.getValueSizeInBits() == InSizeInBits) {
53821 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
53822 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
53823 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
53824 SDLoc(N), SizeInBits);
53825 }
53826 }
53827 }
53828
53829 // If we're extracting the lowest subvector and we're the only user,
53830 // we may be able to perform this with a smaller vector width.
53831 unsigned InOpcode = InVec.getOpcode();
53832 if (InVec.hasOneUse()) {
53833 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
53834 // v2f64 CVTDQ2PD(v4i32).
53835 if (InOpcode == ISD::SINT_TO_FP &&
53836 InVec.getOperand(0).getValueType() == MVT::v4i32) {
53837 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
53838 }
53839 // v2f64 CVTUDQ2PD(v4i32).
53840 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
53841 InVec.getOperand(0).getValueType() == MVT::v4i32) {
53842 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
53843 }
53844 // v2f64 CVTPS2PD(v4f32).
53845 if (InOpcode == ISD::FP_EXTEND &&
53846 InVec.getOperand(0).getValueType() == MVT::v4f32) {
53847 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
53848 }
53849 }
53850 if (IdxVal == 0 &&
53851 (InOpcode == ISD::ANY_EXTEND ||
53852 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
53853 InOpcode == ISD::ZERO_EXTEND ||
53854 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
53855 InOpcode == ISD::SIGN_EXTEND ||
53856 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
53857 (SizeInBits == 128 || SizeInBits == 256) &&
53858 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
53859 SDLoc DL(N);
53860 SDValue Ext = InVec.getOperand(0);
53861 if (Ext.getValueSizeInBits() > SizeInBits)
53862 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
53863 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
53864 return DAG.getNode(ExtOp, DL, VT, Ext);
53865 }
53866 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
53867 InVec.getOperand(0).getValueType().is256BitVector() &&
53868 InVec.getOperand(1).getValueType().is256BitVector() &&
53869 InVec.getOperand(2).getValueType().is256BitVector()) {
53870 SDLoc DL(N);
53871 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
53872 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
53873 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
53874 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
53875 }
53876 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
53877 (VT.is128BitVector() || VT.is256BitVector())) {
53878 SDLoc DL(N);
53879 SDValue InVecSrc = InVec.getOperand(0);
53880 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
53881 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
53882 return DAG.getNode(InOpcode, DL, VT, Ext);
53883 }
53884 if (InOpcode == X86ISD::MOVDDUP &&
53885 (VT.is128BitVector() || VT.is256BitVector())) {
53886 SDLoc DL(N);
53887 SDValue Ext0 =
53888 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
53889 return DAG.getNode(InOpcode, DL, VT, Ext0);
53890 }
53891 }
53892
53893 // Always split vXi64 logical shifts where we're extracting the upper 32 bits
53894 // as this is very likely to fold into a shuffle/truncation.
53895 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
53896 InVecVT.getScalarSizeInBits() == 64 &&
53897 InVec.getConstantOperandAPInt(1) == 32) {
53898 SDLoc DL(N);
53899 SDValue Ext =
53900 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
53901 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
53902 }
53903
53904 return SDValue();
53905}
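
// A sketch (plain C++, not SelectionDAG, not part of the file) of the
// subvector bookkeeping in the "extract from a shuffle source" step above:
// once the mask is scaled so each entry describes a whole subvector, the
// demanded entry encodes which shuffle input to read (value / NumSubVecs) and
// which subvector of that input (value % NumSubVecs); negative entries model
// the undef/zero sentinels.
#include <vector>

struct SubVecRef {
  int Input;     // index into the shuffle inputs, or -1 for undef/zero
  int SubVecIdx; // subvector index within that input
};

static SubVecRef resolveExtractedSubvector(const std::vector<int> &ScaledMask,
                                           int NumSubVecs, int IdxVal,
                                           int NumEltsPerSubVec) {
  int SubVecIdx = IdxVal / NumEltsPerSubVec;
  int M = ScaledMask[SubVecIdx];
  if (M < 0)
    return {-1, -1};                       // whole subvector is undef or zero
  return {M / NumSubVecs, M % NumSubVecs}; // (input, subvector within input)
}
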
53906
53907static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
53908 EVT VT = N->getValueType(0);
53909 SDValue Src = N->getOperand(0);
53910 SDLoc DL(N);
53911
53912 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
53913 // This occurs frequently in our masked scalar intrinsic code and our
53914 // floating point select lowering with AVX512.
53915 // TODO: SimplifyDemandedBits instead?
53916 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
53917 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53918 if (C->getAPIntValue().isOne())
53919 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
53920 Src.getOperand(0));
53921
53922 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
53923 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53924 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
53925 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
53926 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53927 if (C->isZero())
53928 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
53929 Src.getOperand(1));
53930
53931 // Reduce v2i64 to v4i32 if we don't need the upper bits.
53932 // TODO: Move to DAGCombine/SimplifyDemandedBits?
53933 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
53934 auto IsAnyExt64 = [](SDValue Op) {
53935 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
53936 return SDValue();
53937 if (Op.getOpcode() == ISD::ANY_EXTEND &&
53938 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
53939 return Op.getOperand(0);
53940 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
53941 if (Ld->getExtensionType() == ISD::EXTLOAD &&
53942 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
53943 return Op;
53944 return SDValue();
53945 };
53946 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
53947 return DAG.getBitcast(
53948 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
53949 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
53950 }
53951
53952 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
53953 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
53954 Src.getOperand(0).getValueType() == MVT::x86mmx)
53955 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
53956
53957 // See if we're broadcasting the scalar value, in which case just reuse that.
53958 // Ensure the same SDValue from the SDNode use is being used.
53959 if (VT.getScalarType() == Src.getValueType())
53960 for (SDNode *User : Src->uses())
53961 if (User->getOpcode() == X86ISD::VBROADCAST &&
53962 Src == User->getOperand(0)) {
53963 unsigned SizeInBits = VT.getFixedSizeInBits();
53964 unsigned BroadcastSizeInBits =
53965 User->getValueSizeInBits(0).getFixedSize();
53966 if (BroadcastSizeInBits == SizeInBits)
53967 return SDValue(User, 0);
53968 if (BroadcastSizeInBits > SizeInBits)
53969 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
53970 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
53971 // coverage.
53972 }
53973
53974 return SDValue();
53975}
53976
53977// Simplify PMULDQ and PMULUDQ operations.
53978static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
53979 TargetLowering::DAGCombinerInfo &DCI,
53980 const X86Subtarget &Subtarget) {
53981 SDValue LHS = N->getOperand(0);
53982 SDValue RHS = N->getOperand(1);
53983
53984 // Canonicalize constant to RHS.
53985 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
53986 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
53987 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
53988
53989 // Multiply by zero.
53990 // Don't return RHS as it may contain UNDEFs.
53991 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
53992 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
53993
53994 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
53995 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53996 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
53997 return SDValue(N, 0);
53998
53999 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
54000 // convert it to any_extend_invec, due to the LegalOperations check, do the
54001 // conversion directly to a vector shuffle manually. This exposes combine
54002 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
54003 // combineX86ShufflesRecursively on SSE4.1 targets.
54004 // FIXME: This is basically a hack around several other issues related to
54005 // ANY_EXTEND_VECTOR_INREG.
54006 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
54007 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
54008 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
54009 LHS.getOperand(0).getValueType() == MVT::v4i32) {
54010 SDLoc dl(N);
54011 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
54012 LHS.getOperand(0), { 0, -1, 1, -1 });
54013 LHS = DAG.getBitcast(MVT::v2i64, LHS);
54014 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
54015 }
54016 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
54017 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
54018 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
54019 RHS.getOperand(0).getValueType() == MVT::v4i32) {
54020 SDLoc dl(N);
54021 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
54022 RHS.getOperand(0), { 0, -1, 1, -1 });
54023 RHS = DAG.getBitcast(MVT::v2i64, RHS);
54024 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
54025 }
54026
54027 return SDValue();
54028}
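
// Scalar model of one PMULUDQ lane (a sketch, not the production lowering):
// each 64-bit result lane is the unsigned product of the *low 32 bits* of the
// two source lanes. That is why the combine above can replace a
// zero/sign-extend-in-reg of a v4i32 source with the cheaper interleave
// shuffle {0,-1,1,-1}: the high half of every shuffled lane is never read.
#include <cstdint>

static uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return static_cast<uint64_t>(static_cast<uint32_t>(A)) *
         static_cast<uint64_t>(static_cast<uint32_t>(B));
}
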
54029
54030// Simplify VPMADDUBSW/VPMADDWD operations.
54031static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
54032 TargetLowering::DAGCombinerInfo &DCI) {
54033 EVT VT = N->getValueType(0);
54034 SDValue LHS = N->getOperand(0);
54035 SDValue RHS = N->getOperand(1);
54036
54037 // Multiply by zero.
54038 // Don't return LHS/RHS as it may contain UNDEFs.
54039 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
54040 ISD::isBuildVectorAllZeros(RHS.getNode()))
54041 return DAG.getConstant(0, SDLoc(N), VT);
54042
54043 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54044 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54045 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54046 return SDValue(N, 0);
54047
54048 return SDValue();
54049}
54050
54051static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
54052 TargetLowering::DAGCombinerInfo &DCI,
54053 const X86Subtarget &Subtarget) {
54054 EVT VT = N->getValueType(0);
54055 SDValue In = N->getOperand(0);
54056 unsigned Opcode = N->getOpcode();
54057 unsigned InOpcode = In.getOpcode();
54058 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54059 SDLoc DL(N);
54060
54061 // Try to merge vector loads and extend_inreg to an extload.
54062 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
54063 In.hasOneUse()) {
54064 auto *Ld = cast<LoadSDNode>(In);
54065 if (Ld->isSimple()) {
54066 MVT SVT = In.getSimpleValueType().getVectorElementType();
54067 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
54068 ? ISD::SEXTLOAD
54069 : ISD::ZEXTLOAD;
54070 EVT MemVT = VT.changeVectorElementType(SVT);
54071 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
54072 SDValue Load = DAG.getExtLoad(
54073 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
54074 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
54075 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
54076 return Load;
54077 }
54078 }
54079 }
54080
54081 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
54082 if (Opcode == InOpcode)
54083 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
54084
54085 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
54086 // -> EXTEND_VECTOR_INREG(X).
54087 // TODO: Handle non-zero subvector indices.
54088 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
54089 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
54090 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
54091 In.getValueSizeInBits())
54092 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
54093
54094 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
54095 // TODO: Move to DAGCombine?
54096 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
54097 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
54098 In.getValueSizeInBits() == VT.getSizeInBits()) {
54099 unsigned NumElts = VT.getVectorNumElements();
54100 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
54101 EVT EltVT = In.getOperand(0).getValueType();
54102 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
54103 for (unsigned I = 0; I != NumElts; ++I)
54104 Elts[I * Scale] = In.getOperand(I);
54105 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
54106 }
54107
54108 // Attempt to combine as a shuffle.
54109 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
54110 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
54111 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
54112 SDValue Op(N, 0);
54113 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
54114 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54115 return Res;
54116 }
54117
54118 return SDValue();
54119}
54120
54121static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
54122 TargetLowering::DAGCombinerInfo &DCI) {
54123 EVT VT = N->getValueType(0);
54124
54125 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
54126 return DAG.getConstant(0, SDLoc(N), VT);
54127
54128 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54129 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54130 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54131 return SDValue(N, 0);
54132
54133 return SDValue();
54134}
54135
54136// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
54137 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
54138 // extra instructions between the conversions due to going to scalar and back.
54139static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
54140 const X86Subtarget &Subtarget) {
54141 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
54142 return SDValue();
54143
54144 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
54145 return SDValue();
54146
54147 if (N->getValueType(0) != MVT::f32 ||
54148 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
54149 return SDValue();
54150
54151 SDLoc dl(N);
54152 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
54153 N->getOperand(0).getOperand(0));
54154 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
54155 DAG.getTargetConstant(4, dl, MVT::i32));
54156 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
54157 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
54158 DAG.getIntPtrConstant(0, dl));
54159}
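
// A hypothetical source-level pattern that produces FP16_TO_FP(FP_TO_FP16 x),
// assuming Clang's storage-only __fp16 type on an F16C target without full
// FP16 support: truncate a float to half precision and immediately widen it
// back. The combine above lowers it as a VCVTPS2PH/VCVTPH2PS pair instead of
// bouncing through a scalar i16.
static float roundViaHalf(float X) {
  return static_cast<float>(static_cast<__fp16>(X));
}
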
54160
54161static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
54162 const X86Subtarget &Subtarget) {
54163 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
54164 return SDValue();
54165
54166 if (Subtarget.hasFP16())
54167 return SDValue();
54168
54169 bool IsStrict = N->isStrictFPOpcode();
54170 EVT VT = N->getValueType(0);
54171 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54172 EVT SrcVT = Src.getValueType();
54173
54174 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
54175 return SDValue();
54176
54177 if (VT.getVectorElementType() != MVT::f32 &&
54178 VT.getVectorElementType() != MVT::f64)
54179 return SDValue();
54180
54181 unsigned NumElts = VT.getVectorNumElements();
54182 if (NumElts == 1 || !isPowerOf2_32(NumElts))
54183 return SDValue();
54184
54185 SDLoc dl(N);
54186
54187 // Convert the input to vXi16.
54188 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
54189 Src = DAG.getBitcast(IntVT, Src);
54190
54191 // Widen to at least 8 input elements.
54192 if (NumElts < 8) {
54193 unsigned NumConcats = 8 / NumElts;
54194 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
54195 : DAG.getConstant(0, dl, IntVT);
54196 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
54197 Ops[0] = Src;
54198 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
54199 }
54200
54201 // Destination is vXf32 with at least 4 elements.
54202 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
54203 std::max(4U, NumElts));
54204 SDValue Cvt, Chain;
54205 if (IsStrict) {
54206 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
54207 {N->getOperand(0), Src});
54208 Chain = Cvt.getValue(1);
54209 } else {
54210 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
54211 }
54212
54213 if (NumElts < 4) {
54214 assert(NumElts == 2 && "Unexpected size");
54215 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
54216 DAG.getIntPtrConstant(0, dl));
54217 }
54218
54219 if (IsStrict) {
54220 // Extend to the original VT if necessary.
54221 if (Cvt.getValueType() != VT) {
54222 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
54223 {Chain, Cvt});
54224 Chain = Cvt.getValue(1);
54225 }
54226 return DAG.getMergeValues({Cvt, Chain}, dl);
54227 }
54228
54229 // Extend to the original VT if necessary.
54230 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
54231}
54232
54233// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
54234// from. Limit this to cases where the loads have the same input chain and the
54235// output chains are unused. This avoids any memory ordering issues.
54236static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
54237 TargetLowering::DAGCombinerInfo &DCI) {
54238 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
54239         N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
54240        "Unknown broadcast load type");
54241
54242 // Only do this if the chain result is unused.
54243 if (N->hasAnyUseOfValue(1))
54244 return SDValue();
54245
54246 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
54247
54248 SDValue Ptr = MemIntrin->getBasePtr();
54249 SDValue Chain = MemIntrin->getChain();
54250 EVT VT = N->getSimpleValueType(0);
54251 EVT MemVT = MemIntrin->getMemoryVT();
54252
54253 // Look at other users of our base pointer and try to find a wider broadcast.
54254 // The input chain and the size of the memory VT must match.
54255 for (SDNode *User : Ptr->uses())
54256 if (User != N && User->getOpcode() == N->getOpcode() &&
54257 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
54258 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
54259 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
54260 MemVT.getSizeInBits() &&
54261 !User->hasAnyUseOfValue(1) &&
54262 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
54263 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
54264 VT.getSizeInBits());
54265 Extract = DAG.getBitcast(VT, Extract);
54266 return DCI.CombineTo(N, Extract, SDValue(User, 1));
54267 }
54268
54269 return SDValue();
54270}
54271
54272static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
54273 const X86Subtarget &Subtarget) {
54274 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
54275 return SDValue();
54276
54277 if (Subtarget.hasFP16())
54278 return SDValue();
54279
54280 EVT VT = N->getValueType(0);
54281 SDValue Src = N->getOperand(0);
54282 EVT SrcVT = Src.getValueType();
54283
54284 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
54285 SrcVT.getVectorElementType() != MVT::f32)
54286 return SDValue();
54287
54288 unsigned NumElts = VT.getVectorNumElements();
54289 if (NumElts == 1 || !isPowerOf2_32(NumElts))
54290 return SDValue();
54291
54292 SDLoc dl(N);
54293
54294 // Widen to at least 4 input elements.
54295 if (NumElts < 4)
54296 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
54297 DAG.getConstantFP(0.0, dl, SrcVT));
54298
54299 // Destination is vXi16 with at least 8 elements.
54300 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54301 std::max(8U, NumElts));
54302 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
54303 DAG.getTargetConstant(4, dl, MVT::i32));
54304
54305 // Extract down to real number of elements.
54306 if (NumElts < 8) {
54307 EVT IntVT = VT.changeVectorElementTypeToInteger();
54308 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
54309 DAG.getIntPtrConstant(0, dl));
54310 }
54311
54312 return DAG.getBitcast(VT, Cvt);
54313}
54314
54315static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
54316 SDValue Src = N->getOperand(0);
54317
54318 // Turn MOVDQ2Q+simple_load into an mmx load.
54319 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54320 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
54321
54322 if (LN->isSimple()) {
54323 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
54324 LN->getBasePtr(),
54325 LN->getPointerInfo(),
54326 LN->getOriginalAlign(),
54327 LN->getMemOperand()->getFlags());
54328 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
54329 return NewLd;
54330 }
54331 }
54332
54333 return SDValue();
54334}
54335
54336static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
54337 TargetLowering::DAGCombinerInfo &DCI) {
54338 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
54339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54340 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
54341 return SDValue(N, 0);
54342
54343 return SDValue();
54344}
54345
54346SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
54347 DAGCombinerInfo &DCI) const {
54348 SelectionDAG &DAG = DCI.DAG;
54349 switch (N->getOpcode()) {
54350 default: break;
54351 case ISD::SCALAR_TO_VECTOR:
54352 return combineScalarToVector(N, DAG);
54353 case ISD::EXTRACT_VECTOR_ELT:
54354 case X86ISD::PEXTRW:
54355 case X86ISD::PEXTRB:
54356 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
54357 case ISD::CONCAT_VECTORS:
54358 return combineConcatVectors(N, DAG, DCI, Subtarget);
54359 case ISD::INSERT_SUBVECTOR:
54360 return combineInsertSubvector(N, DAG, DCI, Subtarget);
54361 case ISD::EXTRACT_SUBVECTOR:
54362 return combineExtractSubvector(N, DAG, DCI, Subtarget);
54363 case ISD::VSELECT:
54364 case ISD::SELECT:
54365 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
54366 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
54367 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
54368 case X86ISD::CMP: return combineCMP(N, DAG);
54369 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
54370 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
54371 case X86ISD::ADD:
54372 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
54373 case X86ISD::SBB: return combineSBB(N, DAG);
54374 case X86ISD::ADC: return combineADC(N, DAG, DCI);
54375 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
54376 case ISD::SHL: return combineShiftLeft(N, DAG);
54377 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
54378 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
54379 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
54380 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
54381 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
54382 case X86ISD::BEXTR:
54383 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
54384 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
54385 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
54386 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
54387 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
54388 case X86ISD::VEXTRACT_STORE:
54389 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
54390 case ISD::SINT_TO_FP:
54391 case ISD::STRICT_SINT_TO_FP:
54392 return combineSIntToFP(N, DAG, DCI, Subtarget);
54393 case ISD::UINT_TO_FP:
54394 case ISD::STRICT_UINT_TO_FP:
54395 return combineUIntToFP(N, DAG, Subtarget);
54396 case ISD::FADD:
54397 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
54398 case X86ISD::VFCMULC:
54399 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
54400 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
54401 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
54402 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
54403 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
54404 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
54405 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
54406 case X86ISD::FXOR:
54407 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
54408 case X86ISD::FMIN:
54409 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
54410 case ISD::FMINNUM:
54411 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
54412 case X86ISD::CVTSI2P:
54413 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
54414 case X86ISD::CVTP2SI:
54415 case X86ISD::CVTP2UI:
54416 case X86ISD::STRICT_CVTTP2SI:
54417 case X86ISD::CVTTP2SI:
54418 case X86ISD::STRICT_CVTTP2UI:
54419 case X86ISD::CVTTP2UI:
54420 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
54421 case X86ISD::STRICT_CVTPH2PS:
54422 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
54423 case X86ISD::BT: return combineBT(N, DAG, DCI);
54424 case ISD::ANY_EXTEND:
54425 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
54426 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
54427 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
54428 case ISD::ANY_EXTEND_VECTOR_INREG:
54429 case ISD::SIGN_EXTEND_VECTOR_INREG:
54430 case ISD::ZERO_EXTEND_VECTOR_INREG:
54431 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
54432 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
54433 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
54434 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
54435 case X86ISD::PACKSS:
54436 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
54437 case X86ISD::HADD:
54438 case X86ISD::HSUB:
54439 case X86ISD::FHADD:
54440 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
54441 case X86ISD::VSHL:
54442 case X86ISD::VSRA:
54443 case X86ISD::VSRL:
54444 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
54445 case X86ISD::VSHLI:
54446 case X86ISD::VSRAI:
54447 case X86ISD::VSRLI:
54448 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
54449 case ISD::INSERT_VECTOR_ELT:
54450 case X86ISD::PINSRB:
54451 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
54452 case X86ISD::SHUFP: // Handle all target specific shuffles
54453 case X86ISD::INSERTPS:
54454 case X86ISD::EXTRQI:
54455 case X86ISD::INSERTQI:
54456 case X86ISD::VALIGN:
54457 case X86ISD::PALIGNR:
54458 case X86ISD::VSHLDQ:
54459 case X86ISD::VSRLDQ:
54460 case X86ISD::BLENDI:
54461 case X86ISD::UNPCKH:
54462 case X86ISD::UNPCKL:
54463 case X86ISD::MOVHLPS:
54464 case X86ISD::MOVLHPS:
54465 case X86ISD::PSHUFB:
54466 case X86ISD::PSHUFD:
54467 case X86ISD::PSHUFHW:
54468 case X86ISD::PSHUFLW:
54469 case X86ISD::MOVSHDUP:
54470 case X86ISD::MOVSLDUP:
54471 case X86ISD::MOVDDUP:
54472 case X86ISD::MOVSS:
54473 case X86ISD::MOVSD:
54474 case X86ISD::MOVSH:
54475 case X86ISD::VBROADCAST:
54476 case X86ISD::VPPERM:
54477 case X86ISD::VPERMI:
54478 case X86ISD::VPERMV:
54479 case X86ISD::VPERMV3:
54480 case X86ISD::VPERMIL2:
54481 case X86ISD::VPERMILPI:
54482 case X86ISD::VPERMILPV:
54483 case X86ISD::VPERM2X128:
54484 case X86ISD::SHUF128:
54485 case X86ISD::VZEXT_MOVL:
54486 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
54487 case X86ISD::FMADD_RND:
54488 case X86ISD::FMSUB:
54489 case X86ISD::STRICT_FMSUB:
54490 case X86ISD::FMSUB_RND:
54491 case X86ISD::FNMADD:
54492 case X86ISD::STRICT_FNMADD:
54493 case X86ISD::FNMADD_RND:
54494 case X86ISD::FNMSUB:
54495 case X86ISD::STRICT_FNMSUB:
54496 case X86ISD::FNMSUB_RND:
54497 case ISD::FMA:
54498 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
54499 case X86ISD::FMADDSUB_RND:
54500 case X86ISD::FMSUBADD_RND:
54501 case X86ISD::FMADDSUB:
54502 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
54503 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
54504 case X86ISD::MGATHER:
54505 case X86ISD::MSCATTER:
54506 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
54507 case ISD::MGATHER:
54508 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
54509 case X86ISD::PCMPEQ:
54510 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
54511 case X86ISD::PMULDQ:
54512 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
54513 case X86ISD::VPMADDUBSW:
54514 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
54515 case X86ISD::KSHIFTL:
54516 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
54517 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
54518 case ISD::STRICT_FP_EXTEND:
54519 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
54520 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
54521 case X86ISD::VBROADCAST_LOAD:
54522 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
54523 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
54524 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
54525 }
54526
54527 return SDValue();
54528}
54529
54530bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
54531 if (!isTypeLegal(VT))
54532 return false;
54533
54534 // There are no vXi8 shifts.
54535 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
54536 return false;
54537
54538 // TODO: Almost no 8-bit ops are desirable because they have no actual
54539 // size/speed advantages vs. 32-bit ops, but they do have a major
54540 // potential disadvantage by causing partial register stalls.
54541 //
54542 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
54543 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
54544 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
54545 // check for a constant operand to the multiply.
54546 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
54547 return false;
54548
54549 // i16 instruction encodings are longer and some i16 instructions are slow,
54550 // so those are not desirable.
54551 if (VT == MVT::i16) {
54552 switch (Opc) {
54553 default:
54554 break;
54555 case ISD::LOAD:
54556 case ISD::SIGN_EXTEND:
54557 case ISD::ZERO_EXTEND:
54558 case ISD::ANY_EXTEND:
54559 case ISD::SHL:
54560 case ISD::SRA:
54561 case ISD::SRL:
54562 case ISD::SUB:
54563 case ISD::ADD:
54564 case ISD::MUL:
54565 case ISD::AND:
54566 case ISD::OR:
54567 case ISD::XOR:
54568 return false;
54569 }
54570 }
54571
54572 // Any legal type not explicitly accounted for above here is desirable.
54573 return true;
54574}
54575
54576SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
54577 SDValue Value, SDValue Addr,
54578 SelectionDAG &DAG) const {
54579 const Module *M = DAG.getMachineFunction().getMMI().getModule();
54580 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
54581 if (IsCFProtectionSupported) {
54582 // In case control-flow branch protection is enabled, we need to add a
54583 // notrack prefix to the indirect branch.
54584 // In order to do that we create an NT_BRIND SDNode.
54585 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
54586 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
54587 }
54588
54589 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
54590}
54591
54592bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
54593 EVT VT = Op.getValueType();
54594 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
54595 isa<ConstantSDNode>(Op.getOperand(1));
54596
54597 // i16 is legal, but undesirable since i16 instruction encodings are longer
54598 // and some i16 instructions are slow.
54599 // 8-bit multiply-by-constant can usually be expanded to something cheaper
54600 // using LEA and/or other ALU ops.
54601 if (VT != MVT::i16 && !Is8BitMulByConstant)
54602 return false;
54603
54604 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
54605 if (!Op.hasOneUse())
54606 return false;
54607 SDNode *User = *Op->use_begin();
54608 if (!ISD::isNormalStore(User))
54609 return false;
54610 auto *Ld = cast<LoadSDNode>(Load);
54611 auto *St = cast<StoreSDNode>(User);
54612 return Ld->getBasePtr() == St->getBasePtr();
54613 };
54614
54615 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
54616 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
54617 return false;
54618 if (!Op.hasOneUse())
54619 return false;
54620 SDNode *User = *Op->use_begin();
54621 if (User->getOpcode() != ISD::ATOMIC_STORE)
54622 return false;
54623 auto *Ld = cast<AtomicSDNode>(Load);
54624 auto *St = cast<AtomicSDNode>(User);
54625 return Ld->getBasePtr() == St->getBasePtr();
54626 };
54627
54628 bool Commute = false;
54629 switch (Op.getOpcode()) {
54630 default: return false;
54631 case ISD::SIGN_EXTEND:
54632 case ISD::ZERO_EXTEND:
54633 case ISD::ANY_EXTEND:
54634 break;
54635 case ISD::SHL:
54636 case ISD::SRA:
54637 case ISD::SRL: {
54638 SDValue N0 = Op.getOperand(0);
54639 // Look out for (store (shl (load), x)).
54640 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
54641 return false;
54642 break;
54643 }
54644 case ISD::ADD:
54645 case ISD::MUL:
54646 case ISD::AND:
54647 case ISD::OR:
54648 case ISD::XOR:
54649 Commute = true;
54650 LLVM_FALLTHROUGH;
54651 case ISD::SUB: {
54652 SDValue N0 = Op.getOperand(0);
54653 SDValue N1 = Op.getOperand(1);
54654 // Avoid disabling potential load folding opportunities.
54655 if (X86::mayFoldLoad(N1, Subtarget) &&
54656 (!Commute || !isa<ConstantSDNode>(N0) ||
54657 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
54658 return false;
54659 if (X86::mayFoldLoad(N0, Subtarget) &&
54660 ((Commute && !isa<ConstantSDNode>(N1)) ||
54661 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
54662 return false;
54663 if (IsFoldableAtomicRMW(N0, Op) ||
54664 (Commute && IsFoldableAtomicRMW(N1, Op)))
54665 return false;
54666 }
54667 }
54668
54669 PVT = MVT::i32;
54670 return true;
54671}
54672
54673//===----------------------------------------------------------------------===//
54674// X86 Inline Assembly Support
54675//===----------------------------------------------------------------------===//
54676
54677 // Helper to match a string against a sequence of pieces separated by whitespace.
54678static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
54679 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
54680
54681 for (StringRef Piece : Pieces) {
54682 if (!S.startswith(Piece)) // Check if the piece matches.
54683 return false;
54684
54685 S = S.substr(Piece.size());
54686 StringRef::size_type Pos = S.find_first_not_of(" \t");
54687 if (Pos == 0) // We matched a prefix.
54688 return false;
54689
54690 S = S.substr(Pos);
54691 }
54692
54693 return S.empty();
54694}
54695
54696static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
54697
54698 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
54699 if (llvm::is_contained(AsmPieces, "~{cc}") &&
54700 llvm::is_contained(AsmPieces, "~{flags}") &&
54701 llvm::is_contained(AsmPieces, "~{fpsr}")) {
54702
54703 if (AsmPieces.size() == 3)
54704 return true;
54705 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
54706 return true;
54707 }
54708 }
54709 return false;
54710}
54711
54712bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
54713 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
54714
54715 const std::string &AsmStr = IA->getAsmString();
54716
54717 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
54718 if (!Ty || Ty->getBitWidth() % 16 != 0)
54719 return false;
54720
54721 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
54722 SmallVector<StringRef, 4> AsmPieces;
54723 SplitString(AsmStr, AsmPieces, ";\n");
54724
54725 switch (AsmPieces.size()) {
54726 default: return false;
54727 case 1:
54728 // FIXME: this should verify that we are targeting a 486 or better. If not,
54729 // we will turn this bswap into something that will be lowered to logical
54730 // ops instead of emitting the bswap asm. For now, we don't support 486 or
54731 // lower so don't worry about this.
54732 // bswap $0
54733 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
54734 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
54735 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
54736 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
54737 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
54738 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
54739 // No need to check constraints, nothing other than the equivalent of
54740 // "=r,0" would be valid here.
54741 return IntrinsicLowering::LowerToByteSwap(CI);
54742 }
54743
54744 // rorw $$8, ${0:w} --> llvm.bswap.i16
54745 if (CI->getType()->isIntegerTy(16) &&
54746 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54747 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
54748 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
54749 AsmPieces.clear();
54750 StringRef ConstraintsStr = IA->getConstraintString();
54751 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54752 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54753 if (clobbersFlagRegisters(AsmPieces))
54754 return IntrinsicLowering::LowerToByteSwap(CI);
54755 }
54756 break;
54757 case 3:
54758 if (CI->getType()->isIntegerTy(32) &&
54759 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54760 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
54761 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
54762 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
54763 AsmPieces.clear();
54764 StringRef ConstraintsStr = IA->getConstraintString();
54765 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54766 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54767 if (clobbersFlagRegisters(AsmPieces))
54768 return IntrinsicLowering::LowerToByteSwap(CI);
54769 }
54770
54771 if (CI->getType()->isIntegerTy(64)) {
54772 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
54773 if (Constraints.size() >= 2 &&
54774 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
54775 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
54776 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
54777 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
54778 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
54779 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
54780 return IntrinsicLowering::LowerToByteSwap(CI);
54781 }
54782 }
54783 break;
54784 }
54785 return false;
54786}
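
// Hypothetical user code that the hook above recognizes: a lone "bswap"
// inline-asm statement with "=r,0"-style constraints is replaced by
// llvm.bswap, so the optimizer can see through the asm block. (Clang rewrites
// "%0" to "$0" in the IR asm string, which is the form matchAsm compares.)
static unsigned swapBytes32(unsigned X) {
  __asm__("bswap %0" : "+r"(X));
  return X;
}
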
54787
54788static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
54789 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
54790 .Case("{@cca}", X86::COND_A)
54791 .Case("{@ccae}", X86::COND_AE)
54792 .Case("{@ccb}", X86::COND_B)
54793 .Case("{@ccbe}", X86::COND_BE)
54794 .Case("{@ccc}", X86::COND_B)
54795 .Case("{@cce}", X86::COND_E)
54796 .Case("{@ccz}", X86::COND_E)
54797 .Case("{@ccg}", X86::COND_G)
54798 .Case("{@ccge}", X86::COND_GE)
54799 .Case("{@ccl}", X86::COND_L)
54800 .Case("{@ccle}", X86::COND_LE)
54801 .Case("{@ccna}", X86::COND_BE)
54802 .Case("{@ccnae}", X86::COND_B)
54803 .Case("{@ccnb}", X86::COND_AE)
54804 .Case("{@ccnbe}", X86::COND_A)
54805 .Case("{@ccnc}", X86::COND_AE)
54806 .Case("{@ccne}", X86::COND_NE)
54807 .Case("{@ccnz}", X86::COND_NE)
54808 .Case("{@ccng}", X86::COND_LE)
54809 .Case("{@ccnge}", X86::COND_L)
54810 .Case("{@ccnl}", X86::COND_GE)
54811 .Case("{@ccnle}", X86::COND_G)
54812 .Case("{@ccno}", X86::COND_NO)
54813 .Case("{@ccnp}", X86::COND_NP)
54814 .Case("{@ccns}", X86::COND_NS)
54815 .Case("{@cco}", X86::COND_O)
54816 .Case("{@ccp}", X86::COND_P)
54817 .Case("{@ccs}", X86::COND_S)
54818 .Default(X86::COND_INVALID);
54819 return Cond;
54820}
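
// Illustrative note (not part of the original source): these "{@cc*}" strings
// are the flag-output operand constraints of GCC-style inline asm. A
// hypothetical use that reaches this parser as "{@ccb}" (COND_B):
//
//   int below;
//   __asm__("cmpq %2, %1" : "=@ccb"(below) : "r"(a), "r"(b));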
54821
54822/// Given a constraint letter, return the type of constraint for this target.
54823X86TargetLowering::ConstraintType
54824X86TargetLowering::getConstraintType(StringRef Constraint) const {
54825 if (Constraint.size() == 1) {
54826 switch (Constraint[0]) {
54827 case 'R':
54828 case 'q':
54829 case 'Q':
54830 case 'f':
54831 case 't':
54832 case 'u':
54833 case 'y':
54834 case 'x':
54835 case 'v':
54836 case 'l':
54837 case 'k': // AVX512 masking registers.
54838 return C_RegisterClass;
54839 case 'a':
54840 case 'b':
54841 case 'c':
54842 case 'd':
54843 case 'S':
54844 case 'D':
54845 case 'A':
54846 return C_Register;
54847 case 'I':
54848 case 'J':
54849 case 'K':
54850 case 'N':
54851 case 'G':
54852 case 'L':
54853 case 'M':
54854 return C_Immediate;
54855 case 'C':
54856 case 'e':
54857 case 'Z':
54858 return C_Other;
54859 default:
54860 break;
54861 }
54862 }
54863 else if (Constraint.size() == 2) {
54864 switch (Constraint[0]) {
54865 default:
54866 break;
54867 case 'Y':
54868 switch (Constraint[1]) {
54869 default:
54870 break;
54871 case 'z':
54872 return C_Register;
54873 case 'i':
54874 case 'm':
54875 case 'k':
54876 case 't':
54877 case '2':
54878 return C_RegisterClass;
54879 }
54880 }
54881 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
54882 return C_Other;
54883 return TargetLowering::getConstraintType(Constraint);
54884}
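
// Illustrative note (not part of the original source): examples of how the
// classification above applies to a few hypothetical constraints:
//   "x"      -> C_RegisterClass (any SSE/AVX register)
//   "a"      -> C_Register      (the fixed [ER]AX register)
//   "I"      -> C_Immediate     (small integer constant, 0..31)
//   "{@ccz}" -> C_Other         (flag output, via parseConstraintCode)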
54885
54886/// Examine constraint type and operand type and determine a weight value.
54887/// This object must already have been set up with the operand type
54888/// and the current alternative constraint selected.
54889TargetLowering::ConstraintWeight
54890 X86TargetLowering::getSingleConstraintMatchWeight(
54891 AsmOperandInfo &info, const char *constraint) const {
54892 ConstraintWeight weight = CW_Invalid;
54893 Value *CallOperandVal = info.CallOperandVal;
54894 // If we don't have a value, we can't do a match,
54895 // but allow it at the lowest weight.
54896 if (!CallOperandVal)
54897 return CW_Default;
54898 Type *type = CallOperandVal->getType();
54899 // Look at the constraint type.
54900 switch (*constraint) {
54901 default:
54902 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
54904     LLVM_FALLTHROUGH;
54904 case 'R':
54905 case 'q':
54906 case 'Q':
54907 case 'a':
54908 case 'b':
54909 case 'c':
54910 case 'd':
54911 case 'S':
54912 case 'D':
54913 case 'A':
54914 if (CallOperandVal->getType()->isIntegerTy())
54915 weight = CW_SpecificReg;
54916 break;
54917 case 'f':
54918 case 't':
54919 case 'u':
54920 if (type->isFloatingPointTy())
54921 weight = CW_SpecificReg;
54922 break;
54923 case 'y':
54924 if (type->isX86_MMXTy() && Subtarget.hasMMX())
54925 weight = CW_SpecificReg;
54926 break;
54927 case 'Y':
54928 if (StringRef(constraint).size() != 2)
54929 break;
54930 switch (constraint[1]) {
54931 default:
54932 return CW_Invalid;
54933 // XMM0
54934 case 'z':
54935 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54936 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
54937 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
54938 return CW_SpecificReg;
54939 return CW_Invalid;
54940 // Conditional OpMask regs (AVX512)
54941 case 'k':
54942 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54943 return CW_Register;
54944 return CW_Invalid;
54945 // Any MMX reg
54946 case 'm':
54947 if (type->isX86_MMXTy() && Subtarget.hasMMX())
54948 return weight;
54949 return CW_Invalid;
54950 // Any SSE reg when ISA >= SSE2, same as 'x'
54951 case 'i':
54952 case 't':
54953 case '2':
54954 if (!Subtarget.hasSSE2())
54955 return CW_Invalid;
54956 break;
54957 }
54958 break;
54959 case 'v':
54960 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
54961 weight = CW_Register;
54962     LLVM_FALLTHROUGH;
54963 case 'x':
54964 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54965 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
54966 weight = CW_Register;
54967 break;
54968 case 'k':
54969 // Enable conditional vector operations using %k<#> registers.
54970 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54971 weight = CW_Register;
54972 break;
54973 case 'I':
54974 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
54975 if (C->getZExtValue() <= 31)
54976 weight = CW_Constant;
54977 }
54978 break;
54979 case 'J':
54980 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54981 if (C->getZExtValue() <= 63)
54982 weight = CW_Constant;
54983 }
54984 break;
54985 case 'K':
54986 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54987 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
54988 weight = CW_Constant;
54989 }
54990 break;
54991 case 'L':
54992 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54993 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
54994 weight = CW_Constant;
54995 }
54996 break;
54997 case 'M':
54998 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54999 if (C->getZExtValue() <= 3)
55000 weight = CW_Constant;
55001 }
55002 break;
55003 case 'N':
55004 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55005 if (C->getZExtValue() <= 0xff)
55006 weight = CW_Constant;
55007 }
55008 break;
55009 case 'G':
55010 case 'C':
55011 if (isa<ConstantFP>(CallOperandVal)) {
55012 weight = CW_Constant;
55013 }
55014 break;
55015 case 'e':
55016 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55017 if ((C->getSExtValue() >= -0x80000000LL) &&
55018 (C->getSExtValue() <= 0x7fffffffLL))
55019 weight = CW_Constant;
55020 }
55021 break;
55022 case 'Z':
55023 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55024 if (C->getZExtValue() <= 0xffffffff)
55025 weight = CW_Constant;
55026 }
55027 break;
55028 }
55029 return weight;
55030}
55031
55032/// Try to replace an X constraint, which matches anything, with another that
55033/// has more specific requirements based on the type of the corresponding
55034/// operand.
55035const char *X86TargetLowering::
55036LowerXConstraint(EVT ConstraintVT) const {
55037 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
55038 // 'f' like normal targets.
55039 if (ConstraintVT.isFloatingPoint()) {
55040 if (Subtarget.hasSSE1())
55041 return "x";
55042 }
55043
55044 return TargetLowering::LowerXConstraint(ConstraintVT);
55045}
55046
55047// Lower @cc targets via setcc.
55048SDValue X86TargetLowering::LowerAsmOutputForConstraint(
55049 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
55050 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
55051 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
55052 if (Cond == X86::COND_INVALID)
55053 return SDValue();
55054 // Check that return type is valid.
55055 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
55056 OpInfo.ConstraintVT.getSizeInBits() < 8)
55057 report_fatal_error("Flag output operand is of invalid type");
55058
55059 // Get EFLAGS register. Only update chain when copyfrom is glued.
55060 if (Flag.getNode()) {
55061 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
55062 Chain = Flag.getValue(1);
55063 } else
55064 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
55065 // Extract CC code.
55066 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
55067 // Extend to 32-bits
55068 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
55069
55070 return Result;
55071}
55072
55073/// Lower the specified operand into the Ops vector.
55074/// If it is invalid, don't add anything to Ops.
55075void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
55076 std::string &Constraint,
55077 std::vector<SDValue>&Ops,
55078 SelectionDAG &DAG) const {
55079 SDValue Result;
55080
55081 // Only support length 1 constraints for now.
55082 if (Constraint.length() > 1) return;
55083
55084 char ConstraintLetter = Constraint[0];
55085 switch (ConstraintLetter) {
55086 default: break;
55087 case 'I':
55088 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55089 if (C->getZExtValue() <= 31) {
55090 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55091 Op.getValueType());
55092 break;
55093 }
55094 }
55095 return;
55096 case 'J':
55097 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55098 if (C->getZExtValue() <= 63) {
55099 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55100 Op.getValueType());
55101 break;
55102 }
55103 }
55104 return;
55105 case 'K':
55106 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55107 if (isInt<8>(C->getSExtValue())) {
55108 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55109 Op.getValueType());
55110 break;
55111 }
55112 }
55113 return;
55114 case 'L':
55115 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55116 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
55117 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
55118 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
55119 Op.getValueType());
55120 break;
55121 }
55122 }
55123 return;
55124 case 'M':
55125 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55126 if (C->getZExtValue() <= 3) {
55127 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55128 Op.getValueType());
55129 break;
55130 }
55131 }
55132 return;
55133 case 'N':
55134 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55135 if (C->getZExtValue() <= 255) {
55136 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55137 Op.getValueType());
55138 break;
55139 }
55140 }
55141 return;
55142 case 'O':
55143 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55144 if (C->getZExtValue() <= 127) {
55145 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55146 Op.getValueType());
55147 break;
55148 }
55149 }
55150 return;
55151 case 'e': {
55152 // 32-bit signed value
55153 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55154 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
55155 C->getSExtValue())) {
55156 // Widen to 64 bits here to get it sign extended.
55157 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
55158 break;
55159 }
55160 // FIXME gcc accepts some relocatable values here too, but only in certain
55161 // memory models; it's complicated.
55162 }
55163 return;
55164 }
55165 case 'Z': {
55166 // 32-bit unsigned value
55167 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55168 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
55169 C->getZExtValue())) {
55170 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55171 Op.getValueType());
55172 break;
55173 }
55174 }
55175 // FIXME gcc accepts some relocatable values here too, but only in certain
55176 // memory models; it's complicated.
55177 return;
55178 }
55179 case 'i': {
55180 // Literal immediates are always ok.
55181 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
55182 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
55183 BooleanContent BCont = getBooleanContents(MVT::i64);
55184 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
55185 : ISD::SIGN_EXTEND;
55186 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
55187 : CST->getSExtValue();
55188 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
55189 break;
55190 }
55191
55192 // In any sort of PIC mode addresses need to be computed at runtime by
55193 // adding in a register or some sort of table lookup. These can't
55194 // be used as immediates. BlockAddresses are fine though.
55195 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
55196 !isa<BlockAddressSDNode>(Op))
55197 return;
55198
55199 // If we are in non-pic codegen mode, we allow the address of a global (with
55200 // an optional displacement) to be used with 'i'.
55201 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
55202 // If we require an extra load to get this address, as in PIC mode, we
55203 // can't accept it.
55204 if (isGlobalStubReference(
55205 Subtarget.classifyGlobalReference(GA->getGlobal())))
55206 return;
55207 break;
55208 }
55209 }
55210
55211 if (Result.getNode()) {
55212 Ops.push_back(Result);
55213 return;
55214 }
55215 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
55216}
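
// Illustrative note (not part of the original source): the letters handled
// above are x86 immediate-range constraints. For example, a hypothetical
//
//   __asm__ volatile("outb %0, %1" : : "a"(value), "N"(0x80));
//
// reaches this hook with Constraint == "N"; the constant 0x80 (<= 0xff) is
// accepted and pushed as a target constant, while a constant above 0xff
// would hit the early return and add nothing to Ops (i.e. be rejected).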
55217
55218/// Check if \p RC is a general purpose register class.
55219/// I.e., GR* or one of their variant.
55220static bool isGRClass(const TargetRegisterClass &RC) {
55221 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
55222 RC.hasSuperClassEq(&X86::GR16RegClass) ||
55223 RC.hasSuperClassEq(&X86::GR32RegClass) ||
55224 RC.hasSuperClassEq(&X86::GR64RegClass) ||
55225 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
55226}
55227
55228/// Check if \p RC is a vector register class.
55229/// I.e., FR* / VR* or one of their variant.
55230static bool isFRClass(const TargetRegisterClass &RC) {
55231 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
55232 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
55233 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
55234 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
55235 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
55236 RC.hasSuperClassEq(&X86::VR512RegClass);
55237}
55238
55239/// Check if \p RC is a mask register class.
55240/// I.e., VK* or one of their variant.
55241static bool isVKClass(const TargetRegisterClass &RC) {
55242 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
55243 RC.hasSuperClassEq(&X86::VK2RegClass) ||
55244 RC.hasSuperClassEq(&X86::VK4RegClass) ||
55245 RC.hasSuperClassEq(&X86::VK8RegClass) ||
55246 RC.hasSuperClassEq(&X86::VK16RegClass) ||
55247 RC.hasSuperClassEq(&X86::VK32RegClass) ||
55248 RC.hasSuperClassEq(&X86::VK64RegClass);
55249}
55250
55251std::pair<unsigned, const TargetRegisterClass *>
55252X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
55253 StringRef Constraint,
55254 MVT VT) const {
55255 // First, see if this is a constraint that directly corresponds to an LLVM
55256 // register class.
55257 if (Constraint.size() == 1) {
55258 // GCC Constraint Letters
55259 switch (Constraint[0]) {
55260 default: break;
55261 // 'A' means [ER]AX + [ER]DX.
55262 case 'A':
55263 if (Subtarget.is64Bit())
55264 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
55265      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
55266             "Expecting 64, 32 or 16 bit subtarget");
55267 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
55268
55269 // TODO: Slight differences here in allocation order and leaving
55270 // RIP in the class. Do they matter any more here than they do
55271 // in the normal allocation?
55272 case 'k':
55273 if (Subtarget.hasAVX512()) {
55274 if (VT == MVT::i1)
55275 return std::make_pair(0U, &X86::VK1RegClass);
55276 if (VT == MVT::i8)
55277 return std::make_pair(0U, &X86::VK8RegClass);
55278 if (VT == MVT::i16)
55279 return std::make_pair(0U, &X86::VK16RegClass);
55280 }
55281 if (Subtarget.hasBWI()) {
55282 if (VT == MVT::i32)
55283 return std::make_pair(0U, &X86::VK32RegClass);
55284 if (VT == MVT::i64)
55285 return std::make_pair(0U, &X86::VK64RegClass);
55286 }
55287 break;
55288 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
55289 if (Subtarget.is64Bit()) {
55290 if (VT == MVT::i8 || VT == MVT::i1)
55291 return std::make_pair(0U, &X86::GR8RegClass);
55292 if (VT == MVT::i16)
55293 return std::make_pair(0U, &X86::GR16RegClass);
55294 if (VT == MVT::i32 || VT == MVT::f32)
55295 return std::make_pair(0U, &X86::GR32RegClass);
55296 if (VT != MVT::f80 && !VT.isVector())
55297 return std::make_pair(0U, &X86::GR64RegClass);
55298 break;
55299 }
55300       LLVM_FALLTHROUGH;
55301 // 32-bit fallthrough
55302 case 'Q': // Q_REGS
55303 if (VT == MVT::i8 || VT == MVT::i1)
55304 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
55305 if (VT == MVT::i16)
55306 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
55307 if (VT == MVT::i32 || VT == MVT::f32 ||
55308 (!VT.isVector() && !Subtarget.is64Bit()))
55309 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
55310 if (VT != MVT::f80 && !VT.isVector())
55311 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
55312 break;
55313 case 'r': // GENERAL_REGS
55314 case 'l': // INDEX_REGS
55315 if (VT == MVT::i8 || VT == MVT::i1)
55316 return std::make_pair(0U, &X86::GR8RegClass);
55317 if (VT == MVT::i16)
55318 return std::make_pair(0U, &X86::GR16RegClass);
55319 if (VT == MVT::i32 || VT == MVT::f32 ||
55320 (!VT.isVector() && !Subtarget.is64Bit()))
55321 return std::make_pair(0U, &X86::GR32RegClass);
55322 if (VT != MVT::f80 && !VT.isVector())
55323 return std::make_pair(0U, &X86::GR64RegClass);
55324 break;
55325 case 'R': // LEGACY_REGS
55326 if (VT == MVT::i8 || VT == MVT::i1)
55327 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
55328 if (VT == MVT::i16)
55329 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
55330 if (VT == MVT::i32 || VT == MVT::f32 ||
55331 (!VT.isVector() && !Subtarget.is64Bit()))
55332 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
55333 if (VT != MVT::f80 && !VT.isVector())
55334 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
55335 break;
55336 case 'f': // FP Stack registers.
55337 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
55338 // value to the correct fpstack register class.
55339 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
55340 return std::make_pair(0U, &X86::RFP32RegClass);
55341 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
55342 return std::make_pair(0U, &X86::RFP64RegClass);
55343 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
55344 return std::make_pair(0U, &X86::RFP80RegClass);
55345 break;
55346 case 'y': // MMX_REGS if MMX allowed.
55347 if (!Subtarget.hasMMX()) break;
55348 return std::make_pair(0U, &X86::VR64RegClass);
55349 case 'v':
55350 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
55351 if (!Subtarget.hasSSE1()) break;
55352 bool VConstraint = (Constraint[0] == 'v');
55353
55354 switch (VT.SimpleTy) {
55355 default: break;
55356 // Scalar SSE types.
55357 case MVT::f16:
55358 if (VConstraint && Subtarget.hasFP16())
55359 return std::make_pair(0U, &X86::FR16XRegClass);
55360 break;
55361 case MVT::f32:
55362 case MVT::i32:
55363 if (VConstraint && Subtarget.hasVLX())
55364 return std::make_pair(0U, &X86::FR32XRegClass);
55365 return std::make_pair(0U, &X86::FR32RegClass);
55366 case MVT::f64:
55367 case MVT::i64:
55368 if (VConstraint && Subtarget.hasVLX())
55369 return std::make_pair(0U, &X86::FR64XRegClass);
55370 return std::make_pair(0U, &X86::FR64RegClass);
55371 case MVT::i128:
55372 if (Subtarget.is64Bit()) {
55373 if (VConstraint && Subtarget.hasVLX())
55374 return std::make_pair(0U, &X86::VR128XRegClass);
55375 return std::make_pair(0U, &X86::VR128RegClass);
55376 }
55377 break;
55378 // Vector types and fp128.
55379 case MVT::v8f16:
55380 if (!Subtarget.hasFP16())
55381 break;
55382         LLVM_FALLTHROUGH;
55383 case MVT::f128:
55384 case MVT::v16i8:
55385 case MVT::v8i16:
55386 case MVT::v4i32:
55387 case MVT::v2i64:
55388 case MVT::v4f32:
55389 case MVT::v2f64:
55390 if (VConstraint && Subtarget.hasVLX())
55391 return std::make_pair(0U, &X86::VR128XRegClass);
55392 return std::make_pair(0U, &X86::VR128RegClass);
55393 // AVX types.
55394 case MVT::v16f16:
55395 if (!Subtarget.hasFP16())
55396 break;
55397         LLVM_FALLTHROUGH;
55398 case MVT::v32i8:
55399 case MVT::v16i16:
55400 case MVT::v8i32:
55401 case MVT::v4i64:
55402 case MVT::v8f32:
55403 case MVT::v4f64:
55404 if (VConstraint && Subtarget.hasVLX())
55405 return std::make_pair(0U, &X86::VR256XRegClass);
55406 if (Subtarget.hasAVX())
55407 return std::make_pair(0U, &X86::VR256RegClass);
55408 break;
55409 case MVT::v32f16:
55410 if (!Subtarget.hasFP16())
55411 break;
55412         LLVM_FALLTHROUGH;
55413 case MVT::v64i8:
55414 case MVT::v32i16:
55415 case MVT::v8f64:
55416 case MVT::v16f32:
55417 case MVT::v16i32:
55418 case MVT::v8i64:
55419 if (!Subtarget.hasAVX512()) break;
55420 if (VConstraint)
55421 return std::make_pair(0U, &X86::VR512RegClass);
55422 return std::make_pair(0U, &X86::VR512_0_15RegClass);
55423 }
55424 break;
55425 }
55426 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
55427 switch (Constraint[1]) {
55428 default:
55429 break;
55430 case 'i':
55431 case 't':
55432 case '2':
55433 return getRegForInlineAsmConstraint(TRI, "x", VT);
55434 case 'm':
55435 if (!Subtarget.hasMMX()) break;
55436 return std::make_pair(0U, &X86::VR64RegClass);
55437 case 'z':
55438 if (!Subtarget.hasSSE1()) break;
55439 switch (VT.SimpleTy) {
55440 default: break;
55441 // Scalar SSE types.
55442 case MVT::f16:
55443 if (!Subtarget.hasFP16())
55444 break;
55445 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
55446 case MVT::f32:
55447 case MVT::i32:
55448 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
55449 case MVT::f64:
55450 case MVT::i64:
55451 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
55452 case MVT::v8f16:
55453 if (!Subtarget.hasFP16())
55454 break;
55455         LLVM_FALLTHROUGH;
55456 case MVT::f128:
55457 case MVT::v16i8:
55458 case MVT::v8i16:
55459 case MVT::v4i32:
55460 case MVT::v2i64:
55461 case MVT::v4f32:
55462 case MVT::v2f64:
55463 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
55464 // AVX types.
55465 case MVT::v16f16:
55466 if (!Subtarget.hasFP16())
55467 break;
55468         LLVM_FALLTHROUGH;
55469 case MVT::v32i8:
55470 case MVT::v16i16:
55471 case MVT::v8i32:
55472 case MVT::v4i64:
55473 case MVT::v8f32:
55474 case MVT::v4f64:
55475 if (Subtarget.hasAVX())
55476 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
55477 break;
55478 case MVT::v32f16:
55479 if (!Subtarget.hasFP16())
55480 break;
55481         LLVM_FALLTHROUGH;
55482 case MVT::v64i8:
55483 case MVT::v32i16:
55484 case MVT::v8f64:
55485 case MVT::v16f32:
55486 case MVT::v16i32:
55487 case MVT::v8i64:
55488 if (Subtarget.hasAVX512())
55489 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
55490 break;
55491 }
55492 break;
55493 case 'k':
55494       // This register class doesn't allocate k0 for masked vector operations.
55495 if (Subtarget.hasAVX512()) {
55496 if (VT == MVT::i1)
55497 return std::make_pair(0U, &X86::VK1WMRegClass);
55498 if (VT == MVT::i8)
55499 return std::make_pair(0U, &X86::VK8WMRegClass);
55500 if (VT == MVT::i16)
55501 return std::make_pair(0U, &X86::VK16WMRegClass);
55502 }
55503 if (Subtarget.hasBWI()) {
55504 if (VT == MVT::i32)
55505 return std::make_pair(0U, &X86::VK32WMRegClass);
55506 if (VT == MVT::i64)
55507 return std::make_pair(0U, &X86::VK64WMRegClass);
55508 }
55509 break;
55510 }
55511 }
55512
55513 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
55514 return std::make_pair(0U, &X86::GR32RegClass);
55515
55516 // Use the default implementation in TargetLowering to convert the register
55517 // constraint into a member of a register class.
55518 std::pair<Register, const TargetRegisterClass*> Res;
55519 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
55520
55521 // Not found as a standard register?
55522 if (!Res.second) {
55523 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
55524 // to/from f80.
55525 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
55526 // Map st(0) -> st(7) -> ST0
55527 if (Constraint.size() == 7 && Constraint[0] == '{' &&
55528 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
55529 Constraint[3] == '(' &&
55530 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
55531 Constraint[5] == ')' && Constraint[6] == '}') {
55532 // st(7) is not allocatable and thus not a member of RFP80. Return
55533 // singleton class in cases where we have a reference to it.
55534 if (Constraint[4] == '7')
55535 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
55536 return std::make_pair(X86::FP0 + Constraint[4] - '0',
55537 &X86::RFP80RegClass);
55538 }
55539
55540 // GCC allows "st(0)" to be called just plain "st".
55541 if (StringRef("{st}").equals_insensitive(Constraint))
55542 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
55543 }
55544
55545 // flags -> EFLAGS
55546 if (StringRef("{flags}").equals_insensitive(Constraint))
55547 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
55548
55549 // dirflag -> DF
55550 // Only allow for clobber.
55551 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
55552 VT == MVT::Other)
55553 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
55554
55555 // fpsr -> FPSW
55556 if (StringRef("{fpsr}").equals_insensitive(Constraint))
55557 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
55558
55559 return Res;
55560 }
55561
55562 // Make sure it isn't a register that requires 64-bit mode.
55563 if (!Subtarget.is64Bit() &&
55564 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
55565 TRI->getEncodingValue(Res.first) >= 8) {
55566 // Register requires REX prefix, but we're in 32-bit mode.
55567 return std::make_pair(0, nullptr);
55568 }
55569
55570 // Make sure it isn't a register that requires AVX512.
55571 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
55572 TRI->getEncodingValue(Res.first) & 0x10) {
55573 // Register requires EVEX prefix.
55574 return std::make_pair(0, nullptr);
55575 }
55576
55577 // Otherwise, check to see if this is a register class of the wrong value
55578 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
55579 // turn into {ax},{dx}.
55580 // MVT::Other is used to specify clobber names.
55581 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
55582 return Res; // Correct type already, nothing to do.
55583
55584   // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
55585   // return "eax". This should even work for things like getting 64-bit integer
55586 // registers when given an f64 type.
55587 const TargetRegisterClass *Class = Res.second;
55588 // The generic code will match the first register class that contains the
55589 // given register. Thus, based on the ordering of the tablegened file,
55590 // the "plain" GR classes might not come first.
55591 // Therefore, use a helper method.
55592 if (isGRClass(*Class)) {
55593 unsigned Size = VT.getSizeInBits();
55594 if (Size == 1) Size = 8;
55595 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
55596 if (DestReg > 0) {
55597 bool is64Bit = Subtarget.is64Bit();
55598 const TargetRegisterClass *RC =
55599 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
55600 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
55601 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
55602 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
55603 : nullptr;
55604 if (Size == 64 && !is64Bit) {
55605 // Model GCC's behavior here and select a fixed pair of 32-bit
55606 // registers.
55607 switch (DestReg) {
55608 case X86::RAX:
55609 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
55610 case X86::RDX:
55611 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
55612 case X86::RCX:
55613 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
55614 case X86::RBX:
55615 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
55616 case X86::RSI:
55617 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
55618 case X86::RDI:
55619 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
55620 case X86::RBP:
55621 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
55622 default:
55623 return std::make_pair(0, nullptr);
55624 }
55625 }
55626 if (RC && RC->contains(DestReg))
55627 return std::make_pair(DestReg, RC);
55628 return Res;
55629 }
55630 // No register found/type mismatch.
55631 return std::make_pair(0, nullptr);
55632 } else if (isFRClass(*Class)) {
55633 // Handle references to XMM physical registers that got mapped into the
55634 // wrong class. This can happen with constraints like {xmm0} where the
55635 // target independent register mapper will just pick the first match it can
55636 // find, ignoring the required type.
55637
55638 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
55639 if (VT == MVT::f16)
55640 Res.second = &X86::FR16XRegClass;
55641 else if (VT == MVT::f32 || VT == MVT::i32)
55642 Res.second = &X86::FR32XRegClass;
55643 else if (VT == MVT::f64 || VT == MVT::i64)
55644 Res.second = &X86::FR64XRegClass;
55645 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
55646 Res.second = &X86::VR128XRegClass;
55647 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
55648 Res.second = &X86::VR256XRegClass;
55649 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
55650 Res.second = &X86::VR512RegClass;
55651 else {
55652 // Type mismatch and not a clobber: Return an error;
55653 Res.first = 0;
55654 Res.second = nullptr;
55655 }
55656 } else if (isVKClass(*Class)) {
55657 if (VT == MVT::i1)
55658 Res.second = &X86::VK1RegClass;
55659 else if (VT == MVT::i8)
55660 Res.second = &X86::VK8RegClass;
55661 else if (VT == MVT::i16)
55662 Res.second = &X86::VK16RegClass;
55663 else if (VT == MVT::i32)
55664 Res.second = &X86::VK32RegClass;
55665 else if (VT == MVT::i64)
55666 Res.second = &X86::VK64RegClass;
55667 else {
55668 // Type mismatch and not a clobber: Return an error;
55669 Res.first = 0;
55670 Res.second = nullptr;
55671 }
55672 }
55673
55674 return Res;
55675}
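
// Illustrative note (not part of the original source): a few hypothetical
// constraint/VT pairs and what the code above resolves them to:
//   "k"  with MVT::i16 and AVX512  -> VK16RegClass
//   "Yz" with MVT::v4f32 and SSE1  -> XMM0 in VR128RegClass
//   "{st}"                         -> FP0 in RFP80RegClass
//   "{flags}"                      -> EFLAGS in CCRRegClass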
55676
55677InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
55678 const AddrMode &AM,
55679 Type *Ty,
55680 unsigned AS) const {
55681 // Scaling factors are not free at all.
55682 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
55683 // will take 2 allocations in the out of order engine instead of 1
55684 // for plain addressing mode, i.e. inst (reg1).
55685 // E.g.,
55686 // vaddps (%rsi,%rdx), %ymm0, %ymm1
55687 // Requires two allocations (one for the load, one for the computation)
55688 // whereas:
55689 // vaddps (%rsi), %ymm0, %ymm1
55690 // Requires just 1 allocation, i.e., freeing allocations for other operations
55691 // and having less micro operations to execute.
55692 //
55693 // For some X86 architectures, this is even worse because for instance for
55694 // stores, the complex addressing mode forces the instruction to use the
55695 // "load" ports instead of the dedicated "store" port.
55696 // E.g., on Haswell:
55697 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
55698 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
55699 if (isLegalAddressingMode(DL, AM, Ty, AS))
55700 // Scale represents reg2 * scale, thus account for 1
55701 // as soon as we use a second register.
55702 return AM.Scale != 0;
55703 return -1;
55704}
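
// Illustrative note (not part of the original source): for a legal mode the
// cost above is simply whether an index register was used. E.g. a
// hypothetical AddrMode of (BaseReg) with Scale == 0 returns 0, while
// (BaseReg + 4*IndexReg) with Scale == 4 returns 1; a mode rejected by
// isLegalAddressingMode returns -1.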
55705
55706bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
55707 // Integer division on x86 is expensive. However, when aggressively optimizing
55708 // for code size, we prefer to use a div instruction, as it is usually smaller
55709 // than the alternative sequence.
55710 // The exception to this is vector division. Since x86 doesn't have vector
55711 // integer division, leaving the division as-is is a loss even in terms of
55712 // size, because it will have to be scalarized, while the alternative code
55713 // sequence can be performed in vector form.
55714 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
55715 return OptSize && !VT.isVector();
55716}
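
// Illustrative note (not part of the original source): for a hypothetical
// function compiled with minsize, a scalar "x / 10" is kept as a single div
// instruction because it is smaller than the multiply-by-reciprocal
// expansion, whereas a vector division is still expanded, since x86 has no
// vector integer divide and the scalarized divs would be larger.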
55717
55718void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
55719 if (!Subtarget.is64Bit())
55720 return;
55721
55722 // Update IsSplitCSR in X86MachineFunctionInfo.
55723 X86MachineFunctionInfo *AFI =
55724 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
55725 AFI->setIsSplitCSR(true);
55726}
55727
55728void X86TargetLowering::insertCopiesSplitCSR(
55729 MachineBasicBlock *Entry,
55730 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
55731 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
55732 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
55733 if (!IStart)
55734 return;
55735
55736 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
55737 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
55738 MachineBasicBlock::iterator MBBI = Entry->begin();
55739 for (const MCPhysReg *I = IStart; *I; ++I) {
55740 const TargetRegisterClass *RC = nullptr;
55741 if (X86::GR64RegClass.contains(*I))
55742 RC = &X86::GR64RegClass;
55743 else
55744 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 55744)
;
55745
55746 Register NewVR = MRI->createVirtualRegister(RC);
55747 // Create copy from CSR to a virtual register.
55748 // FIXME: this currently does not emit CFI pseudo-instructions, it works
55749 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
55750 // nounwind. If we want to generalize this later, we may need to emit
55751 // CFI pseudo-instructions.
55752     assert(
55753         Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
55754         "Function should be nounwind in insertCopiesSplitCSR!");
55755 Entry->addLiveIn(*I);
55756 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
55757 .addReg(*I);
55758
55759 // Insert the copy-back instructions right before the terminator.
55760 for (auto *Exit : Exits)
55761 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
55762 TII->get(TargetOpcode::COPY), *I)
55763 .addReg(NewVR);
55764 }
55765}
55766
55767bool X86TargetLowering::supportSwiftError() const {
55768 return Subtarget.is64Bit();
55769}
55770
55771/// Returns true if stack probing through a function call is requested.
55772bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
55773 return !getStackProbeSymbolName(MF).empty();
55774}
55775
55776/// Returns true if stack probing through inline assembly is requested.
55777bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
55778
55779   // No inline stack probes for Windows; it has its own mechanism.
55780 if (Subtarget.isOSWindows() ||
55781 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55782 return false;
55783
55784 // If the function specifically requests inline stack probes, emit them.
55785 if (MF.getFunction().hasFnAttribute("probe-stack"))
55786 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
55787 "inline-asm";
55788
55789 return false;
55790}
55791
55792/// Returns the name of the symbol used to emit stack probes or the empty
55793/// string if not applicable.
55794StringRef
55795X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
55796   // Inline stack probes disable the stack probe call.
55797 if (hasInlineStackProbe(MF))
55798 return "";
55799
55800 // If the function specifically requests stack probes, emit them.
55801 if (MF.getFunction().hasFnAttribute("probe-stack"))
55802 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
55803
55804 // Generally, if we aren't on Windows, the platform ABI does not include
55805 // support for stack probes, so don't emit them.
55806 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
55807 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55808 return "";
55809
55810 // We need a stack probe to conform to the Windows ABI. Choose the right
55811 // symbol.
55812 if (Subtarget.is64Bit())
55813 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
55814 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
55815}
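
// Illustrative note (not part of the original source): hypothetical IR
// functions exercising the logic above:
//
//   define void @f() "probe-stack"="__custom_probe" { ... }  ; probe via call
//   define void @g() "probe-stack"="inline-asm" { ... }      ; inline probes,
//                                                            ; symbol name ""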
55816
55817unsigned
55818X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
55819 // The default stack probe size is 4096 if the function has no stackprobesize
55820 // attribute.
55821 unsigned StackProbeSize = 4096;
55822 const Function &Fn = MF.getFunction();
55823 if (Fn.hasFnAttribute("stack-probe-size"))
55824 Fn.getFnAttribute("stack-probe-size")
55825 .getValueAsString()
55826 .getAsInteger(0, StackProbeSize);
55827 return StackProbeSize;
55828}
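
// Illustrative note (not part of the original source): the probe interval can
// be tuned per function as well; a hypothetical
//
//   define void @h() "stack-probe-size"="8192" { ... }
//
// makes this hook return 8192 instead of the default 4096.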
55829
55830Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
55831 if (ML->isInnermost() &&
55832 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
55833 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
55834 return TargetLowering::getPrefLoopAlignment();
55835}